LLVM  10.0.0svn
MachineBlockPlacement.cpp
Go to the documentation of this file.
1 //===- MachineBlockPlacement.cpp - Basic Block Code Layout optimization ---===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements basic block placement transformations using the CFG
10 // structure and branch probability estimates.
11 //
12 // The pass strives to preserve the structure of the CFG (that is, retain
13 // a topological ordering of basic blocks) in the absence of a *strong* signal
14 // to the contrary from probabilities. However, within the CFG structure, it
15 // attempts to choose an ordering which favors placing more likely sequences of
16 // blocks adjacent to each other.
17 //
18 // The algorithm works from the inner-most loop within a function outward, and
19 // at each stage walks through the basic blocks, trying to coalesce them into
20 // sequential chains where allowed by the CFG (or demanded by heavy
21 // probabilities). Finally, it walks the blocks in topological order, and the
22 // first time it reaches a chain of basic blocks, it schedules them in the
23 // function in-order.
24 //
25 //===----------------------------------------------------------------------===//
26 
27 #include "BranchFolding.h"
28 #include "llvm/ADT/ArrayRef.h"
29 #include "llvm/ADT/DenseMap.h"
30 #include "llvm/ADT/STLExtras.h"
31 #include "llvm/ADT/SetVector.h"
32 #include "llvm/ADT/SmallPtrSet.h"
33 #include "llvm/ADT/SmallVector.h"
34 #include "llvm/ADT/Statistic.h"
49 #include "llvm/IR/DebugLoc.h"
50 #include "llvm/IR/Function.h"
51 #include "llvm/Pass.h"
52 #include "llvm/Support/Allocator.h"
55 #include "llvm/Support/CodeGen.h"
57 #include "llvm/Support/Compiler.h"
58 #include "llvm/Support/Debug.h"
61 #include <algorithm>
62 #include <cassert>
63 #include <cstdint>
64 #include <iterator>
65 #include <memory>
66 #include <string>
67 #include <tuple>
68 #include <utility>
69 #include <vector>
70 
71 using namespace llvm;
72 
73 #define DEBUG_TYPE "block-placement"
74 
75 STATISTIC(NumCondBranches, "Number of conditional branches");
76 STATISTIC(NumUncondBranches, "Number of unconditional branches");
77 STATISTIC(CondBranchTakenFreq,
78  "Potential frequency of taking conditional branches");
79 STATISTIC(UncondBranchTakenFreq,
80  "Potential frequency of taking unconditional branches");
81 
82 static cl::opt<unsigned> AlignAllBlock("align-all-blocks",
83  cl::desc("Force the alignment of all "
84  "blocks in the function."),
85  cl::init(0), cl::Hidden);
86 
88  "align-all-nofallthru-blocks",
89  cl::desc("Force the alignment of all "
90  "blocks that have no fall-through predecessors (i.e. don't add "
91  "nops that are executed)."),
92  cl::init(0), cl::Hidden);
93 
94 // FIXME: Find a good default for this flag and remove the flag.
96  "block-placement-exit-block-bias",
97  cl::desc("Block frequency percentage a loop exit block needs "
98  "over the original exit to be considered the new exit."),
99  cl::init(0), cl::Hidden);
100 
101 // Definition:
102 // - Outlining: placement of a basic block outside the chain or hot path.
103 
105  "loop-to-cold-block-ratio",
106  cl::desc("Outline loop blocks from loop chain if (frequency of loop) / "
107  "(frequency of block) is greater than this ratio"),
108  cl::init(5), cl::Hidden);
109 
111  "force-loop-cold-block",
112  cl::desc("Force outlining cold blocks from loops."),
113  cl::init(false), cl::Hidden);
114 
115 static cl::opt<bool>
116  PreciseRotationCost("precise-rotation-cost",
117  cl::desc("Model the cost of loop rotation more "
118  "precisely by using profile data."),
119  cl::init(false), cl::Hidden);
120 
121 static cl::opt<bool>
122  ForcePreciseRotationCost("force-precise-rotation-cost",
123  cl::desc("Force the use of precise cost "
124  "loop rotation strategy."),
125  cl::init(false), cl::Hidden);
126 
128  "misfetch-cost",
129  cl::desc("Cost that models the probabilistic risk of an instruction "
130  "misfetch due to a jump comparing to falling through, whose cost "
131  "is zero."),
132  cl::init(1), cl::Hidden);
133 
134 static cl::opt<unsigned> JumpInstCost("jump-inst-cost",
135  cl::desc("Cost of jump instructions."),
136  cl::init(1), cl::Hidden);
137 static cl::opt<bool>
138 TailDupPlacement("tail-dup-placement",
139  cl::desc("Perform tail duplication during placement. "
140  "Creates more fallthrough opportunites in "
141  "outline branches."),
142  cl::init(true), cl::Hidden);
143 
144 static cl::opt<bool>
145 BranchFoldPlacement("branch-fold-placement",
146  cl::desc("Perform branch folding during placement. "
147  "Reduces code size."),
148  cl::init(true), cl::Hidden);
149 
150 // Heuristic for tail duplication.
152  "tail-dup-placement-threshold",
153  cl::desc("Instruction cutoff for tail duplication during layout. "
154  "Tail merging during layout is forced to have a threshold "
155  "that won't conflict."), cl::init(2),
156  cl::Hidden);
157 
158 // Heuristic for aggressive tail duplication.
160  "tail-dup-placement-aggressive-threshold",
161  cl::desc("Instruction cutoff for aggressive tail duplication during "
162  "layout. Used at -O3. Tail merging during layout is forced to "
163  "have a threshold that won't conflict."), cl::init(4),
164  cl::Hidden);
165 
166 // Heuristic for tail duplication.
168  "tail-dup-placement-penalty",
169  cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. "
170  "Copying can increase fallthrough, but it also increases icache "
171  "pressure. This parameter controls the penalty to account for that. "
172  "Percent as integer."),
173  cl::init(2),
174  cl::Hidden);
175 
176 // Heuristic for triangle chains.
178  "triangle-chain-count",
179  cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the "
180  "triangle tail duplication heuristic to kick in. 0 to disable."),
181  cl::init(2),
182  cl::Hidden);
183 
186 
187 // Internal option used to control BFI display only after MBP pass.
188 // Defined in CodeGen/MachineBlockFrequencyInfo.cpp:
189 // -view-block-layout-with-bfi=
191 
192 // Command line option to specify the name of the function for CFG dump
193 // Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name=
195 
196 namespace {
197 
198 class BlockChain;
199 
200 /// Type for our function-wide basic block -> block chain mapping.
201 using BlockToChainMapType = DenseMap<const MachineBasicBlock *, BlockChain *>;
202 
203 /// A chain of blocks which will be laid out contiguously.
204 ///
205 /// This is the datastructure representing a chain of consecutive blocks that
206 /// are profitable to layout together in order to maximize fallthrough
207 /// probabilities and code locality. We also can use a block chain to represent
208 /// a sequence of basic blocks which have some external (correctness)
209 /// requirement for sequential layout.
210 ///
211 /// Chains can be built around a single basic block and can be merged to grow
212 /// them. They participate in a block-to-chain mapping, which is updated
213 /// automatically as chains are merged together.
214 class BlockChain {
215  /// The sequence of blocks belonging to this chain.
216  ///
217  /// This is the sequence of blocks for a particular chain. These will be laid
218  /// out in-order within the function.
220 
221  /// A handle to the function-wide basic block to block chain mapping.
222  ///
223  /// This is retained in each block chain to simplify the computation of child
224  /// block chains for SCC-formation and iteration. We store the edges to child
225  /// basic blocks, and map them back to their associated chains using this
226  /// structure.
227  BlockToChainMapType &BlockToChain;
228 
229 public:
230  /// Construct a new BlockChain.
231  ///
232  /// This builds a new block chain representing a single basic block in the
233  /// function. It also registers itself as the chain that block participates
234  /// in with the BlockToChain mapping.
235  BlockChain(BlockToChainMapType &BlockToChain, MachineBasicBlock *BB)
236  : Blocks(1, BB), BlockToChain(BlockToChain) {
237  assert(BB && "Cannot create a chain with a null basic block");
238  BlockToChain[BB] = this;
239  }
240 
241  /// Iterator over blocks within the chain.
244 
245  /// Beginning of blocks within the chain.
246  iterator begin() { return Blocks.begin(); }
247  const_iterator begin() const { return Blocks.begin(); }
248 
249  /// End of blocks within the chain.
250  iterator end() { return Blocks.end(); }
251  const_iterator end() const { return Blocks.end(); }
252 
253  bool remove(MachineBasicBlock* BB) {
254  for(iterator i = begin(); i != end(); ++i) {
255  if (*i == BB) {
256  Blocks.erase(i);
257  return true;
258  }
259  }
260  return false;
261  }
262 
263  /// Merge a block chain into this one.
264  ///
265  /// This routine merges a block chain into this one. It takes care of forming
266  /// a contiguous sequence of basic blocks, updating the edge list, and
267  /// updating the block -> chain mapping. It does not free or tear down the
268  /// old chain, but the old chain's block list is no longer valid.
269  void merge(MachineBasicBlock *BB, BlockChain *Chain) {
270  assert(BB && "Can't merge a null block.");
271  assert(!Blocks.empty() && "Can't merge into an empty chain.");
272 
273  // Fast path in case we don't have a chain already.
274  if (!Chain) {
275  assert(!BlockToChain[BB] &&
276  "Passed chain is null, but BB has entry in BlockToChain.");
277  Blocks.push_back(BB);
278  BlockToChain[BB] = this;
279  return;
280  }
281 
282  assert(BB == *Chain->begin() && "Passed BB is not head of Chain.");
283  assert(Chain->begin() != Chain->end());
284 
285  // Update the incoming blocks to point to this chain, and add them to the
286  // chain structure.
287  for (MachineBasicBlock *ChainBB : *Chain) {
288  Blocks.push_back(ChainBB);
289  assert(BlockToChain[ChainBB] == Chain && "Incoming blocks not in chain.");
290  BlockToChain[ChainBB] = this;
291  }
292  }
293 
294 #ifndef NDEBUG
295  /// Dump the blocks in this chain.
296  LLVM_DUMP_METHOD void dump() {
297  for (MachineBasicBlock *MBB : *this)
298  MBB->dump();
299  }
300 #endif // NDEBUG
301 
302  /// Count of predecessors of any block within the chain which have not
303  /// yet been scheduled. In general, we will delay scheduling this chain
304  /// until those predecessors are scheduled (or we find a sufficiently good
305  /// reason to override this heuristic.) Note that when forming loop chains,
306  /// blocks outside the loop are ignored and treated as if they were already
307  /// scheduled.
308  ///
309  /// Note: This field is reinitialized multiple times - once for each loop,
310  /// and then once for the function as a whole.
311  unsigned UnscheduledPredecessors = 0;
312 };
313 
314 class MachineBlockPlacement : public MachineFunctionPass {
315  /// A type for a block filter set.
316  using BlockFilterSet = SmallSetVector<const MachineBasicBlock *, 16>;
317 
318  /// Pair struct containing basic block and taildup profitability
319  struct BlockAndTailDupResult {
320  MachineBasicBlock *BB;
321  bool ShouldTailDup;
322  };
323 
324  /// Triple struct containing edge weight and the edge.
325  struct WeightedEdge {
326  BlockFrequency Weight;
327  MachineBasicBlock *Src;
328  MachineBasicBlock *Dest;
329  };
330 
331  /// work lists of blocks that are ready to be laid out
334 
335  /// Edges that have already been computed as optimal.
337 
338  /// Machine Function
340 
341  /// A handle to the branch probability pass.
342  const MachineBranchProbabilityInfo *MBPI;
343 
344  /// A handle to the function-wide block frequency pass.
345  std::unique_ptr<BranchFolder::MBFIWrapper> MBFI;
346 
347  /// A handle to the loop info.
348  MachineLoopInfo *MLI;
349 
350  /// Preferred loop exit.
351  /// Member variable for convenience. It may be removed by duplication deep
352  /// in the call stack.
353  MachineBasicBlock *PreferredLoopExit;
354 
355  /// A handle to the target's instruction info.
356  const TargetInstrInfo *TII;
357 
358  /// A handle to the target's lowering info.
359  const TargetLoweringBase *TLI;
360 
361  /// A handle to the post dominator tree.
363 
364  /// Duplicator used to duplicate tails during placement.
365  ///
366  /// Placement decisions can open up new tail duplication opportunities, but
367  /// since tail duplication affects placement decisions of later blocks, it
368  /// must be done inline.
369  TailDuplicator TailDup;
370 
371  /// Allocator and owner of BlockChain structures.
372  ///
373  /// We build BlockChains lazily while processing the loop structure of
374  /// a function. To reduce malloc traffic, we allocate them using this
375  /// slab-like allocator, and destroy them after the pass completes. An
376  /// important guarantee is that this allocator produces stable pointers to
377  /// the chains.
379 
380  /// Function wide BasicBlock to BlockChain mapping.
381  ///
382  /// This mapping allows efficiently moving from any given basic block to the
383  /// BlockChain it participates in, if any. We use it to, among other things,
384  /// allow implicitly defining edges between chains as the existing edges
385  /// between basic blocks.
387 
388 #ifndef NDEBUG
389  /// The set of basic blocks that have terminators that cannot be fully
390  /// analyzed. These basic blocks cannot be re-ordered safely by
391  /// MachineBlockPlacement, and we must preserve physical layout of these
392  /// blocks and their successors through the pass.
393  SmallPtrSet<MachineBasicBlock *, 4> BlocksWithUnanalyzableExits;
394 #endif
395 
396  /// Decrease the UnscheduledPredecessors count for all blocks in chain, and
397  /// if the count goes to 0, add them to the appropriate work list.
398  void markChainSuccessors(
399  const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
400  const BlockFilterSet *BlockFilter = nullptr);
401 
402  /// Decrease the UnscheduledPredecessors count for a single block, and
403  /// if the count goes to 0, add them to the appropriate work list.
404  void markBlockSuccessors(
405  const BlockChain &Chain, const MachineBasicBlock *BB,
406  const MachineBasicBlock *LoopHeaderBB,
407  const BlockFilterSet *BlockFilter = nullptr);
408 
410  collectViableSuccessors(
411  const MachineBasicBlock *BB, const BlockChain &Chain,
412  const BlockFilterSet *BlockFilter,
414  bool shouldPredBlockBeOutlined(
415  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
416  const BlockChain &Chain, const BlockFilterSet *BlockFilter,
417  BranchProbability SuccProb, BranchProbability HotProb);
418  bool repeatedlyTailDuplicateBlock(
420  const MachineBasicBlock *LoopHeaderBB,
421  BlockChain &Chain, BlockFilterSet *BlockFilter,
422  MachineFunction::iterator &PrevUnplacedBlockIt);
423  bool maybeTailDuplicateBlock(
425  BlockChain &Chain, BlockFilterSet *BlockFilter,
426  MachineFunction::iterator &PrevUnplacedBlockIt,
427  bool &DuplicatedToLPred);
428  bool hasBetterLayoutPredecessor(
429  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
430  const BlockChain &SuccChain, BranchProbability SuccProb,
431  BranchProbability RealSuccProb, const BlockChain &Chain,
432  const BlockFilterSet *BlockFilter);
433  BlockAndTailDupResult selectBestSuccessor(
434  const MachineBasicBlock *BB, const BlockChain &Chain,
435  const BlockFilterSet *BlockFilter);
436  MachineBasicBlock *selectBestCandidateBlock(
437  const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList);
438  MachineBasicBlock *getFirstUnplacedBlock(
439  const BlockChain &PlacedChain,
440  MachineFunction::iterator &PrevUnplacedBlockIt,
441  const BlockFilterSet *BlockFilter);
442 
443  /// Add a basic block to the work list if it is appropriate.
444  ///
445  /// If the optional parameter BlockFilter is provided, only MBB
446  /// present in the set will be added to the worklist. If nullptr
447  /// is provided, no filtering occurs.
448  void fillWorkLists(const MachineBasicBlock *MBB,
449  SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
450  const BlockFilterSet *BlockFilter);
451 
452  void buildChain(const MachineBasicBlock *BB, BlockChain &Chain,
453  BlockFilterSet *BlockFilter = nullptr);
454  bool canMoveBottomBlockToTop(const MachineBasicBlock *BottomBlock,
455  const MachineBasicBlock *OldTop);
456  bool hasViableTopFallthrough(const MachineBasicBlock *Top,
457  const BlockFilterSet &LoopBlockSet);
458  BlockFrequency TopFallThroughFreq(const MachineBasicBlock *Top,
459  const BlockFilterSet &LoopBlockSet);
460  BlockFrequency FallThroughGains(const MachineBasicBlock *NewTop,
461  const MachineBasicBlock *OldTop,
462  const MachineBasicBlock *ExitBB,
463  const BlockFilterSet &LoopBlockSet);
464  MachineBasicBlock *findBestLoopTopHelper(MachineBasicBlock *OldTop,
465  const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
466  MachineBasicBlock *findBestLoopTop(
467  const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
468  MachineBasicBlock *findBestLoopExit(
469  const MachineLoop &L, const BlockFilterSet &LoopBlockSet,
470  BlockFrequency &ExitFreq);
471  BlockFilterSet collectLoopBlockSet(const MachineLoop &L);
472  void buildLoopChains(const MachineLoop &L);
473  void rotateLoop(
474  BlockChain &LoopChain, const MachineBasicBlock *ExitingBB,
475  BlockFrequency ExitFreq, const BlockFilterSet &LoopBlockSet);
476  void rotateLoopWithProfile(
477  BlockChain &LoopChain, const MachineLoop &L,
478  const BlockFilterSet &LoopBlockSet);
479  void buildCFGChains();
480  void optimizeBranches();
481  void alignBlocks();
482  /// Returns true if a block should be tail-duplicated to increase fallthrough
483  /// opportunities.
484  bool shouldTailDuplicate(MachineBasicBlock *BB);
485  /// Check the edge frequencies to see if tail duplication will increase
486  /// fallthroughs.
487  bool isProfitableToTailDup(
488  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
489  BranchProbability QProb,
490  const BlockChain &Chain, const BlockFilterSet *BlockFilter);
491 
492  /// Check for a trellis layout.
493  bool isTrellis(const MachineBasicBlock *BB,
494  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
495  const BlockChain &Chain, const BlockFilterSet *BlockFilter);
496 
497  /// Get the best successor given a trellis layout.
498  BlockAndTailDupResult getBestTrellisSuccessor(
499  const MachineBasicBlock *BB,
500  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
501  BranchProbability AdjustedSumProb, const BlockChain &Chain,
502  const BlockFilterSet *BlockFilter);
503 
504  /// Get the best pair of non-conflicting edges.
505  static std::pair<WeightedEdge, WeightedEdge> getBestNonConflictingEdges(
506  const MachineBasicBlock *BB,
508 
509  /// Returns true if a block can tail duplicate into all unplaced
510  /// predecessors. Filters based on loop.
511  bool canTailDuplicateUnplacedPreds(
512  const MachineBasicBlock *BB, MachineBasicBlock *Succ,
513  const BlockChain &Chain, const BlockFilterSet *BlockFilter);
514 
515  /// Find chains of triangles to tail-duplicate where a global analysis works,
516  /// but a local analysis would not find them.
517  void precomputeTriangleChains();
518 
519 public:
520  static char ID; // Pass identification, replacement for typeid
521 
522  MachineBlockPlacement() : MachineFunctionPass(ID) {
524  }
525 
526  bool runOnMachineFunction(MachineFunction &F) override;
527 
528  bool allowTailDupPlacement() const {
529  assert(F);
531  }
532 
533  void getAnalysisUsage(AnalysisUsage &AU) const override {
536  if (TailDupPlacement)
541  }
542 };
543 
544 } // end anonymous namespace
545 
547 
549 
550 INITIALIZE_PASS_BEGIN(MachineBlockPlacement, DEBUG_TYPE,
551  "Branch Probability Basic Block Placement", false, false)
556 INITIALIZE_PASS_END(MachineBlockPlacement, DEBUG_TYPE,
557  "Branch Probability Basic Block Placement", false, false)
558 
559 #ifndef NDEBUG
560 /// Helper to print the name of a MBB.
561 ///
562 /// Only used by debug logging.
563 static std::string getBlockName(const MachineBasicBlock *BB) {
564  std::string Result;
565  raw_string_ostream OS(Result);
566  OS << printMBBReference(*BB);
567  OS << " ('" << BB->getName() << "')";
568  OS.flush();
569  return Result;
570 }
571 #endif
572 
573 /// Mark a chain's successors as having one fewer preds.
574 ///
575 /// When a chain is being merged into the "placed" chain, this routine will
576 /// quickly walk the successors of each block in the chain and mark them as
577 /// having one fewer active predecessor. It also adds any successors of this
578 /// chain which reach the zero-predecessor state to the appropriate worklist.
579 void MachineBlockPlacement::markChainSuccessors(
580  const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
581  const BlockFilterSet *BlockFilter) {
582  // Walk all the blocks in this chain, marking their successors as having
583  // a predecessor placed.
584  for (MachineBasicBlock *MBB : Chain) {
585  markBlockSuccessors(Chain, MBB, LoopHeaderBB, BlockFilter);
586  }
587 }
588 
589 /// Mark a single block's successors as having one fewer preds.
590 ///
591 /// Under normal circumstances, this is only called by markChainSuccessors,
592 /// but if a block that was to be placed is completely tail-duplicated away,
593 /// and was duplicated into the chain end, we need to redo markBlockSuccessors
594 /// for just that block.
595 void MachineBlockPlacement::markBlockSuccessors(
596  const BlockChain &Chain, const MachineBasicBlock *MBB,
597  const MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter) {
598  // Add any successors for which this is the only un-placed in-loop
599  // predecessor to the worklist as a viable candidate for CFG-neutral
600  // placement. No subsequent placement of this block will violate the CFG
601  // shape, so we get to use heuristics to choose a favorable placement.
602  for (MachineBasicBlock *Succ : MBB->successors()) {
603  if (BlockFilter && !BlockFilter->count(Succ))
604  continue;
605  BlockChain &SuccChain = *BlockToChain[Succ];
606  // Disregard edges within a fixed chain, or edges to the loop header.
607  if (&Chain == &SuccChain || Succ == LoopHeaderBB)
608  continue;
609 
610  // This is a cross-chain edge that is within the loop, so decrement the
611  // loop predecessor count of the destination chain.
612  if (SuccChain.UnscheduledPredecessors == 0 ||
613  --SuccChain.UnscheduledPredecessors > 0)
614  continue;
615 
616  auto *NewBB = *SuccChain.begin();
617  if (NewBB->isEHPad())
618  EHPadWorkList.push_back(NewBB);
619  else
620  BlockWorkList.push_back(NewBB);
621  }
622 }
623 
624 /// This helper function collects the set of successors of block
625 /// \p BB that are allowed to be its layout successors, and return
626 /// the total branch probability of edges from \p BB to those
627 /// blocks.
628 BranchProbability MachineBlockPlacement::collectViableSuccessors(
629  const MachineBasicBlock *BB, const BlockChain &Chain,
630  const BlockFilterSet *BlockFilter,
632  // Adjust edge probabilities by excluding edges pointing to blocks that is
633  // either not in BlockFilter or is already in the current chain. Consider the
634  // following CFG:
635  //
636  // --->A
637  // | / \
638  // | B C
639  // | \ / \
640  // ----D E
641  //
642  // Assume A->C is very hot (>90%), and C->D has a 50% probability, then after
643  // A->C is chosen as a fall-through, D won't be selected as a successor of C
644  // due to CFG constraint (the probability of C->D is not greater than
645  // HotProb to break topo-order). If we exclude E that is not in BlockFilter
646  // when calculating the probability of C->D, D will be selected and we
647  // will get A C D B as the layout of this loop.
648  auto AdjustedSumProb = BranchProbability::getOne();
649  for (MachineBasicBlock *Succ : BB->successors()) {
650  bool SkipSucc = false;
651  if (Succ->isEHPad() || (BlockFilter && !BlockFilter->count(Succ))) {
652  SkipSucc = true;
653  } else {
654  BlockChain *SuccChain = BlockToChain[Succ];
655  if (SuccChain == &Chain) {
656  SkipSucc = true;
657  } else if (Succ != *SuccChain->begin()) {
658  LLVM_DEBUG(dbgs() << " " << getBlockName(Succ)
659  << " -> Mid chain!\n");
660  continue;
661  }
662  }
663  if (SkipSucc)
664  AdjustedSumProb -= MBPI->getEdgeProbability(BB, Succ);
665  else
666  Successors.push_back(Succ);
667  }
668 
669  return AdjustedSumProb;
670 }
671 
672 /// The helper function returns the branch probability that is adjusted
673 /// or normalized over the new total \p AdjustedSumProb.
674 static BranchProbability
676  BranchProbability AdjustedSumProb) {
677  BranchProbability SuccProb;
678  uint32_t SuccProbN = OrigProb.getNumerator();
679  uint32_t SuccProbD = AdjustedSumProb.getNumerator();
680  if (SuccProbN >= SuccProbD)
681  SuccProb = BranchProbability::getOne();
682  else
683  SuccProb = BranchProbability(SuccProbN, SuccProbD);
684 
685  return SuccProb;
686 }
687 
688 /// Check if \p BB has exactly the successors in \p Successors.
689 static bool
692  if (BB.succ_size() != Successors.size())
693  return false;
694  // We don't want to count self-loops
695  if (Successors.count(&BB))
696  return false;
697  for (MachineBasicBlock *Succ : BB.successors())
698  if (!Successors.count(Succ))
699  return false;
700  return true;
701 }
702 
703 /// Check if a block should be tail duplicated to increase fallthrough
704 /// opportunities.
705 /// \p BB Block to check.
706 bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
707  // Blocks with single successors don't create additional fallthrough
708  // opportunities. Don't duplicate them. TODO: When conditional exits are
709  // analyzable, allow them to be duplicated.
710  bool IsSimple = TailDup.isSimpleBB(BB);
711 
712  if (BB->succ_size() == 1)
713  return false;
714  return TailDup.shouldTailDuplicate(IsSimple, *BB);
715 }
716 
717 /// Compare 2 BlockFrequency's with a small penalty for \p A.
718 /// In order to be conservative, we apply a X% penalty to account for
719 /// increased icache pressure and static heuristics. For small frequencies
720 /// we use only the numerators to improve accuracy. For simplicity, we assume the
721 /// penalty is less than 100%
722 /// TODO(iteratee): Use 64-bit fixed point edge frequencies everywhere.
724  uint64_t EntryFreq) {
725  BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
726  BlockFrequency Gain = A - B;
727  return (Gain / ThresholdProb).getFrequency() >= EntryFreq;
728 }
729 
730 /// Check the edge frequencies to see if tail duplication will increase
731 /// fallthroughs. It only makes sense to call this function when
732 /// \p Succ would not be chosen otherwise. Tail duplication of \p Succ is
733 /// always locally profitable if we would have picked \p Succ without
734 /// considering duplication.
735 bool MachineBlockPlacement::isProfitableToTailDup(
736  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
737  BranchProbability QProb,
738  const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
739  // We need to do a probability calculation to make sure this is profitable.
740  // First: does succ have a successor that post-dominates? This affects the
741  // calculation. The 2 relevant cases are:
742  // BB BB
743  // | \Qout | \Qout
744  // P| C |P C
745  // = C' = C'
746  // | /Qin | /Qin
747  // | / | /
748  // Succ Succ
749  // / \ | \ V
750  // U/ =V |U \
751  // / \ = D
752  // D E | /
753  // | /
754  // |/
755  // PDom
756  // '=' : Branch taken for that CFG edge
757  // In the second case, Placing Succ while duplicating it into C prevents the
758  // fallthrough of Succ into either D or PDom, because they now have C as an
759  // unplaced predecessor
760 
761  // Start by figuring out which case we fall into
762  MachineBasicBlock *PDom = nullptr;
764  // Only scan the relevant successors
765  auto AdjustedSuccSumProb =
766  collectViableSuccessors(Succ, Chain, BlockFilter, SuccSuccs);
767  BranchProbability PProb = MBPI->getEdgeProbability(BB, Succ);
768  auto BBFreq = MBFI->getBlockFreq(BB);
769  auto SuccFreq = MBFI->getBlockFreq(Succ);
770  BlockFrequency P = BBFreq * PProb;
771  BlockFrequency Qout = BBFreq * QProb;
772  uint64_t EntryFreq = MBFI->getEntryFreq();
773  // If there are no more successors, it is profitable to copy, as it strictly
774  // increases fallthrough.
775  if (SuccSuccs.size() == 0)
776  return greaterWithBias(P, Qout, EntryFreq);
777 
778  auto BestSuccSucc = BranchProbability::getZero();
779  // Find the PDom or the best Succ if no PDom exists.
780  for (MachineBasicBlock *SuccSucc : SuccSuccs) {
781  auto Prob = MBPI->getEdgeProbability(Succ, SuccSucc);
782  if (Prob > BestSuccSucc)
783  BestSuccSucc = Prob;
784  if (PDom == nullptr)
785  if (MPDT->dominates(SuccSucc, Succ)) {
786  PDom = SuccSucc;
787  break;
788  }
789  }
790  // For the comparisons, we need to know Succ's best incoming edge that isn't
791  // from BB.
792  auto SuccBestPred = BlockFrequency(0);
793  for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
794  if (SuccPred == Succ || SuccPred == BB
795  || BlockToChain[SuccPred] == &Chain
796  || (BlockFilter && !BlockFilter->count(SuccPred)))
797  continue;
798  auto Freq = MBFI->getBlockFreq(SuccPred)
799  * MBPI->getEdgeProbability(SuccPred, Succ);
800  if (Freq > SuccBestPred)
801  SuccBestPred = Freq;
802  }
803  // Qin is Succ's best unplaced incoming edge that isn't BB
804  BlockFrequency Qin = SuccBestPred;
805  // If it doesn't have a post-dominating successor, here is the calculation:
806  // BB BB
807  // | \Qout | \
808  // P| C | =
809  // = C' | C
810  // | /Qin | |
811  // | / | C' (+Succ)
812  // Succ Succ /|
813  // / \ | \/ |
814  // U/ =V | == |
815  // / \ | / \|
816  // D E D E
817  // '=' : Branch taken for that CFG edge
818  // Cost in the first case is: P + V
819  // For this calculation, we always assume P > Qout. If Qout > P
820  // The result of this function will be ignored at the caller.
821  // Let F = SuccFreq - Qin
822  // Cost in the second case is: Qout + min(Qin, F) * U + max(Qin, F) * V
823 
824  if (PDom == nullptr || !Succ->isSuccessor(PDom)) {
825  BranchProbability UProb = BestSuccSucc;
826  BranchProbability VProb = AdjustedSuccSumProb - UProb;
827  BlockFrequency F = SuccFreq - Qin;
828  BlockFrequency V = SuccFreq * VProb;
829  BlockFrequency QinU = std::min(Qin, F) * UProb;
830  BlockFrequency BaseCost = P + V;
831  BlockFrequency DupCost = Qout + QinU + std::max(Qin, F) * VProb;
832  return greaterWithBias(BaseCost, DupCost, EntryFreq);
833  }
834  BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom);
835  BranchProbability VProb = AdjustedSuccSumProb - UProb;
836  BlockFrequency U = SuccFreq * UProb;
837  BlockFrequency V = SuccFreq * VProb;
838  BlockFrequency F = SuccFreq - Qin;
839  // If there is a post-dominating successor, here is the calculation:
840  // BB BB BB BB
841  // | \Qout | \ | \Qout | \
842  // |P C | = |P C | =
843  // = C' |P C = C' |P C
844  // | /Qin | | | /Qin | |
845  // | / | C' (+Succ) | / | C' (+Succ)
846  // Succ Succ /| Succ Succ /|
847  // | \ V | \/ | | \ V | \/ |
848  // |U \ |U /\ =? |U = |U /\ |
849  // = D = = =?| | D | = =|
850  // | / |/ D | / |/ D
851  // | / | / | = | /
852  // |/ | / |/ | =
853  // Dom Dom Dom Dom
854  // '=' : Branch taken for that CFG edge
855  // The cost for taken branches in the first case is P + U
856  // Let F = SuccFreq - Qin
857  // The cost in the second case (assuming independence), given the layout:
858  // BB, Succ, (C+Succ), D, Dom or the layout:
859  // BB, Succ, D, Dom, (C+Succ)
860  // is Qout + max(F, Qin) * U + min(F, Qin)
861  // compare P + U vs Qout + P * U + Qin.
862  //
863  // The 3rd and 4th cases cover when Dom would be chosen to follow Succ.
864  //
865  // For the 3rd case, the cost is P + 2 * V
866  // For the 4th case, the cost is Qout + min(Qin, F) * U + max(Qin, F) * V + V
867  // We choose 4 over 3 when (P + V) > Qout + min(Qin, F) * U + max(Qin, F) * V
868  if (UProb > AdjustedSuccSumProb / 2 &&
869  !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom], UProb, UProb,
870  Chain, BlockFilter))
871  // Cases 3 & 4
872  return greaterWithBias(
873  (P + V), (Qout + std::max(Qin, F) * VProb + std::min(Qin, F) * UProb),
874  EntryFreq);
875  // Cases 1 & 2
876  return greaterWithBias((P + U),
877  (Qout + std::min(Qin, F) * AdjustedSuccSumProb +
878  std::max(Qin, F) * UProb),
879  EntryFreq);
880 }
881 
882 /// Check for a trellis layout. \p BB is the upper part of a trellis if its
883 /// successors form the lower part of a trellis. A successor set S forms the
884 /// lower part of a trellis if all of the predecessors of S are either in S or
885 /// have all of S as successors. We ignore trellises where BB doesn't have 2
886 /// successors because for fewer than 2, it's trivial, and for 3 or greater they
887 /// are very uncommon and complex to compute optimally. Allowing edges within S
888 /// is not strictly a trellis, but the same algorithm works, so we allow it.
889 bool MachineBlockPlacement::isTrellis(
890  const MachineBasicBlock *BB,
891  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
892  const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
893  // Technically BB could form a trellis with branching factor higher than 2.
894  // But that's extremely uncommon.
895  if (BB->succ_size() != 2 || ViableSuccs.size() != 2)
896  return false;
897 
899  BB->succ_end());
900  // To avoid reviewing the same predecessors twice.
902 
903  for (MachineBasicBlock *Succ : ViableSuccs) {
904  int PredCount = 0;
905  for (auto SuccPred : Succ->predecessors()) {
906  // Allow triangle successors, but don't count them.
907  if (Successors.count(SuccPred)) {
908  // Make sure that it is actually a triangle.
909  for (MachineBasicBlock *CheckSucc : SuccPred->successors())
910  if (!Successors.count(CheckSucc))
911  return false;
912  continue;
913  }
914  const BlockChain *PredChain = BlockToChain[SuccPred];
915  if (SuccPred == BB || (BlockFilter && !BlockFilter->count(SuccPred)) ||
916  PredChain == &Chain || PredChain == BlockToChain[Succ])
917  continue;
918  ++PredCount;
919  // Perform the successor check only once.
920  if (!SeenPreds.insert(SuccPred).second)
921  continue;
922  if (!hasSameSuccessors(*SuccPred, Successors))
923  return false;
924  }
925  // If one of the successors has only BB as a predecessor, it is not a
926  // trellis.
927  if (PredCount < 1)
928  return false;
929  }
930  return true;
931 }
932 
933 /// Pick the highest total weight pair of edges that can both be laid out.
934 /// The edges in \p Edges[0] are assumed to have a different destination than
935 /// the edges in \p Edges[1]. Simple counting shows that the best pair is either
936 /// the individual highest weight edges to the 2 different destinations, or in
937 /// case of a conflict, one of them should be replaced with a 2nd best edge.
938 std::pair<MachineBlockPlacement::WeightedEdge,
939  MachineBlockPlacement::WeightedEdge>
940 MachineBlockPlacement::getBestNonConflictingEdges(
941  const MachineBasicBlock *BB,
943  Edges) {
944  // Sort the edges, and then for each successor, find the best incoming
945  // predecessor. If the best incoming predecessors aren't the same,
946  // then that is clearly the best layout. If there is a conflict, one of the
947  // successors will have to fallthrough from the second best predecessor. We
948  // compare which combination is better overall.
949 
950  // Sort for highest frequency.
951  auto Cmp = [](WeightedEdge A, WeightedEdge B) { return A.Weight > B.Weight; };
952 
953  llvm::stable_sort(Edges[0], Cmp);
954  llvm::stable_sort(Edges[1], Cmp);
955  auto BestA = Edges[0].begin();
956  auto BestB = Edges[1].begin();
957  // Arrange for the correct answer to be in BestA and BestB
958  // If the 2 best edges don't conflict, the answer is already there.
959  if (BestA->Src == BestB->Src) {
960  // Compare the total fallthrough of (Best + Second Best) for both pairs
961  auto SecondBestA = std::next(BestA);
962  auto SecondBestB = std::next(BestB);
963  BlockFrequency BestAScore = BestA->Weight + SecondBestB->Weight;
964  BlockFrequency BestBScore = BestB->Weight + SecondBestA->Weight;
965  if (BestAScore < BestBScore)
966  BestA = SecondBestA;
967  else
968  BestB = SecondBestB;
969  }
970  // Arrange for the BB edge to be in BestA if it exists.
971  if (BestB->Src == BB)
972  std::swap(BestA, BestB);
973  return std::make_pair(*BestA, *BestB);
974 }
975 
976 /// Get the best successor from \p BB based on \p BB being part of a trellis.
977 /// We only handle trellises with 2 successors, so the algorithm is
978 /// straightforward: Find the best pair of edges that don't conflict. We find
979 /// the best incoming edge for each successor in the trellis. If those conflict,
980 /// we consider which of them should be replaced with the second best.
981 /// Upon return the two best edges will be in \p BestEdges. If one of the edges
982 /// comes from \p BB, it will be in \p BestEdges[0]
983 MachineBlockPlacement::BlockAndTailDupResult
984 MachineBlockPlacement::getBestTrellisSuccessor(
985  const MachineBasicBlock *BB,
986  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
987  BranchProbability AdjustedSumProb, const BlockChain &Chain,
988  const BlockFilterSet *BlockFilter) {
989 
990  BlockAndTailDupResult Result = {nullptr, false};
992  BB->succ_end());
993 
994  // We assume size 2 because it's common. For general n, we would have to do
995  // the Hungarian algorithm, but it's not worth the complexity because more
996  // than 2 successors is fairly uncommon, and a trellis even more so.
997  if (Successors.size() != 2 || ViableSuccs.size() != 2)
998  return Result;
999 
1000  // Collect the edge frequencies of all edges that form the trellis.
1002  int SuccIndex = 0;
1003  for (auto Succ : ViableSuccs) {
1004  for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
1005  // Skip any placed predecessors that are not BB
1006  if (SuccPred != BB)
1007  if ((BlockFilter && !BlockFilter->count(SuccPred)) ||
1008  BlockToChain[SuccPred] == &Chain ||
1009  BlockToChain[SuccPred] == BlockToChain[Succ])
1010  continue;
1011  BlockFrequency EdgeFreq = MBFI->getBlockFreq(SuccPred) *
1012  MBPI->getEdgeProbability(SuccPred, Succ);
1013  Edges[SuccIndex].push_back({EdgeFreq, SuccPred, Succ});
1014  }
1015  ++SuccIndex;
1016  }
1017 
1018  // Pick the best combination of 2 edges from all the edges in the trellis.
1019  WeightedEdge BestA, BestB;
1020  std::tie(BestA, BestB) = getBestNonConflictingEdges(BB, Edges);
1021 
1022  if (BestA.Src != BB) {
1023  // If we have a trellis, and BB doesn't have the best fallthrough edges,
1024  // we shouldn't choose any successor. We've already looked and there's a
1025  // better fallthrough edge for all the successors.
1026  LLVM_DEBUG(dbgs() << "Trellis, but not one of the chosen edges.\n");
1027  return Result;
1028  }
1029 
1030  // Did we pick the triangle edge? If tail-duplication is profitable, do
1031  // that instead. Otherwise merge the triangle edge now while we know it is
1032  // optimal.
1033  if (BestA.Dest == BestB.Src) {
1034  // The edges are BB->Succ1->Succ2, and we're looking to see if BB->Succ2
1035  // would be better.
1036  MachineBasicBlock *Succ1 = BestA.Dest;
1037  MachineBasicBlock *Succ2 = BestB.Dest;
1038  // Check to see if tail-duplication would be profitable.
1039  if (allowTailDupPlacement() && shouldTailDuplicate(Succ2) &&
1040  canTailDuplicateUnplacedPreds(BB, Succ2, Chain, BlockFilter) &&
1041  isProfitableToTailDup(BB, Succ2, MBPI->getEdgeProbability(BB, Succ1),
1042  Chain, BlockFilter)) {
1044  MBPI->getEdgeProbability(BB, Succ2), AdjustedSumProb);
1045  dbgs() << " Selected: " << getBlockName(Succ2)
1046  << ", probability: " << Succ2Prob
1047  << " (Tail Duplicate)\n");
1048  Result.BB = Succ2;
1049  Result.ShouldTailDup = true;
1050  return Result;
1051  }
1052  }
1053  // We have already computed the optimal edge for the other side of the
1054  // trellis.
1055  ComputedEdges[BestB.Src] = { BestB.Dest, false };
1056 
1057  auto TrellisSucc = BestA.Dest;
1059  MBPI->getEdgeProbability(BB, TrellisSucc), AdjustedSumProb);
1060  dbgs() << " Selected: " << getBlockName(TrellisSucc)
1061  << ", probability: " << SuccProb << " (Trellis)\n");
1062  Result.BB = TrellisSucc;
1063  return Result;
1064 }
1065 
1066 /// When the option allowTailDupPlacement() is on, this method checks if the
1067 /// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated
1068 /// into all of its unplaced, unfiltered predecessors, that are not BB.
1069 bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
1070  const MachineBasicBlock *BB, MachineBasicBlock *Succ,
1071  const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
1072  if (!shouldTailDuplicate(Succ))
1073  return false;
1074 
1075  // For CFG checking.
1077  BB->succ_end());
1078  for (MachineBasicBlock *Pred : Succ->predecessors()) {
1079  // Make sure all unplaced and unfiltered predecessors can be
1080  // tail-duplicated into.
1081  // Skip any blocks that are already placed or not in this loop.
1082  if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred))
1083  || BlockToChain[Pred] == &Chain)
1084  continue;
1085  if (!TailDup.canTailDuplicate(Succ, Pred)) {
1086  if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors))
1087  // This will result in a trellis after tail duplication, so we don't
1088  // need to copy Succ into this predecessor. In the presence
1089  // of a trellis tail duplication can continue to be profitable.
1090  // For example:
1091  // A A
1092  // |\ |\
1093  // | \ | \
1094  // | C | C+BB
1095  // | / | |
1096  // |/ | |
1097  // BB => BB |
1098  // |\ |\/|
1099  // | \ |/\|
1100  // | D | D
1101  // | / | /
1102  // |/ |/
1103  // Succ Succ
1104  //
1105  // After BB was duplicated into C, the layout looks like the one on the
1106  // right. BB and C now have the same successors. When considering
1107  // whether Succ can be duplicated into all its unplaced predecessors, we
1108  // ignore C.
1109  // We can do this because C already has a profitable fallthrough, namely
1110  // D. TODO(iteratee): ignore sufficiently cold predecessors for
1111  // duplication and for this test.
1112  //
1113  // This allows trellises to be laid out in 2 separate chains
1114  // (A,B,Succ,...) and later (C,D,...) This is a reasonable heuristic
1115  // because it allows the creation of 2 fallthrough paths with links
1116  // between them, and we correctly identify the best layout for these
1117  // CFGs. We want to extend trellises that the user created in addition
1118  // to trellises created by tail-duplication, so we just look for the
1119  // CFG.
1120  continue;
1121  return false;
1122  }
1123  }
1124  return true;
1125 }
1126 
1127 /// Find chains of triangles where we believe it would be profitable to
1128 /// tail-duplicate them all, but a local analysis would not find them.
1129 /// There are 3 ways this can be profitable:
1130 /// 1) The post-dominators marked 50% are actually taken 55% (This shrinks with
1131 /// longer chains)
1132 /// 2) The chains are statically correlated. Branch probabilities have a very
1133 /// U-shaped distribution.
1134 /// [http://nrs.harvard.edu/urn-3:HUL.InstRepos:24015805]
1135 /// If the branches in a chain are likely to be from the same side of the
1136 /// distribution as their predecessor, but are independent at runtime, this
1137 /// transformation is profitable. (Because the cost of being wrong is a small
1138 /// fixed cost, unlike the standard triangle layout where the cost of being
1139 /// wrong scales with the # of triangles.)
1140 /// 3) The chains are dynamically correlated. If the probability that a previous
1141 /// branch was taken positively influences whether the next branch will be
1142 /// taken
1143 /// We believe that 2 and 3 are common enough to justify the small margin in 1.
1144 void MachineBlockPlacement::precomputeTriangleChains() {
1145  struct TriangleChain {
1146  std::vector<MachineBasicBlock *> Edges;
1147 
1148  TriangleChain(MachineBasicBlock *src, MachineBasicBlock *dst)
1149  : Edges({src, dst}) {}
1150 
1151  void append(MachineBasicBlock *dst) {
1152  assert(getKey()->isSuccessor(dst) &&
1153  "Attempting to append a block that is not a successor.");
1154  Edges.push_back(dst);
1155  }
1156 
1157  unsigned count() const { return Edges.size() - 1; }
1158 
1159  MachineBasicBlock *getKey() const {
1160  return Edges.back();
1161  }
1162  };
1163 
1164  if (TriangleChainCount == 0)
1165  return;
1166 
1167  LLVM_DEBUG(dbgs() << "Pre-computing triangle chains.\n");
1168  // Map from last block to the chain that contains it. This allows us to extend
1169  // chains as we find new triangles.
1171  for (MachineBasicBlock &BB : *F) {
1172  // If BB doesn't have 2 successors, it doesn't start a triangle.
1173  if (BB.succ_size() != 2)
1174  continue;
1175  MachineBasicBlock *PDom = nullptr;
1176  for (MachineBasicBlock *Succ : BB.successors()) {
1177  if (!MPDT->dominates(Succ, &BB))
1178  continue;
1179  PDom = Succ;
1180  break;
1181  }
1182  // If BB doesn't have a post-dominating successor, it doesn't form a
1183  // triangle.
1184  if (PDom == nullptr)
1185  continue;
1186  // If PDom has a hint that it is low probability, skip this triangle.
1187  if (MBPI->getEdgeProbability(&BB, PDom) < BranchProbability(50, 100))
1188  continue;
1189  // If PDom isn't eligible for duplication, this isn't the kind of triangle
1190  // we're looking for.
1191  if (!shouldTailDuplicate(PDom))
1192  continue;
1193  bool CanTailDuplicate = true;
1194  // If PDom can't tail-duplicate into it's non-BB predecessors, then this
1195  // isn't the kind of triangle we're looking for.
1196  for (MachineBasicBlock* Pred : PDom->predecessors()) {
1197  if (Pred == &BB)
1198  continue;
1199  if (!TailDup.canTailDuplicate(PDom, Pred)) {
1200  CanTailDuplicate = false;
1201  break;
1202  }
1203  }
1204  // If we can't tail-duplicate PDom to its predecessors, then skip this
1205  // triangle.
1206  if (!CanTailDuplicate)
1207  continue;
1208 
1209  // Now we have an interesting triangle. Insert it if it's not part of an
1210  // existing chain.
1211  // Note: This cannot be replaced with a call insert() or emplace() because
1212  // the find key is BB, but the insert/emplace key is PDom.
1213  auto Found = TriangleChainMap.find(&BB);
1214  // If it is, remove the chain from the map, grow it, and put it back in the
1215  // map with the end as the new key.
1216  if (Found != TriangleChainMap.end()) {
1217  TriangleChain Chain = std::move(Found->second);
1218  TriangleChainMap.erase(Found);
1219  Chain.append(PDom);
1220  TriangleChainMap.insert(std::make_pair(Chain.getKey(), std::move(Chain)));
1221  } else {
1222  auto InsertResult = TriangleChainMap.try_emplace(PDom, &BB, PDom);
1223  assert(InsertResult.second && "Block seen twice.");
1224  (void)InsertResult;
1225  }
1226  }
1227 
1228  // Iterating over a DenseMap is safe here, because the only thing in the body
1229  // of the loop is inserting into another DenseMap (ComputedEdges).
1230  // ComputedEdges is never iterated, so this doesn't lead to non-determinism.
1231  for (auto &ChainPair : TriangleChainMap) {
1232  TriangleChain &Chain = ChainPair.second;
1233  // Benchmarking has shown that due to branch correlation duplicating 2 or
1234  // more triangles is profitable, despite the calculations assuming
1235  // independence.
1236  if (Chain.count() < TriangleChainCount)
1237  continue;
1238  MachineBasicBlock *dst = Chain.Edges.back();
1239  Chain.Edges.pop_back();
1240  for (MachineBasicBlock *src : reverse(Chain.Edges)) {
1241  LLVM_DEBUG(dbgs() << "Marking edge: " << getBlockName(src) << "->"
1242  << getBlockName(dst)
1243  << " as pre-computed based on triangles.\n");
1244 
1245  auto InsertResult = ComputedEdges.insert({src, {dst, true}});
1246  assert(InsertResult.second && "Block seen twice.");
1247  (void)InsertResult;
1248 
1249  dst = src;
1250  }
1251  }
1252 }
1253 
1254 // When profile is not present, return the StaticLikelyProb.
1255 // When profile is available, we need to handle the triangle-shape CFG.
1257  const MachineBasicBlock *BB) {
1258  if (!BB->getParent()->getFunction().hasProfileData())
1259  return BranchProbability(StaticLikelyProb, 100);
1260  if (BB->succ_size() == 2) {
1261  const MachineBasicBlock *Succ1 = *BB->succ_begin();
1262  const MachineBasicBlock *Succ2 = *(BB->succ_begin() + 1);
1263  if (Succ1->isSuccessor(Succ2) || Succ2->isSuccessor(Succ1)) {
1264  /* See case 1 below for the cost analysis. For BB->Succ to
1265  * be taken with smaller cost, the following needs to hold:
1266  * Prob(BB->Succ) > 2 * Prob(BB->Pred)
1267  * So the threshold T in the calculation below
1268  * (1-T) * Prob(BB->Succ) > T * Prob(BB->Pred)
1269  * So T / (1 - T) = 2, Yielding T = 2/3
1270  * Also adding user specified branch bias, we have
1271  * T = (2/3)*(ProfileLikelyProb/50)
1272  * = (2*ProfileLikelyProb)/150)
1273  */
1274  return BranchProbability(2 * ProfileLikelyProb, 150);
1275  }
1276  }
1277  return BranchProbability(ProfileLikelyProb, 100);
1278 }
1279 
1280 /// Checks to see if the layout candidate block \p Succ has a better layout
1281 /// predecessor than \c BB. If yes, returns true.
1282 /// \p SuccProb: The probability adjusted for only remaining blocks.
1283 /// Only used for logging
1284 /// \p RealSuccProb: The un-adjusted probability.
1285 /// \p Chain: The chain that BB belongs to and Succ is being considered for.
1286 /// \p BlockFilter: if non-null, the set of blocks that make up the loop being
1287 /// considered
1288 bool MachineBlockPlacement::hasBetterLayoutPredecessor(
1289  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
1290  const BlockChain &SuccChain, BranchProbability SuccProb,
1291  BranchProbability RealSuccProb, const BlockChain &Chain,
1292  const BlockFilterSet *BlockFilter) {
1293 
1294  // There isn't a better layout when there are no unscheduled predecessors.
1295  if (SuccChain.UnscheduledPredecessors == 0)
1296  return false;
1297 
1298  // There are two basic scenarios here:
1299  // -------------------------------------
1300  // Case 1: triangular shape CFG (if-then):
1301  // BB
1302  // | \
1303  // | \
1304  // | Pred
1305  // | /
1306  // Succ
1307  // In this case, we are evaluating whether to select edge -> Succ, e.g.
1308  // set Succ as the layout successor of BB. Picking Succ as BB's
1309  // successor breaks the CFG constraints (FIXME: define these constraints).
1310  // With this layout, Pred BB
1311  // is forced to be outlined, so the overall cost will be cost of the
1312  // branch taken from BB to Pred, plus the cost of back taken branch
1313  // from Pred to Succ, as well as the additional cost associated
1314  // with the needed unconditional jump instruction from Pred To Succ.
1315 
1316  // The cost of the topological order layout is the taken branch cost
1317  // from BB to Succ, so to make BB->Succ a viable candidate, the following
1318  // must hold:
1319  // 2 * freq(BB->Pred) * taken_branch_cost + unconditional_jump_cost
1320  // < freq(BB->Succ) * taken_branch_cost.
1321  // Ignoring unconditional jump cost, we get
1322  // freq(BB->Succ) > 2 * freq(BB->Pred), i.e.,
1323  // prob(BB->Succ) > 2 * prob(BB->Pred)
1324  //
1325  // When real profile data is available, we can precisely compute the
1326  // probability threshold that is needed for edge BB->Succ to be considered.
1327  // Without profile data, the heuristic requires the branch bias to be
1328  // a lot larger to make sure the signal is very strong (e.g. 80% default).
1329  // -----------------------------------------------------------------
1330  // Case 2: diamond like CFG (if-then-else):
1331  // S
1332  // / \
1333  // | \
1334  // BB Pred
1335  // \ /
1336  // Succ
1337  // ..
1338  //
1339  // The current block is BB and edge BB->Succ is now being evaluated.
1340  // Note that edge S->BB was previously already selected because
1341  // prob(S->BB) > prob(S->Pred).
1342  // At this point, 2 blocks can be placed after BB: Pred or Succ. If we
1343  // choose Pred, we will have a topological ordering as shown on the left
1344  // in the picture below. If we choose Succ, we have the solution as shown
1345  // on the right:
1346  //
1347  // topo-order:
1348  //
1349  // S----- ---S
1350  // | | | |
1351  // ---BB | | BB
1352  // | | | |
1353  // | Pred-- | Succ--
1354  // | | | |
1355  // ---Succ ---Pred--
1356  //
1357  // cost = freq(S->Pred) + freq(BB->Succ) cost = 2 * freq (S->Pred)
1358  // = freq(S->Pred) + freq(S->BB)
1359  //
1360  // If we have profile data (i.e, branch probabilities can be trusted), the
1361  // cost (number of taken branches) with layout S->BB->Succ->Pred is 2 *
1362  // freq(S->Pred) while the cost of topo order is freq(S->Pred) + freq(S->BB).
1363  // We know Prob(S->BB) > Prob(S->Pred), so freq(S->BB) > freq(S->Pred), which
1364  // means the cost of topological order is greater.
1365  // When profile data is not available, however, we need to be more
1366  // conservative. If the branch prediction is wrong, breaking the topo-order
1367  // will actually yield a layout with large cost. For this reason, we need
1368  // strong biased branch at block S with Prob(S->BB) in order to select
1369  // BB->Succ. This is equivalent to looking the CFG backward with backward
1370  // edge: Prob(Succ->BB) needs to >= HotProb in order to be selected (without
1371  // profile data).
1372  // --------------------------------------------------------------------------
1373  // Case 3: forked diamond
1374  // S
1375  // / \
1376  // / \
1377  // BB Pred
1378  // | \ / |
1379  // | \ / |
1380  // | X |
1381  // | / \ |
1382  // | / \ |
1383  // S1 S2
1384  //
1385  // The current block is BB and edge BB->S1 is now being evaluated.
1386  // As above S->BB was already selected because
1387  // prob(S->BB) > prob(S->Pred). Assume that prob(BB->S1) >= prob(BB->S2).
1388  //
1389  // topo-order:
1390  //
1391  // S-------| ---S
1392  // | | | |
1393  // ---BB | | BB
1394  // | | | |
1395  // | Pred----| | S1----
1396  // | | | |
1397  // --(S1 or S2) ---Pred--
1398  // |
1399  // S2
1400  //
1401  // topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2)
1402  // + min(freq(Pred->S1), freq(Pred->S2))
1403  // Non-topo-order cost:
1404  // non-topo-cost = 2 * freq(S->Pred) + freq(BB->S2).
1405  // To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2))
1406  // is 0. Then the non topo layout is better when
1407  // freq(S->Pred) < freq(BB->S1).
1408  // This is exactly what is checked below.
1409  // Note there are other shapes that apply (Pred may not be a single block,
1410  // but they all fit this general pattern.)
1412 
1413  // Make sure that a hot successor doesn't have a globally more
1414  // important predecessor.
1415  BlockFrequency CandidateEdgeFreq = MBFI->getBlockFreq(BB) * RealSuccProb;
1416  bool BadCFGConflict = false;
1417 
1418  for (MachineBasicBlock *Pred : Succ->predecessors()) {
1419  if (Pred == Succ || BlockToChain[Pred] == &SuccChain ||
1420  (BlockFilter && !BlockFilter->count(Pred)) ||
1421  BlockToChain[Pred] == &Chain ||
1422  // This check is redundant except for look ahead. This function is
1423  // called for lookahead by isProfitableToTailDup when BB hasn't been
1424  // placed yet.
1425  (Pred == BB))
1426  continue;
1427  // Do backward checking.
1428  // For all cases above, we need a backward checking to filter out edges that
1429  // are not 'strongly' biased.
1430  // BB Pred
1431  // \ /
1432  // Succ
1433  // We select edge BB->Succ if
1434  // freq(BB->Succ) > freq(Succ) * HotProb
1435  // i.e. freq(BB->Succ) > freq(BB->Succ) * HotProb + freq(Pred->Succ) *
1436  // HotProb
1437  // i.e. freq((BB->Succ) * (1 - HotProb) > freq(Pred->Succ) * HotProb
1438  // Case 1 is covered too, because the first equation reduces to:
1439  // prob(BB->Succ) > HotProb. (freq(Succ) = freq(BB) for a triangle)
1440  BlockFrequency PredEdgeFreq =
1441  MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Succ);
1442  if (PredEdgeFreq * HotProb >= CandidateEdgeFreq * HotProb.getCompl()) {
1443  BadCFGConflict = true;
1444  break;
1445  }
1446  }
1447 
1448  if (BadCFGConflict) {
1449  LLVM_DEBUG(dbgs() << " Not a candidate: " << getBlockName(Succ) << " -> "
1450  << SuccProb << " (prob) (non-cold CFG conflict)\n");
1451  return true;
1452  }
1453 
1454  return false;
1455 }
1456 
1457 /// Select the best successor for a block.
1458 ///
1459 /// This looks across all successors of a particular block and attempts to
1460 /// select the "best" one to be the layout successor. It only considers direct
1461 /// successors which also pass the block filter. It will attempt to avoid
1462 /// breaking CFG structure, but cave and break such structures in the case of
1463 /// very hot successor edges.
1464 ///
1465 /// \returns The best successor block found, or null if none are viable, along
1466 /// with a boolean indicating if tail duplication is necessary.
1467 MachineBlockPlacement::BlockAndTailDupResult
1468 MachineBlockPlacement::selectBestSuccessor(
1469  const MachineBasicBlock *BB, const BlockChain &Chain,
1470  const BlockFilterSet *BlockFilter) {
1471  const BranchProbability HotProb(StaticLikelyProb, 100);
1472 
1473  BlockAndTailDupResult BestSucc = { nullptr, false };
1474  auto BestProb = BranchProbability::getZero();
1475 
1477  auto AdjustedSumProb =
1478  collectViableSuccessors(BB, Chain, BlockFilter, Successors);
1479 
1480  LLVM_DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB)
1481  << "\n");
1482 
1483  // if we already precomputed the best successor for BB, return that if still
1484  // applicable.
1485  auto FoundEdge = ComputedEdges.find(BB);
1486  if (FoundEdge != ComputedEdges.end()) {
1487  MachineBasicBlock *Succ = FoundEdge->second.BB;
1488  ComputedEdges.erase(FoundEdge);
1489  BlockChain *SuccChain = BlockToChain[Succ];
1490  if (BB->isSuccessor(Succ) && (!BlockFilter || BlockFilter->count(Succ)) &&
1491  SuccChain != &Chain && Succ == *SuccChain->begin())
1492  return FoundEdge->second;
1493  }
1494 
1495  // if BB is part of a trellis, Use the trellis to determine the optimal
1496  // fallthrough edges
1497  if (isTrellis(BB, Successors, Chain, BlockFilter))
1498  return getBestTrellisSuccessor(BB, Successors, AdjustedSumProb, Chain,
1499  BlockFilter);
1500 
1501  // For blocks with CFG violations, we may be able to lay them out anyway with
1502  // tail-duplication. We keep this vector so we can perform the probability
1503  // calculations the minimum number of times.
1505  DupCandidates;
1506  for (MachineBasicBlock *Succ : Successors) {
1507  auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);
1508  BranchProbability SuccProb =
1509  getAdjustedProbability(RealSuccProb, AdjustedSumProb);
1510 
1511  BlockChain &SuccChain = *BlockToChain[Succ];
1512  // Skip the edge \c BB->Succ if block \c Succ has a better layout
1513  // predecessor that yields lower global cost.
1514  if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb,
1515  Chain, BlockFilter)) {
1516  // If tail duplication would make Succ profitable, place it.
1517  if (allowTailDupPlacement() && shouldTailDuplicate(Succ))
1518  DupCandidates.push_back(std::make_tuple(SuccProb, Succ));
1519  continue;
1520  }
1521 
1522  LLVM_DEBUG(
1523  dbgs() << " Candidate: " << getBlockName(Succ)
1524  << ", probability: " << SuccProb
1525  << (SuccChain.UnscheduledPredecessors != 0 ? " (CFG break)" : "")
1526  << "\n");
1527 
1528  if (BestSucc.BB && BestProb >= SuccProb) {
1529  LLVM_DEBUG(dbgs() << " Not the best candidate, continuing\n");
1530  continue;
1531  }
1532 
1533  LLVM_DEBUG(dbgs() << " Setting it as best candidate\n");
1534  BestSucc.BB = Succ;
1535  BestProb = SuccProb;
1536  }
1537  // Handle the tail duplication candidates in order of decreasing probability.
1538  // Stop at the first one that is profitable. Also stop if they are less
1539  // profitable than BestSucc. Position is important because we preserve it and
1540  // prefer first best match. Here we aren't comparing in order, so we capture
1541  // the position instead.
1542  llvm::stable_sort(DupCandidates,
1543  [](std::tuple<BranchProbability, MachineBasicBlock *> L,
1544  std::tuple<BranchProbability, MachineBasicBlock *> R) {
1545  return std::get<0>(L) > std::get<0>(R);
1546  });
1547  for (auto &Tup : DupCandidates) {
1548  BranchProbability DupProb;
1549  MachineBasicBlock *Succ;
1550  std::tie(DupProb, Succ) = Tup;
1551  if (DupProb < BestProb)
1552  break;
1553  if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter)
1554  && (isProfitableToTailDup(BB, Succ, BestProb, Chain, BlockFilter))) {
1555  LLVM_DEBUG(dbgs() << " Candidate: " << getBlockName(Succ)
1556  << ", probability: " << DupProb
1557  << " (Tail Duplicate)\n");
1558  BestSucc.BB = Succ;
1559  BestSucc.ShouldTailDup = true;
1560  break;
1561  }
1562  }
1563 
1564  if (BestSucc.BB)
1565  LLVM_DEBUG(dbgs() << " Selected: " << getBlockName(BestSucc.BB) << "\n");
1566 
1567  return BestSucc;
1568 }
1569 
1570 /// Select the best block from a worklist.
1571 ///
1572 /// This looks through the provided worklist as a list of candidate basic
1573 /// blocks and select the most profitable one to place. The definition of
1574 /// profitable only really makes sense in the context of a loop. This returns
1575 /// the most frequently visited block in the worklist, which in the case of
1576 /// a loop, is the one most desirable to be physically close to the rest of the
1577 /// loop body in order to improve i-cache behavior.
1578 ///
1579 /// \returns The best block found, or null if none are viable.
1580 MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
1581  const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) {
1582  // Once we need to walk the worklist looking for a candidate, cleanup the
1583  // worklist of already placed entries.
1584  // FIXME: If this shows up on profiles, it could be folded (at the cost of
1585  // some code complexity) into the loop below.
1586  WorkList.erase(llvm::remove_if(WorkList,
1587  [&](MachineBasicBlock *BB) {
1588  return BlockToChain.lookup(BB) == &Chain;
1589  }),
1590  WorkList.end());
1591 
1592  if (WorkList.empty())
1593  return nullptr;
1594 
1595  bool IsEHPad = WorkList[0]->isEHPad();
1596 
1597  MachineBasicBlock *BestBlock = nullptr;
1598  BlockFrequency BestFreq;
1599  for (MachineBasicBlock *MBB : WorkList) {
1600  assert(MBB->isEHPad() == IsEHPad &&
1601  "EHPad mismatch between block and work list.");
1602 
1603  BlockChain &SuccChain = *BlockToChain[MBB];
1604  if (&SuccChain == &Chain)
1605  continue;
1606 
1607  assert(SuccChain.UnscheduledPredecessors == 0 &&
1608  "Found CFG-violating block");
1609 
1610  BlockFrequency CandidateFreq = MBFI->getBlockFreq(MBB);
1611  LLVM_DEBUG(dbgs() << " " << getBlockName(MBB) << " -> ";
1612  MBFI->printBlockFreq(dbgs(), CandidateFreq) << " (freq)\n");
1613 
1614  // For ehpad, we layout the least probable first as to avoid jumping back
1615  // from least probable landingpads to more probable ones.
1616  //
1617  // FIXME: Using probability is probably (!) not the best way to achieve
1618  // this. We should probably have a more principled approach to layout
1619  // cleanup code.
1620  //
1621  // The goal is to get:
1622  //
1623  // +--------------------------+
1624  // | V
1625  // InnerLp -> InnerCleanup OuterLp -> OuterCleanup -> Resume
1626  //
1627  // Rather than:
1628  //
1629  // +-------------------------------------+
1630  // V |
1631  // OuterLp -> OuterCleanup -> Resume InnerLp -> InnerCleanup
1632  if (BestBlock && (IsEHPad ^ (BestFreq >= CandidateFreq)))
1633  continue;
1634 
1635  BestBlock = MBB;
1636  BestFreq = CandidateFreq;
1637  }
1638 
1639  return BestBlock;
1640 }
1641 
1642 /// Retrieve the first unplaced basic block.
1643 ///
1644 /// This routine is called when we are unable to use the CFG to walk through
1645 /// all of the basic blocks and form a chain due to unnatural loops in the CFG.
1646 /// We walk through the function's blocks in order, starting from the
1647 /// LastUnplacedBlockIt. We update this iterator on each call to avoid
1648 /// re-scanning the entire sequence on repeated calls to this routine.
1649 MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
1650  const BlockChain &PlacedChain,
1651  MachineFunction::iterator &PrevUnplacedBlockIt,
1652  const BlockFilterSet *BlockFilter) {
1653  for (MachineFunction::iterator I = PrevUnplacedBlockIt, E = F->end(); I != E;
1654  ++I) {
1655  if (BlockFilter && !BlockFilter->count(&*I))
1656  continue;
1657  if (BlockToChain[&*I] != &PlacedChain) {
1658  PrevUnplacedBlockIt = I;
1659  // Now select the head of the chain to which the unplaced block belongs
1660  // as the block to place. This will force the entire chain to be placed,
1661  // and satisfies the requirements of merging chains.
1662  return *BlockToChain[&*I]->begin();
1663  }
1664  }
1665  return nullptr;
1666 }
1667 
1668 void MachineBlockPlacement::fillWorkLists(
1669  const MachineBasicBlock *MBB,
1670  SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
1671  const BlockFilterSet *BlockFilter = nullptr) {
1672  BlockChain &Chain = *BlockToChain[MBB];
1673  if (!UpdatedPreds.insert(&Chain).second)
1674  return;
1675 
1676  assert(
1677  Chain.UnscheduledPredecessors == 0 &&
1678  "Attempting to place block with unscheduled predecessors in worklist.");
1679  for (MachineBasicBlock *ChainBB : Chain) {
1680  assert(BlockToChain[ChainBB] == &Chain &&
1681  "Block in chain doesn't match BlockToChain map.");
1682  for (MachineBasicBlock *Pred : ChainBB->predecessors()) {
1683  if (BlockFilter && !BlockFilter->count(Pred))
1684  continue;
1685  if (BlockToChain[Pred] == &Chain)
1686  continue;
1687  ++Chain.UnscheduledPredecessors;
1688  }
1689  }
1690 
1691  if (Chain.UnscheduledPredecessors != 0)
1692  return;
1693 
1694  MachineBasicBlock *BB = *Chain.begin();
1695  if (BB->isEHPad())
1696  EHPadWorkList.push_back(BB);
1697  else
1698  BlockWorkList.push_back(BB);
1699 }
1700 
1701 void MachineBlockPlacement::buildChain(
1702  const MachineBasicBlock *HeadBB, BlockChain &Chain,
1703  BlockFilterSet *BlockFilter) {
1704  assert(HeadBB && "BB must not be null.\n");
1705  assert(BlockToChain[HeadBB] == &Chain && "BlockToChainMap mis-match.\n");
1706  MachineFunction::iterator PrevUnplacedBlockIt = F->begin();
1707 
1708  const MachineBasicBlock *LoopHeaderBB = HeadBB;
1709  markChainSuccessors(Chain, LoopHeaderBB, BlockFilter);
1710  MachineBasicBlock *BB = *std::prev(Chain.end());
1711  while (true) {
1712  assert(BB && "null block found at end of chain in loop.");
1713  assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match in loop.");
1714  assert(*std::prev(Chain.end()) == BB && "BB Not found at end of chain.");
1715 
1716 
1717  // Look for the best viable successor if there is one to place immediately
1718  // after this block.
1719  auto Result = selectBestSuccessor(BB, Chain, BlockFilter);
1720  MachineBasicBlock* BestSucc = Result.BB;
1721  bool ShouldTailDup = Result.ShouldTailDup;
1722  if (allowTailDupPlacement())
1723  ShouldTailDup |= (BestSucc && shouldTailDuplicate(BestSucc));
1724 
1725  // If an immediate successor isn't available, look for the best viable
1726  // block among those we've identified as not violating the loop's CFG at
1727  // this point. This won't be a fallthrough, but it will increase locality.
1728  if (!BestSucc)
1729  BestSucc = selectBestCandidateBlock(Chain, BlockWorkList);
1730  if (!BestSucc)
1731  BestSucc = selectBestCandidateBlock(Chain, EHPadWorkList);
1732 
1733  if (!BestSucc) {
1734  BestSucc = getFirstUnplacedBlock(Chain, PrevUnplacedBlockIt, BlockFilter);
1735  if (!BestSucc)
1736  break;
1737 
1738  LLVM_DEBUG(dbgs() << "Unnatural loop CFG detected, forcibly merging the "
1739  "layout successor until the CFG reduces\n");
1740  }
1741 
1742  // Placement may have changed tail duplication opportunities.
1743  // Check for that now.
1744  if (allowTailDupPlacement() && BestSucc && ShouldTailDup) {
1745  // If the chosen successor was duplicated into all its predecessors,
1746  // don't bother laying it out, just go round the loop again with BB as
1747  // the chain end.
1748  if (repeatedlyTailDuplicateBlock(BestSucc, BB, LoopHeaderBB, Chain,
1749  BlockFilter, PrevUnplacedBlockIt))
1750  continue;
1751  }
1752 
1753  // Place this block, updating the datastructures to reflect its placement.
1754  BlockChain &SuccChain = *BlockToChain[BestSucc];
1755  // Zero out UnscheduledPredecessors for the successor we're about to merge in case
1756  // we selected a successor that didn't fit naturally into the CFG.
1757  SuccChain.UnscheduledPredecessors = 0;
1758  LLVM_DEBUG(dbgs() << "Merging from " << getBlockName(BB) << " to "
1759  << getBlockName(BestSucc) << "\n");
1760  markChainSuccessors(SuccChain, LoopHeaderBB, BlockFilter);
1761  Chain.merge(BestSucc, &SuccChain);
1762  BB = *std::prev(Chain.end());
1763  }
1764 
1765  LLVM_DEBUG(dbgs() << "Finished forming chain for header block "
1766  << getBlockName(*Chain.begin()) << "\n");
1767 }
1768 
1769 // If bottom of block BB has only one successor OldTop, in most cases it is
1770 // profitable to move it before OldTop, except the following case:
1771 //
1772 // -->OldTop<-
1773 // | . |
1774 // | . |
1775 // | . |
1776 // ---Pred |
1777 // | |
1778 // BB-----
1779 //
1780 // If BB is moved before OldTop, Pred needs a taken branch to BB, and it can't
1781 // layout the other successor below it, so it can't reduce taken branch.
1782 // In this case we keep its original layout.
1783 bool
1784 MachineBlockPlacement::canMoveBottomBlockToTop(
1785  const MachineBasicBlock *BottomBlock,
1786  const MachineBasicBlock *OldTop) {
1787  if (BottomBlock->pred_size() != 1)
1788  return true;
1789  MachineBasicBlock *Pred = *BottomBlock->pred_begin();
1790  if (Pred->succ_size() != 2)
1791  return true;
1792 
1793  MachineBasicBlock *OtherBB = *Pred->succ_begin();
1794  if (OtherBB == BottomBlock)
1795  OtherBB = *Pred->succ_rbegin();
1796  if (OtherBB == OldTop)
1797  return false;
1798 
1799  return true;
1800 }
1801 
1802 // Find out the possible fall through frequence to the top of a loop.
1804 MachineBlockPlacement::TopFallThroughFreq(
1805  const MachineBasicBlock *Top,
1806  const BlockFilterSet &LoopBlockSet) {
1807  BlockFrequency MaxFreq = 0;
1808  for (MachineBasicBlock *Pred : Top->predecessors()) {
1809  BlockChain *PredChain = BlockToChain[Pred];
1810  if (!LoopBlockSet.count(Pred) &&
1811  (!PredChain || Pred == *std::prev(PredChain->end()))) {
1812  // Found a Pred block can be placed before Top.
1813  // Check if Top is the best successor of Pred.
1814  auto TopProb = MBPI->getEdgeProbability(Pred, Top);
1815  bool TopOK = true;
1816  for (MachineBasicBlock *Succ : Pred->successors()) {
1817  auto SuccProb = MBPI->getEdgeProbability(Pred, Succ);
1818  BlockChain *SuccChain = BlockToChain[Succ];
1819  // Check if Succ can be placed after Pred.
1820  // Succ should not be in any chain, or it is the head of some chain.
1821  if (!LoopBlockSet.count(Succ) && (SuccProb > TopProb) &&
1822  (!SuccChain || Succ == *SuccChain->begin())) {
1823  TopOK = false;
1824  break;
1825  }
1826  }
1827  if (TopOK) {
1828  BlockFrequency EdgeFreq = MBFI->getBlockFreq(Pred) *
1829  MBPI->getEdgeProbability(Pred, Top);
1830  if (EdgeFreq > MaxFreq)
1831  MaxFreq = EdgeFreq;
1832  }
1833  }
1834  }
1835  return MaxFreq;
1836 }
1837 
1838 // Compute the fall through gains when move NewTop before OldTop.
1839 //
1840 // In following diagram, edges marked as "-" are reduced fallthrough, edges
1841 // marked as "+" are increased fallthrough, this function computes
1842 //
1843 // SUM(increased fallthrough) - SUM(decreased fallthrough)
1844 //
1845 // |
1846 // | -
1847 // V
1848 // --->OldTop
1849 // | .
1850 // | .
1851 // +| . +
1852 // | Pred --->
1853 // | |-
1854 // | V
1855 // --- NewTop <---
1856 // |-
1857 // V
1858 //
1860 MachineBlockPlacement::FallThroughGains(
1861  const MachineBasicBlock *NewTop,
1862  const MachineBasicBlock *OldTop,
1863  const MachineBasicBlock *ExitBB,
1864  const BlockFilterSet &LoopBlockSet) {
1865  BlockFrequency FallThrough2Top = TopFallThroughFreq(OldTop, LoopBlockSet);
1866  BlockFrequency FallThrough2Exit = 0;
1867  if (ExitBB)
1868  FallThrough2Exit = MBFI->getBlockFreq(NewTop) *
1869  MBPI->getEdgeProbability(NewTop, ExitBB);
1870  BlockFrequency BackEdgeFreq = MBFI->getBlockFreq(NewTop) *
1871  MBPI->getEdgeProbability(NewTop, OldTop);
1872 
1873  // Find the best Pred of NewTop.
1874  MachineBasicBlock *BestPred = nullptr;
1875  BlockFrequency FallThroughFromPred = 0;
1876  for (MachineBasicBlock *Pred : NewTop->predecessors()) {
1877  if (!LoopBlockSet.count(Pred))
1878  continue;
1879  BlockChain *PredChain = BlockToChain[Pred];
1880  if (!PredChain || Pred == *std::prev(PredChain->end())) {
1881  BlockFrequency EdgeFreq = MBFI->getBlockFreq(Pred) *
1882  MBPI->getEdgeProbability(Pred, NewTop);
1883  if (EdgeFreq > FallThroughFromPred) {
1884  FallThroughFromPred = EdgeFreq;
1885  BestPred = Pred;
1886  }
1887  }
1888  }
1889 
1890  // If NewTop is not placed after Pred, another successor can be placed
1891  // after Pred.
1892  BlockFrequency NewFreq = 0;
1893  if (BestPred) {
1894  for (MachineBasicBlock *Succ : BestPred->successors()) {
1895  if ((Succ == NewTop) || (Succ == BestPred) || !LoopBlockSet.count(Succ))
1896  continue;
1897  if (ComputedEdges.find(Succ) != ComputedEdges.end())
1898  continue;
1899  BlockChain *SuccChain = BlockToChain[Succ];
1900  if ((SuccChain && (Succ != *SuccChain->begin())) ||
1901  (SuccChain == BlockToChain[BestPred]))
1902  continue;
1903  BlockFrequency EdgeFreq = MBFI->getBlockFreq(BestPred) *
1904  MBPI->getEdgeProbability(BestPred, Succ);
1905  if (EdgeFreq > NewFreq)
1906  NewFreq = EdgeFreq;
1907  }
1908  BlockFrequency OrigEdgeFreq = MBFI->getBlockFreq(BestPred) *
1909  MBPI->getEdgeProbability(BestPred, NewTop);
1910  if (NewFreq > OrigEdgeFreq) {
1911  // If NewTop is not the best successor of Pred, then Pred doesn't
1912  // fallthrough to NewTop. So there is no FallThroughFromPred and
1913  // NewFreq.
1914  NewFreq = 0;
1915  FallThroughFromPred = 0;
1916  }
1917  }
1918 
1919  BlockFrequency Result = 0;
1920  BlockFrequency Gains = BackEdgeFreq + NewFreq;
1921  BlockFrequency Lost = FallThrough2Top + FallThrough2Exit +
1922  FallThroughFromPred;
1923  if (Gains > Lost)
1924  Result = Gains - Lost;
1925  return Result;
1926 }
1927 
1928 /// Helper function of findBestLoopTop. Find the best loop top block
1929 /// from predecessors of old top.
1930 ///
1931 /// Look for a block which is strictly better than the old top for laying
1932 /// out before the old top of the loop. This looks for only two patterns:
1933 ///
1934 /// 1. a block has only one successor, the old loop top
1935 ///
1936 /// Because such a block will always result in an unconditional jump,
1937 /// rotating it in front of the old top is always profitable.
1938 ///
1939 /// 2. a block has two successors, one is old top, another is exit
1940 /// and it has more than one predecessors
1941 ///
1942 /// If it is below one of its predecessors P, only P can fall through to
1943 /// it, all other predecessors need a jump to it, and another conditional
1944 /// jump to loop header. If it is moved before loop header, all its
1945 /// predecessors jump to it, then fall through to loop header. So all its
1946 /// predecessors except P can reduce one taken branch.
1947 /// At the same time, move it before old top increases the taken branch
1948 /// to loop exit block, so the reduced taken branch will be compared with
1949 /// the increased taken branch to the loop exit block.
1951 MachineBlockPlacement::findBestLoopTopHelper(
1952  MachineBasicBlock *OldTop,
1953  const MachineLoop &L,
1954  const BlockFilterSet &LoopBlockSet) {
1955  // Check that the header hasn't been fused with a preheader block due to
1956  // crazy branches. If it has, we need to start with the header at the top to
1957  // prevent pulling the preheader into the loop body.
1958  BlockChain &HeaderChain = *BlockToChain[OldTop];
1959  if (!LoopBlockSet.count(*HeaderChain.begin()))
1960  return OldTop;
1961 
1962  LLVM_DEBUG(dbgs() << "Finding best loop top for: " << getBlockName(OldTop)
1963  << "\n");
1964 
1965  BlockFrequency BestGains = 0;
1966  MachineBasicBlock *BestPred = nullptr;
1967  for (MachineBasicBlock *Pred : OldTop->predecessors()) {
1968  if (!LoopBlockSet.count(Pred))
1969  continue;
1970  if (Pred == L.getHeader())
1971  continue;
1972  LLVM_DEBUG(dbgs() << " old top pred: " << getBlockName(Pred) << ", has "
1973  << Pred->succ_size() << " successors, ";
1974  MBFI->printBlockFreq(dbgs(), Pred) << " freq\n");
1975  if (Pred->succ_size() > 2)
1976  continue;
1977 
1978  MachineBasicBlock *OtherBB = nullptr;
1979  if (Pred->succ_size() == 2) {
1980  OtherBB = *Pred->succ_begin();
1981  if (OtherBB == OldTop)
1982  OtherBB = *Pred->succ_rbegin();
1983  }
1984 
1985  if (!canMoveBottomBlockToTop(Pred, OldTop))
1986  continue;
1987 
1988  BlockFrequency Gains = FallThroughGains(Pred, OldTop, OtherBB,
1989  LoopBlockSet);
1990  if ((Gains > 0) && (Gains > BestGains ||
1991  ((Gains == BestGains) && Pred->isLayoutSuccessor(OldTop)))) {
1992  BestPred = Pred;
1993  BestGains = Gains;
1994  }
1995  }
1996 
1997  // If no direct predecessor is fine, just use the loop header.
1998  if (!BestPred) {
1999  LLVM_DEBUG(dbgs() << " final top unchanged\n");
2000  return OldTop;
2001  }
2002 
2003  // Walk backwards through any straight line of predecessors.
2004  while (BestPred->pred_size() == 1 &&
2005  (*BestPred->pred_begin())->succ_size() == 1 &&
2006  *BestPred->pred_begin() != L.getHeader())
2007  BestPred = *BestPred->pred_begin();
2008 
2009  LLVM_DEBUG(dbgs() << " final top: " << getBlockName(BestPred) << "\n");
2010  return BestPred;
2011 }
2012 
2013 /// Find the best loop top block for layout.
2014 ///
2015 /// This function iteratively calls findBestLoopTopHelper, until no new better
2016 /// BB can be found.
2018 MachineBlockPlacement::findBestLoopTop(const MachineLoop &L,
2019  const BlockFilterSet &LoopBlockSet) {
2020  // Placing the latch block before the header may introduce an extra branch
2021  // that skips this block the first time the loop is executed, which we want
2022  // to avoid when optimising for size.
2023  // FIXME: in theory there is a case that does not introduce a new branch,
2024  // i.e. when the layout predecessor does not fallthrough to the loop header.
2025  // In practice this never happens though: there always seems to be a preheader
2026  // that can fallthrough and that is also placed before the header.
2027  if (F->getFunction().hasOptSize())
2028  return L.getHeader();
2029 
2030  MachineBasicBlock *OldTop = nullptr;
2031  MachineBasicBlock *NewTop = L.getHeader();
2032  while (NewTop != OldTop) {
2033  OldTop = NewTop;
2034  NewTop = findBestLoopTopHelper(OldTop, L, LoopBlockSet);
2035  if (NewTop != OldTop)
2036  ComputedEdges[NewTop] = { OldTop, false };
2037  }
2038  return NewTop;
2039 }
2040 
2041 /// Find the best loop exiting block for layout.
2042 ///
2043 /// This routine implements the logic to analyze the loop looking for the best
2044 /// block to layout at the top of the loop. Typically this is done to maximize
2045 /// fallthrough opportunities.
2047 MachineBlockPlacement::findBestLoopExit(const MachineLoop &L,
2048  const BlockFilterSet &LoopBlockSet,
2049  BlockFrequency &ExitFreq) {
2050  // We don't want to layout the loop linearly in all cases. If the loop header
2051  // is just a normal basic block in the loop, we want to look for what block
2052  // within the loop is the best one to layout at the top. However, if the loop
2053  // header has be pre-merged into a chain due to predecessors not having
2054  // analyzable branches, *and* the predecessor it is merged with is *not* part
2055  // of the loop, rotating the header into the middle of the loop will create
2056  // a non-contiguous range of blocks which is Very Bad. So start with the
2057  // header and only rotate if safe.
2058  BlockChain &HeaderChain = *BlockToChain[L.getHeader()];
2059  if (!LoopBlockSet.count(*HeaderChain.begin()))
2060  return nullptr;
2061 
2062  BlockFrequency BestExitEdgeFreq;
2063  unsigned BestExitLoopDepth = 0;
2064  MachineBasicBlock *ExitingBB = nullptr;
2065  // If there are exits to outer loops, loop rotation can severely limit
2066  // fallthrough opportunities unless it selects such an exit. Keep a set of
2067  // blocks where rotating to exit with that block will reach an outer loop.
2068  SmallPtrSet<MachineBasicBlock *, 4> BlocksExitingToOuterLoop;
2069 
2070  LLVM_DEBUG(dbgs() << "Finding best loop exit for: "
2071  << getBlockName(L.getHeader()) << "\n");
2072  for (MachineBasicBlock *MBB : L.getBlocks()) {
2073  BlockChain &Chain = *BlockToChain[MBB];
2074  // Ensure that this block is at the end of a chain; otherwise it could be
2075  // mid-way through an inner loop or a successor of an unanalyzable branch.
2076  if (MBB != *std::prev(Chain.end()))
2077  continue;
2078 
2079  // Now walk the successors. We need to establish whether this has a viable
2080  // exiting successor and whether it has a viable non-exiting successor.
2081  // We store the old exiting state and restore it if a viable looping
2082  // successor isn't found.
2083  MachineBasicBlock *OldExitingBB = ExitingBB;
2084  BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq;
2085  bool HasLoopingSucc = false;
2086  for (MachineBasicBlock *Succ : MBB->successors()) {
2087  if (Succ->isEHPad())
2088  continue;
2089  if (Succ == MBB)
2090  continue;
2091  BlockChain &SuccChain = *BlockToChain[Succ];
2092  // Don't split chains, either this chain or the successor's chain.
2093  if (&Chain == &SuccChain) {
2094  LLVM_DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
2095  << getBlockName(Succ) << " (chain conflict)\n");
2096  continue;
2097  }
2098 
2099  auto SuccProb = MBPI->getEdgeProbability(MBB, Succ);
2100  if (LoopBlockSet.count(Succ)) {
2101  LLVM_DEBUG(dbgs() << " looping: " << getBlockName(MBB) << " -> "
2102  << getBlockName(Succ) << " (" << SuccProb << ")\n");
2103  HasLoopingSucc = true;
2104  continue;
2105  }
2106 
2107  unsigned SuccLoopDepth = 0;
2108  if (MachineLoop *ExitLoop = MLI->getLoopFor(Succ)) {
2109  SuccLoopDepth = ExitLoop->getLoopDepth();
2110  if (ExitLoop->contains(&L))
2111  BlocksExitingToOuterLoop.insert(MBB);
2112  }
2113 
2114  BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(MBB) * SuccProb;
2115  LLVM_DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
2116  << getBlockName(Succ) << " [L:" << SuccLoopDepth
2117  << "] (";
2118  MBFI->printBlockFreq(dbgs(), ExitEdgeFreq) << ")\n");
2119  // Note that we bias this toward an existing layout successor to retain
2120  // incoming order in the absence of better information. The exit must have
2121  // a frequency higher than the current exit before we consider breaking
2122  // the layout.
2123  BranchProbability Bias(100 - ExitBlockBias, 100);
2124  if (!ExitingBB || SuccLoopDepth > BestExitLoopDepth ||
2125  ExitEdgeFreq > BestExitEdgeFreq ||
2126  (MBB->isLayoutSuccessor(Succ) &&
2127  !(ExitEdgeFreq < BestExitEdgeFreq * Bias))) {
2128  BestExitEdgeFreq = ExitEdgeFreq;
2129  ExitingBB = MBB;
2130  }
2131  }
2132 
2133  if (!HasLoopingSucc) {
2134  // Restore the old exiting state, no viable looping successor was found.
2135  ExitingBB = OldExitingBB;
2136  BestExitEdgeFreq = OldBestExitEdgeFreq;
2137  }
2138  }
2139  // Without a candidate exiting block or with only a single block in the
2140  // loop, just use the loop header to layout the loop.
2141  if (!ExitingBB) {
2142  LLVM_DEBUG(
2143  dbgs() << " No other candidate exit blocks, using loop header\n");
2144  return nullptr;
2145  }
2146  if (L.getNumBlocks() == 1) {
2147  LLVM_DEBUG(dbgs() << " Loop has 1 block, using loop header as exit\n");
2148  return nullptr;
2149  }
2150 
2151  // Also, if we have exit blocks which lead to outer loops but didn't select
2152  // one of them as the exiting block we are rotating toward, disable loop
2153  // rotation altogether.
2154  if (!BlocksExitingToOuterLoop.empty() &&
2155  !BlocksExitingToOuterLoop.count(ExitingBB))
2156  return nullptr;
2157 
2158  LLVM_DEBUG(dbgs() << " Best exiting block: " << getBlockName(ExitingBB)
2159  << "\n");
2160  ExitFreq = BestExitEdgeFreq;
2161  return ExitingBB;
2162 }
2163 
2164 /// Check if there is a fallthrough to loop header Top.
2165 ///
2166 /// 1. Look for a Pred that can be layout before Top.
2167 /// 2. Check if Top is the most possible successor of Pred.
2168 bool
2169 MachineBlockPlacement::hasViableTopFallthrough(
2170  const MachineBasicBlock *Top,
2171  const BlockFilterSet &LoopBlockSet) {
2172  for (MachineBasicBlock *Pred : Top->predecessors()) {
2173  BlockChain *PredChain = BlockToChain[Pred];
2174  if (!LoopBlockSet.count(Pred) &&
2175  (!PredChain || Pred == *std::prev(PredChain->end()))) {
2176  // Found a Pred block can be placed before Top.
2177  // Check if Top is the best successor of Pred.
2178  auto TopProb = MBPI->getEdgeProbability(Pred, Top);
2179  bool TopOK = true;
2180  for (MachineBasicBlock *Succ : Pred->successors()) {
2181  auto SuccProb = MBPI->getEdgeProbability(Pred, Succ);
2182  BlockChain *SuccChain = BlockToChain[Succ];
2183  // Check if Succ can be placed after Pred.
2184  // Succ should not be in any chain, or it is the head of some chain.
2185  if ((!SuccChain || Succ == *SuccChain->begin()) && SuccProb > TopProb) {
2186  TopOK = false;
2187  break;
2188  }
2189  }
2190  if (TopOK)
2191  return true;
2192  }
2193  }
2194  return false;
2195 }
2196 
2197 /// Attempt to rotate an exiting block to the bottom of the loop.
2198 ///
2199 /// Once we have built a chain, try to rotate it to line up the hot exit block
2200 /// with fallthrough out of the loop if doing so doesn't introduce unnecessary
2201 /// branches. For example, if the loop has fallthrough into its header and out
2202 /// of its bottom already, don't rotate it.
2203 void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
2204  const MachineBasicBlock *ExitingBB,
2205  BlockFrequency ExitFreq,
2206  const BlockFilterSet &LoopBlockSet) {
2207  if (!ExitingBB)
2208  return;
2209 
2210  MachineBasicBlock *Top = *LoopChain.begin();
2211  MachineBasicBlock *Bottom = *std::prev(LoopChain.end());
2212 
2213  // If ExitingBB is already the last one in a chain then nothing to do.
2214  if (Bottom == ExitingBB)
2215  return;
2216 
2217  bool ViableTopFallthrough = hasViableTopFallthrough(Top, LoopBlockSet);
2218 
2219  // If the header has viable fallthrough, check whether the current loop
2220  // bottom is a viable exiting block. If so, bail out as rotating will
2221  // introduce an unnecessary branch.
2222  if (ViableTopFallthrough) {
2223  for (MachineBasicBlock *Succ : Bottom->successors()) {
2224  BlockChain *SuccChain = BlockToChain[Succ];
2225  if (!LoopBlockSet.count(Succ) &&
2226  (!SuccChain || Succ == *SuccChain->begin()))
2227  return;
2228  }
2229 
2230  // Rotate will destroy the top fallthrough, we need to ensure the new exit
2231  // frequency is larger than top fallthrough.
2232  BlockFrequency FallThrough2Top = TopFallThroughFreq(Top, LoopBlockSet);
2233  if (FallThrough2Top >= ExitFreq)
2234  return;
2235  }
2236 
2237  BlockChain::iterator ExitIt = llvm::find(LoopChain, ExitingBB);
2238  if (ExitIt == LoopChain.end())
2239  return;
2240 
2241  // Rotating a loop exit to the bottom when there is a fallthrough to top
2242  // trades the entry fallthrough for an exit fallthrough.
2243  // If there is no bottom->top edge, but the chosen exit block does have
2244  // a fallthrough, we break that fallthrough for nothing in return.
2245 
2246  // Let's consider an example. We have a built chain of basic blocks
2247  // B1, B2, ..., Bn, where Bk is a ExitingBB - chosen exit block.
2248  // By doing a rotation we get
2249  // Bk+1, ..., Bn, B1, ..., Bk
2250  // Break of fallthrough to B1 is compensated by a fallthrough from Bk.
2251  // If we had a fallthrough Bk -> Bk+1 it is broken now.
2252  // It might be compensated by fallthrough Bn -> B1.
2253  // So we have a condition to avoid creation of extra branch by loop rotation.
2254  // All below must be true to avoid loop rotation:
2255  // If there is a fallthrough to top (B1)
2256  // There was fallthrough from chosen exit block (Bk) to next one (Bk+1)
2257  // There is no fallthrough from bottom (Bn) to top (B1).
2258  // Please note that there is no exit fallthrough from Bn because we checked it
2259  // above.
2260  if (ViableTopFallthrough) {
2261  assert(std::next(ExitIt) != LoopChain.end() &&
2262  "Exit should not be last BB");
2263  MachineBasicBlock *NextBlockInChain = *std::next(ExitIt);
2264  if (ExitingBB->isSuccessor(NextBlockInChain))
2265  if (!Bottom->isSuccessor(Top))
2266  return;
2267  }
2268 
2269  LLVM_DEBUG(dbgs() << "Rotating loop to put exit " << getBlockName(ExitingBB)
2270  << " at bottom\n");
2271  std::rotate(LoopChain.begin(), std::next(ExitIt), LoopChain.end());
2272 }
2273 
2274 /// Attempt to rotate a loop based on profile data to reduce branch cost.
2275 ///
2276 /// With profile data, we can determine the cost in terms of missed fall through
2277 /// opportunities when rotating a loop chain and select the best rotation.
2278 /// Basically, there are three kinds of cost to consider for each rotation:
2279 /// 1. The possibly missed fall through edge (if it exists) from BB out of
2280 /// the loop to the loop header.
2281 /// 2. The possibly missed fall through edges (if they exist) from the loop
2282 /// exits to BB out of the loop.
2283 /// 3. The missed fall through edge (if it exists) from the last BB to the
2284 /// first BB in the loop chain.
2285 /// Therefore, the cost for a given rotation is the sum of costs listed above.
2286 /// We select the best rotation with the smallest cost.
2287 void MachineBlockPlacement::rotateLoopWithProfile(
2288  BlockChain &LoopChain, const MachineLoop &L,
2289  const BlockFilterSet &LoopBlockSet) {
2290  auto RotationPos = LoopChain.end();
2291 
2292  BlockFrequency SmallestRotationCost = BlockFrequency::getMaxFrequency();
2293 
2294  // A utility lambda that scales up a block frequency by dividing it by a
2295  // branch probability which is the reciprocal of the scale.
2296  auto ScaleBlockFrequency = [](BlockFrequency Freq,
2297  unsigned Scale) -> BlockFrequency {
2298  if (Scale == 0)
2299  return 0;
2300  // Use operator / between BlockFrequency and BranchProbability to implement
2301  // saturating multiplication.
2302  return Freq / BranchProbability(1, Scale);
2303  };
2304 
2305  // Compute the cost of the missed fall-through edge to the loop header if the
2306  // chain head is not the loop header. As we only consider natural loops with
2307  // single header, this computation can be done only once.
2308  BlockFrequency HeaderFallThroughCost(0);
2309  MachineBasicBlock *ChainHeaderBB = *LoopChain.begin();
2310  for (auto *Pred : ChainHeaderBB->predecessors()) {
2311  BlockChain *PredChain = BlockToChain[Pred];
2312  if (!LoopBlockSet.count(Pred) &&
2313  (!PredChain || Pred == *std::prev(PredChain->end()))) {
2314  auto EdgeFreq = MBFI->getBlockFreq(Pred) *
2315  MBPI->getEdgeProbability(Pred, ChainHeaderBB);
2316  auto FallThruCost = ScaleBlockFrequency(EdgeFreq, MisfetchCost);
2317  // If the predecessor has only an unconditional jump to the header, we
2318  // need to consider the cost of this jump.
2319  if (Pred->succ_size() == 1)
2320  FallThruCost += ScaleBlockFrequency(EdgeFreq, JumpInstCost);
2321  HeaderFallThroughCost = std::max(HeaderFallThroughCost, FallThruCost);
2322  }
2323  }
2324 
2325  // Here we collect all exit blocks in the loop, and for each exit we find out
2326  // its hottest exit edge. For each loop rotation, we define the loop exit cost
2327  // as the sum of frequencies of exit edges we collect here, excluding the exit
2328  // edge from the tail of the loop chain.
2330  for (auto BB : LoopChain) {
2331  auto LargestExitEdgeProb = BranchProbability::getZero();
2332  for (auto *Succ : BB->successors()) {
2333  BlockChain *SuccChain = BlockToChain[Succ];
2334  if (!LoopBlockSet.count(Succ) &&
2335  (!SuccChain || Succ == *SuccChain->begin())) {
2336  auto SuccProb = MBPI->getEdgeProbability(BB, Succ);
2337  LargestExitEdgeProb = std::max(LargestExitEdgeProb, SuccProb);
2338  }
2339  }
2340  if (LargestExitEdgeProb > BranchProbability::getZero()) {
2341  auto ExitFreq = MBFI->getBlockFreq(BB) * LargestExitEdgeProb;
2342  ExitsWithFreq.emplace_back(BB, ExitFreq);
2343  }
2344  }
2345 
2346  // In this loop we iterate every block in the loop chain and calculate the
2347  // cost assuming the block is the head of the loop chain. When the loop ends,
2348  // we should have found the best candidate as the loop chain's head.
2349  for (auto Iter = LoopChain.begin(), TailIter = std::prev(LoopChain.end()),
2350  EndIter = LoopChain.end();
2351  Iter != EndIter; Iter++, TailIter++) {
2352  // TailIter is used to track the tail of the loop chain if the block we are
2353  // checking (pointed by Iter) is the head of the chain.
2354  if (TailIter == LoopChain.end())
2355  TailIter = LoopChain.begin();
2356 
2357  auto TailBB = *TailIter;
2358 
2359  // Calculate the cost by putting this BB to the top.
2360  BlockFrequency Cost = 0;
2361 
2362  // If the current BB is the loop header, we need to take into account the
2363  // cost of the missed fall through edge from outside of the loop to the
2364  // header.
2365  if (Iter != LoopChain.begin())
2366  Cost += HeaderFallThroughCost;
2367 
2368  // Collect the loop exit cost by summing up frequencies of all exit edges
2369  // except the one from the chain tail.
2370  for (auto &ExitWithFreq : ExitsWithFreq)
2371  if (TailBB != ExitWithFreq.first)
2372  Cost += ExitWithFreq.second;
2373 
2374  // The cost of breaking the once fall-through edge from the tail to the top
2375  // of the loop chain. Here we need to consider three cases:
2376  // 1. If the tail node has only one successor, then we will get an
2377  // additional jmp instruction. So the cost here is (MisfetchCost +
2378  // JumpInstCost) * tail node frequency.
2379  // 2. If the tail node has two successors, then we may still get an
2380  // additional jmp instruction if the layout successor after the loop
2381  // chain is not its CFG successor. Note that the more frequently executed
2382  // jmp instruction will be put ahead of the other one. Assume the
2383  // frequency of those two branches are x and y, where x is the frequency
2384  // of the edge to the chain head, then the cost will be
2385  // (x * MisfetechCost + min(x, y) * JumpInstCost) * tail node frequency.
2386  // 3. If the tail node has more than two successors (this rarely happens),
2387  // we won't consider any additional cost.
2388  if (TailBB->isSuccessor(*Iter)) {
2389  auto TailBBFreq = MBFI->getBlockFreq(TailBB);
2390  if (TailBB->succ_size() == 1)
2391  Cost += ScaleBlockFrequency(TailBBFreq.getFrequency(),
2393  else if (TailBB->succ_size() == 2) {
2394  auto TailToHeadProb = MBPI->getEdgeProbability(TailBB, *Iter);
2395  auto TailToHeadFreq = TailBBFreq * TailToHeadProb;
2396  auto ColderEdgeFreq = TailToHeadProb > BranchProbability(1, 2)
2397  ? TailBBFreq * TailToHeadProb.getCompl()
2398  : TailToHeadFreq;
2399  Cost += ScaleBlockFrequency(TailToHeadFreq, MisfetchCost) +
2400  ScaleBlockFrequency(ColderEdgeFreq, JumpInstCost);
2401  }
2402  }
2403 
2404  LLVM_DEBUG(dbgs() << "The cost of loop rotation by making "
2405  << getBlockName(*Iter)
2406  << " to the top: " << Cost.getFrequency() << "\n");
2407 
2408  if (Cost < SmallestRotationCost) {
2409  SmallestRotationCost = Cost;
2410  RotationPos = Iter;
2411  }
2412  }
2413 
2414  if (RotationPos != LoopChain.end()) {
2415  LLVM_DEBUG(dbgs() << "Rotate loop by making " << getBlockName(*RotationPos)
2416  << " to the top\n");
2417  std::rotate(LoopChain.begin(), RotationPos, LoopChain.end());
2418  }
2419 }
2420 
2421 /// Collect blocks in the given loop that are to be placed.
2422 ///
2423 /// When profile data is available, exclude cold blocks from the returned set;
2424 /// otherwise, collect all blocks in the loop.
2426 MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) {
2427  BlockFilterSet LoopBlockSet;
2428 
2429  // Filter cold blocks off from LoopBlockSet when profile data is available.
2430  // Collect the sum of frequencies of incoming edges to the loop header from
2431  // outside. If we treat the loop as a super block, this is the frequency of
2432  // the loop. Then for each block in the loop, we calculate the ratio between
2433  // its frequency and the frequency of the loop block. When it is too small,
2434  // don't add it to the loop chain. If there are outer loops, then this block
2435  // will be merged into the first outer loop chain for which this block is not
2436  // cold anymore. This needs precise profile data and we only do this when
2437  // profile data is available.
2439  BlockFrequency LoopFreq(0);
2440  for (auto LoopPred : L.getHeader()->predecessors())
2441  if (!L.contains(LoopPred))
2442  LoopFreq += MBFI->getBlockFreq(LoopPred) *
2443  MBPI->getEdgeProbability(LoopPred, L.getHeader());
2444 
2445  for (MachineBasicBlock *LoopBB : L.getBlocks()) {
2446  auto Freq = MBFI->getBlockFreq(LoopBB).getFrequency();
2447  if (Freq == 0 || LoopFreq.getFrequency() / Freq > LoopToColdBlockRatio)
2448  continue;
2449  LoopBlockSet.insert(LoopBB);
2450  }
2451  } else
2452  LoopBlockSet.insert(L.block_begin(), L.block_end());
2453 
2454  return LoopBlockSet;
2455 }
2456 
2457 /// Forms basic block chains from the natural loop structures.
2458 ///
2459 /// These chains are designed to preserve the existing *structure* of the code
2460 /// as much as possible. We can then stitch the chains together in a way which
2461 /// both preserves the topological structure and minimizes taken conditional
2462 /// branches.
2463 void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {
2464  // First recurse through any nested loops, building chains for those inner
2465  // loops.
2466  for (const MachineLoop *InnerLoop : L)
2467  buildLoopChains(*InnerLoop);
2468 
2469  assert(BlockWorkList.empty() &&
2470  "BlockWorkList not empty when starting to build loop chains.");
2471  assert(EHPadWorkList.empty() &&
2472  "EHPadWorkList not empty when starting to build loop chains.");
2473  BlockFilterSet LoopBlockSet = collectLoopBlockSet(L);
2474 
2475  // Check if we have profile data for this function. If yes, we will rotate
2476  // this loop by modeling costs more precisely which requires the profile data
2477  // for better layout.
2478  bool RotateLoopWithProfile =
2481 
2482  // First check to see if there is an obviously preferable top block for the
2483  // loop. This will default to the header, but may end up as one of the
2484  // predecessors to the header if there is one which will result in strictly
2485  // fewer branches in the loop body.
2486  MachineBasicBlock *LoopTop = findBestLoopTop(L, LoopBlockSet);
2487 
2488  // If we selected just the header for the loop top, look for a potentially
2489  // profitable exit block in the event that rotating the loop can eliminate
2490  // branches by placing an exit edge at the bottom.
2491  //
2492  // Loops are processed innermost to uttermost, make sure we clear
2493  // PreferredLoopExit before processing a new loop.
2494  PreferredLoopExit = nullptr;
2495  BlockFrequency ExitFreq;
2496  if (!RotateLoopWithProfile && LoopTop == L.getHeader())
2497  PreferredLoopExit = findBestLoopExit(L, LoopBlockSet, ExitFreq);
2498 
2499  BlockChain &LoopChain = *BlockToChain[LoopTop];
2500 
2501  // FIXME: This is a really lame way of walking the chains in the loop: we
2502  // walk the blocks, and use a set to prevent visiting a particular chain
2503  // twice.
2504  SmallPtrSet<BlockChain *, 4> UpdatedPreds;
2505  assert(LoopChain.UnscheduledPredecessors == 0 &&
2506  "LoopChain should not have unscheduled predecessors.");
2507  UpdatedPreds.insert(&LoopChain);
2508 
2509  for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2510  fillWorkLists(LoopBB, UpdatedPreds, &LoopBlockSet);
2511 
2512  buildChain(LoopTop, LoopChain, &LoopBlockSet);
2513 
2514  if (RotateLoopWithProfile)
2515  rotateLoopWithProfile(LoopChain, L, LoopBlockSet);
2516  else
2517  rotateLoop(LoopChain, PreferredLoopExit, ExitFreq, LoopBlockSet);
2518 
2519  LLVM_DEBUG({
2520  // Crash at the end so we get all of the debugging output first.
2521  bool BadLoop = false;
2522  if (LoopChain.UnscheduledPredecessors) {
2523  BadLoop = true;
2524  dbgs() << "Loop chain contains a block without its preds placed!\n"
2525  << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2526  << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n";
2527  }
2528  for (MachineBasicBlock *ChainBB : LoopChain) {
2529  dbgs() << " ... " << getBlockName(ChainBB) << "\n";
2530  if (!LoopBlockSet.remove(ChainBB)) {
2531  // We don't mark the loop as bad here because there are real situations
2532  // where this can occur. For example, with an unanalyzable fallthrough
2533  // from a loop block to a non-loop block or vice versa.
2534  dbgs() << "Loop chain contains a block not contained by the loop!\n"
2535  << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2536  << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2537  << " Bad block: " << getBlockName(ChainBB) << "\n";
2538  }
2539  }
2540 
2541  if (!LoopBlockSet.empty()) {
2542  BadLoop = true;
2543  for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2544  dbgs() << "Loop contains blocks never placed into a chain!\n"
2545  << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2546  << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2547  << " Bad block: " << getBlockName(LoopBB) << "\n";
2548  }
2549  assert(!BadLoop && "Detected problems with the placement of this loop.");
2550  });
2551 
2552  BlockWorkList.clear();
2553  EHPadWorkList.clear();
2554 }
2555 
2556 void MachineBlockPlacement::buildCFGChains() {
2557  // Ensure that every BB in the function has an associated chain to simplify
2558  // the assumptions of the remaining algorithm.
2559  SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch.
2560  for (MachineFunction::iterator FI = F->begin(), FE = F->end(); FI != FE;
2561  ++FI) {
2562  MachineBasicBlock *BB = &*FI;
2563  BlockChain *Chain =
2564  new (ChainAllocator.Allocate()) BlockChain(BlockToChain, BB);
2565  // Also, merge any blocks which we cannot reason about and must preserve
2566  // the exact fallthrough behavior for.
2567  while (true) {
2568  Cond.clear();
2569  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
2570  if (!TII->analyzeBranch(*BB, TBB, FBB, Cond) || !FI->canFallThrough())
2571  break;
2572 
2573  MachineFunction::iterator NextFI = std::next(FI);
2574  MachineBasicBlock *NextBB = &*NextFI;
2575  // Ensure that the layout successor is a viable block, as we know that
2576  // fallthrough is a possibility.
2577  assert(NextFI != FE && "Can't fallthrough past the last block.");
2578  LLVM_DEBUG(dbgs() << "Pre-merging due to unanalyzable fallthrough: "
2579  << getBlockName(BB) << " -> " << getBlockName(NextBB)
2580  << "\n");
2581  Chain->merge(NextBB, nullptr);
2582 #ifndef NDEBUG
2583  BlocksWithUnanalyzableExits.insert(&*BB);
2584 #endif
2585  FI = NextFI;
2586  BB = NextBB;
2587  }
2588  }
2589 
2590  // Build any loop-based chains.
2591  PreferredLoopExit = nullptr;
2592  for (MachineLoop *L : *MLI)
2593  buildLoopChains(*L);
2594 
2595  assert(BlockWorkList.empty() &&
2596  "BlockWorkList should be empty before building final chain.");
2597  assert(EHPadWorkList.empty() &&
2598  "EHPadWorkList should be empty before building final chain.");
2599 
2600  SmallPtrSet<BlockChain *, 4> UpdatedPreds;
2601  for (MachineBasicBlock &MBB : *F)
2602  fillWorkLists(&MBB, UpdatedPreds);
2603 
2604  BlockChain &FunctionChain = *BlockToChain[&F->front()];
2605  buildChain(&F->front(), FunctionChain);
2606 
2607 #ifndef NDEBUG
2608  using FunctionBlockSetType = SmallPtrSet<MachineBasicBlock *, 16>;
2609 #endif
2610  LLVM_DEBUG({
2611  // Crash at the end so we get all of the debugging output first.
2612  bool BadFunc = false;
2613  FunctionBlockSetType FunctionBlockSet;
2614  for (MachineBasicBlock &MBB : *F)
2615  FunctionBlockSet.insert(&MBB);
2616 
2617  for (MachineBasicBlock *ChainBB : FunctionChain)
2618  if (!FunctionBlockSet.erase(ChainBB)) {
2619  BadFunc = true;
2620  dbgs() << "Function chain contains a block not in the function!\n"
2621  << " Bad block: " << getBlockName(ChainBB) << "\n";
2622  }
2623 
2624  if (!FunctionBlockSet.empty()) {
2625  BadFunc = true;
2626  for (MachineBasicBlock *RemainingBB : FunctionBlockSet)
2627  dbgs() << "Function contains blocks never placed into a chain!\n"
2628  << " Bad block: " << getBlockName(RemainingBB) << "\n";
2629  }
2630  assert(!BadFunc && "Detected problems with the block placement.");
2631  });
2632 
2633  // Splice the blocks into place.
2634  MachineFunction::iterator InsertPos = F->begin();
2635  LLVM_DEBUG(dbgs() << "[MBP] Function: " << F->getName() << "\n");
2636  for (MachineBasicBlock *ChainBB : FunctionChain) {
2637  LLVM_DEBUG(dbgs() << (ChainBB == *FunctionChain.begin() ? "Placing chain "
2638  : " ... ")
2639  << getBlockName(ChainBB) << "\n");
2640  if (InsertPos != MachineFunction::iterator(ChainBB))
2641  F->splice(InsertPos, ChainBB);
2642  else
2643  ++InsertPos;
2644 
2645  // Update the terminator of the previous block.
2646  if (ChainBB == *FunctionChain.begin())
2647  continue;
2648  MachineBasicBlock *PrevBB = &*std::prev(MachineFunction::iterator(ChainBB));
2649 
2650  // FIXME: It would be awesome of updateTerminator would just return rather
2651  // than assert when the branch cannot be analyzed in order to remove this
2652  // boiler plate.
2653  Cond.clear();
2654  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
2655 
2656 #ifndef NDEBUG
2657  if (!BlocksWithUnanalyzableExits.count(PrevBB)) {
2658  // Given the exact block placement we chose, we may actually not _need_ to
2659  // be able to edit PrevBB's terminator sequence, but not being _able_ to
2660  // do that at this point is a bug.
2661  assert((!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond) ||
2662  !PrevBB->canFallThrough()) &&
2663  "Unexpected block with un-analyzable fallthrough!");
2664  Cond.clear();
2665  TBB = FBB = nullptr;
2666  }
2667 #endif
2668 
2669  // The "PrevBB" is not yet updated to reflect current code layout, so,
2670  // o. it may fall-through to a block without explicit "goto" instruction
2671  // before layout, and no longer fall-through it after layout; or
2672  // o. just opposite.
2673  //
2674  // analyzeBranch() may return erroneous value for FBB when these two
2675  // situations take place. For the first scenario FBB is mistakenly set NULL;
2676  // for the 2nd scenario, the FBB, which is expected to be NULL, is
2677  // mistakenly pointing to "*BI".
2678  // Thus, if the future change needs to use FBB before the layout is set, it
2679  // has to correct FBB first by using the code similar to the following:
2680  //
2681  // if (!Cond.empty() && (!FBB || FBB == ChainBB)) {
2682  // PrevBB->updateTerminator();
2683  // Cond.clear();
2684  // TBB = FBB = nullptr;
2685  // if (TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) {
2686  // // FIXME: This should never take place.
2687  // TBB = FBB = nullptr;
2688  // }
2689  // }
2690  if (!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond))
2691  PrevBB->updateTerminator();
2692  }
2693 
2694  // Fixup the last block.
2695  Cond.clear();
2696  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
2697  if (!TII->analyzeBranch(F->back(), TBB, FBB, Cond))
2698  F->back().updateTerminator();
2699 
2700  BlockWorkList.clear();
2701  EHPadWorkList.clear();
2702 }
2703 
2704 void MachineBlockPlacement::optimizeBranches() {
2705  BlockChain &FunctionChain = *BlockToChain[&F->front()];
2706  SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch.
2707 
2708  // Now that all the basic blocks in the chain have the proper layout,
2709  // make a final call to AnalyzeBranch with AllowModify set.
2710  // Indeed, the target may be able to optimize the branches in a way we
2711  // cannot because all branches may not be analyzable.
2712  // E.g., the target may be able to remove an unconditional branch to
2713  // a fallthrough when it occurs after predicated terminators.
2714  for (MachineBasicBlock *ChainBB : FunctionChain) {
2715  Cond.clear();
2716  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
2717  if (!TII->analyzeBranch(*ChainBB, TBB, FBB, Cond, /*AllowModify*/ true)) {
2718  // If PrevBB has a two-way branch, try to re-order the branches
2719  // such that we branch to the successor with higher probability first.
2720  if (TBB && !Cond.empty() && FBB &&
2721  MBPI->getEdgeProbability(ChainBB, FBB) >
2722  MBPI->getEdgeProbability(ChainBB, TBB) &&
2723  !TII->reverseBranchCondition(Cond)) {
2724  LLVM_DEBUG(dbgs() << "Reverse order of the two branches: "
2725  << getBlockName(ChainBB) << "\n");
2726  LLVM_DEBUG(dbgs() << " Edge probability: "
2727  << MBPI->getEdgeProbability(ChainBB, FBB) << " vs "
2728  << MBPI->getEdgeProbability(ChainBB, TBB) << "\n");
2729  DebugLoc dl; // FIXME: this is nowhere
2730  TII->removeBranch(*ChainBB);
2731  TII->insertBranch(*ChainBB, FBB, TBB, Cond, dl);
2732  ChainBB->updateTerminator();
2733  }
2734  }
2735  }
2736 }
2737 
2738 void MachineBlockPlacement::alignBlocks() {
2739  // Walk through the backedges of the function now that we have fully laid out
2740  // the basic blocks and align the destination of each backedge. We don't rely
2741  // exclusively on the loop info here so that we can align backedges in
2742  // unnatural CFGs and backedges that were introduced purely because of the
2743  // loop rotations done during this layout pass.
2744  if (F->getFunction().hasMinSize() ||
2745  (F->getFunction().hasOptSize() && !TLI->alignLoopsWithOptSize()))
2746  return;
2747  BlockChain &FunctionChain = *BlockToChain[&F->front()];
2748  if (FunctionChain.begin() == FunctionChain.end())
2749  return; // Empty chain.
2750 
2751  const BranchProbability ColdProb(1, 5); // 20%
2752  BlockFrequency EntryFreq = MBFI->getBlockFreq(&F->front());
2753  BlockFrequency WeightedEntryFreq = EntryFreq * ColdProb;
2754  for (MachineBasicBlock *ChainBB : FunctionChain) {
2755  if (ChainBB == *FunctionChain.begin())
2756  continue;
2757 
2758  // Don't align non-looping basic blocks. These are unlikely to execute
2759  // enough times to matter in practice. Note that we'll still handle
2760  // unnatural CFGs inside of a natural outer loop (the common case) and
2761  // rotated loops.
2762  MachineLoop *L = MLI->getLoopFor(ChainBB);
2763  if (!L)
2764  continue;
2765 
2766  unsigned Align = TLI->getPrefLoopAlignment(L);
2767  if (!Align)
2768  continue; // Don't care about loop alignment.
2769 
2770  // If the block is cold relative to the function entry don't waste space
2771  // aligning it.
2772  BlockFrequency Freq = MBFI->getBlockFreq(ChainBB);
2773  if (Freq < WeightedEntryFreq)
2774  continue;
2775 
2776  // If the block is cold relative to its loop header, don't align it
2777  // regardless of what edges into the block exist.
2778  MachineBasicBlock *LoopHeader = L->getHeader();
2779  BlockFrequency LoopHeaderFreq = MBFI->getBlockFreq(LoopHeader);
2780  if (Freq < (LoopHeaderFreq * ColdProb))
2781  continue;
2782 
2783  // Check for the existence of a non-layout predecessor which would benefit
2784  // from aligning this block.
2785  MachineBasicBlock *LayoutPred =
2786  &*std::prev(MachineFunction::iterator(ChainBB));
2787 
2788  // Force alignment if all the predecessors are jumps. We already checked
2789  // that the block isn't cold above.
2790  if (!LayoutPred->isSuccessor(ChainBB)) {
2791  ChainBB->setAlignment(Align);
2792  continue;
2793  }
2794 
2795  // Align this block if the layout predecessor's edge into this block is
2796  // cold relative to the block. When this is true, other predecessors make up
2797  // all of the hot entries into the block and thus alignment is likely to be
2798  // important.
2799  BranchProbability LayoutProb =
2800  MBPI->getEdgeProbability(LayoutPred, ChainBB);
2801  BlockFrequency LayoutEdgeFreq = MBFI->getBlockFreq(LayoutPred) * LayoutProb;
2802  if (LayoutEdgeFreq <= (Freq * ColdProb))
2803  ChainBB->setAlignment(Align);
2804  }
2805 }
2806 
2807 /// Tail duplicate \p BB into (some) predecessors if profitable, repeating if
2808 /// it was duplicated into its chain predecessor and removed.
2809 /// \p BB - Basic block that may be duplicated.
2810 ///
2811 /// \p LPred - Chosen layout predecessor of \p BB.
2812 /// Updated to be the chain end if LPred is removed.
2813 /// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
2814 /// \p BlockFilter - Set of blocks that belong to the loop being laid out.
2815 /// Used to identify which blocks to update predecessor
2816 /// counts.
2817 /// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
2818 /// chosen in the given order due to unnatural CFG
2819 /// only needed if \p BB is removed and
2820 /// \p PrevUnplacedBlockIt pointed to \p BB.
2821 /// @return true if \p BB was removed.
2822 bool MachineBlockPlacement::repeatedlyTailDuplicateBlock(
2823  MachineBasicBlock *BB, MachineBasicBlock *&LPred,
2824  const MachineBasicBlock *LoopHeaderBB,
2825  BlockChain &Chain, BlockFilterSet *BlockFilter,
2826  MachineFunction::iterator &PrevUnplacedBlockIt) {
2827  bool Removed, DuplicatedToLPred;
2828  bool DuplicatedToOriginalLPred;
2829  Removed = maybeTailDuplicateBlock(BB, LPred, Chain, BlockFilter,
2830  PrevUnplacedBlockIt,
2831  DuplicatedToLPred);
2832  if (!Removed)
2833  return false;
2834  DuplicatedToOriginalLPred = DuplicatedToLPred;
2835  // Iteratively try to duplicate again. It can happen that a block that is
2836  // duplicated into is still small enough to be duplicated again.
2837  // No need to call markBlockSuccessors in this case, as the blocks being
2838  // duplicated from here on are already scheduled.
2839  // Note that DuplicatedToLPred always implies Removed.
2840  while (DuplicatedToLPred) {
2841  assert(Removed && "Block must have been removed to be duplicated into its "
2842  "layout predecessor.");
2843  MachineBasicBlock *DupBB, *DupPred;
2844  // The removal callback causes Chain.end() to be updated when a block is
2845  // removed. On the first pass through the loop, the chain end should be the
2846  // same as it was on function entry. On subsequent passes, because we are
2847  // duplicating the block at the end of the chain, if it is removed the
2848  // chain will have shrunk by one block.
2849  BlockChain::iterator ChainEnd = Chain.end();
2850  DupBB = *(--ChainEnd);
2851  // Now try to duplicate again.
2852  if (ChainEnd == Chain.begin())
2853  break;
2854  DupPred = *std::prev(ChainEnd);
2855  Removed = maybeTailDuplicateBlock(DupBB, DupPred, Chain, BlockFilter,
2856  PrevUnplacedBlockIt,
2857  DuplicatedToLPred);
2858  }
2859  // If BB was duplicated into LPred, it is now scheduled. But because it was
2860  // removed, markChainSuccessors won't be called for its chain. Instead we
2861  // call markBlockSuccessors for LPred to achieve the same effect. This must go
2862  // at the end because repeating the tail duplication can increase the number
2863  // of unscheduled predecessors.
2864  LPred = *std::prev(Chain.end());
2865  if (DuplicatedToOriginalLPred)
2866  markBlockSuccessors(Chain, LPred, LoopHeaderBB, BlockFilter);
2867  return true;
2868 }
2869 
2870 /// Tail duplicate \p BB into (some) predecessors if profitable.
2871 /// \p BB - Basic block that may be duplicated
2872 /// \p LPred - Chosen layout predecessor of \p BB
2873 /// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
2874 /// \p BlockFilter - Set of blocks that belong to the loop being laid out.
2875 /// Used to identify which blocks to update predecessor
2876 /// counts.
2877 /// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
2878 /// chosen in the given order due to unnatural CFG
2879 /// only needed if \p BB is removed and
2880 /// \p PrevUnplacedBlockIt pointed to \p BB.
2881 /// \p DuplicatedToLPred - True if the block was duplicated into LPred. Will
2882 /// only be true if the block was removed.
2883 /// \return - True if the block was duplicated into all preds and removed.
2884 bool MachineBlockPlacement::maybeTailDuplicateBlock(
2886  BlockChain &Chain, BlockFilterSet *BlockFilter,
2887  MachineFunction::iterator &PrevUnplacedBlockIt,
2888  bool &DuplicatedToLPred) {
2889  DuplicatedToLPred = false;
2890  if (!shouldTailDuplicate(BB))
2891  return false;
2892 
2893  LLVM_DEBUG(dbgs() << "Redoing tail duplication for Succ#" << BB->getNumber()
2894  << "\n");
2895 
2896  // This has to be a callback because none of it can be done after
2897  // BB is deleted.
2898  bool Removed = false;
2899  auto RemovalCallback =
2900  [&](MachineBasicBlock *RemBB) {
2901  // Signal to outer function
2902  Removed = true;
2903 
2904  // Conservative default.
2905  bool InWorkList = true;
2906  // Remove from the Chain and Chain Map
2907  if (BlockToChain.count(RemBB)) {
2908  BlockChain *Chain = BlockToChain[RemBB];
2909  InWorkList = Chain->UnscheduledPredecessors == 0;
2910  Chain->remove(RemBB);
2911  BlockToChain.erase(RemBB);
2912  }
2913 
2914  // Handle the unplaced block iterator
2915  if (&(*PrevUnplacedBlockIt) == RemBB) {
2916  PrevUnplacedBlockIt++;
2917  }
2918 
2919  // Handle the Work Lists
2920  if (InWorkList) {
2921  SmallVectorImpl<MachineBasicBlock *> &RemoveList = BlockWorkList;
2922  if (RemBB->isEHPad())
2923  RemoveList = EHPadWorkList;
2924  RemoveList.erase(
2925  llvm::remove_if(RemoveList,
2926  [RemBB](MachineBasicBlock *BB) {
2927  return BB == RemBB;
2928  }),
2929  RemoveList.end());
2930  }
2931 
2932  // Handle the filter set
2933  if (BlockFilter) {
2934  BlockFilter->remove(RemBB);
2935  }
2936 
2937  // Remove the block from loop info.
2938  MLI->removeBlock(RemBB);
2939  if (RemBB == PreferredLoopExit)
2940  PreferredLoopExit = nullptr;
2941 
2942  LLVM_DEBUG(dbgs() << "TailDuplicator deleted block: "
2943  << getBlockName(RemBB) << "\n");
2944  };
2945  auto RemovalCallbackRef =
2946  function_ref<void(MachineBasicBlock*)>(RemovalCallback);
2947 
2948  SmallVector<MachineBasicBlock *, 8> DuplicatedPreds;
2949  bool IsSimple = TailDup.isSimpleBB(BB);
2950  TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred,
2951  &DuplicatedPreds, &RemovalCallbackRef);
2952 
2953  // Update UnscheduledPredecessors to reflect tail-duplication.
2954  DuplicatedToLPred = false;
2955  for (MachineBasicBlock *Pred : DuplicatedPreds) {
2956  // We're only looking for unscheduled predecessors that match the filter.
2957  BlockChain* PredChain = BlockToChain[Pred];
2958  if (Pred == LPred)
2959  DuplicatedToLPred = true;
2960  if (Pred == LPred || (BlockFilter && !BlockFilter->count(Pred))
2961  || PredChain == &Chain)
2962  continue;
2963  for (MachineBasicBlock *NewSucc : Pred->successors()) {
2964  if (BlockFilter && !BlockFilter->count(NewSucc))
2965  continue;
2966  BlockChain *NewChain = BlockToChain[NewSucc];
2967  if (NewChain != &Chain && NewChain != PredChain)
2968  NewChain->UnscheduledPredecessors++;
2969  }
2970  }
2971  return Removed;
2972 }
2973 
2974 bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
2975  if (skipFunction(MF.getFunction()))
2976  return false;
2977 
2978  // Check for single-block functions and skip them.
2979  if (std::next(MF.begin()) == MF.end())
2980  return false;
2981 
2982  F = &MF;
2983  MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
2984  MBFI = llvm::make_unique<BranchFolder::MBFIWrapper>(
2985  getAnalysis<MachineBlockFrequencyInfo>());
2986  MLI = &getAnalysis<MachineLoopInfo>();
2987  TII = MF.getSubtarget().getInstrInfo();
2988  TLI = MF.getSubtarget().getTargetLowering();
2989  MPDT = nullptr;
2990 
2991  // Initialize PreferredLoopExit to nullptr here since it may never be set if
2992  // there are no MachineLoops.
2993  PreferredLoopExit = nullptr;
2994 
2995  assert(BlockToChain.empty() &&
2996  "BlockToChain map should be empty before starting placement.");
2997  assert(ComputedEdges.empty() &&
2998  "Computed Edge map should be empty before starting placement.");
2999 
3000  unsigned TailDupSize = TailDupPlacementThreshold;
3001  // If only the aggressive threshold is explicitly set, use it.
3002  if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 &&
3003  TailDupPlacementThreshold.getNumOccurrences() == 0)
3005 
3006  TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
3007  // For aggressive optimization, we can adjust some thresholds to be less
3008  // conservative.
3009  if (PassConfig->getOptLevel() >= CodeGenOpt::Aggressive) {
3010  // At O3 we should be more willing to copy blocks for tail duplication. This
3011  // increases size pressure, so we only do it at O3
3012  // Do this unless only the regular threshold is explicitly set.
3013  if (TailDupPlacementThreshold.getNumOccurrences() == 0 ||
3014  TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0)
3016  }
3017 
3018  if (allowTailDupPlacement()) {
3019  MPDT = &getAnalysis<MachinePostDominatorTree>();
3020  if (MF.getFunction().hasOptSize())
3021  TailDupSize = 1;
3022  bool PreRegAlloc = false;
3023  TailDup.initMF(MF, PreRegAlloc, MBPI, /* LayoutMode */ true, TailDupSize);
3024  precomputeTriangleChains();
3025  }
3026 
3027  buildCFGChains();
3028 
3029  // Changing the layout can create new tail merging opportunities.
3030  // TailMerge can create jump into if branches that make CFG irreducible for
3031  // HW that requires structured CFG.
3032  bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
3033  PassConfig->getEnableTailMerge() &&
3035  // No tail merging opportunities if the block number is less than four.
3036  if (MF.size() > 3 && EnableTailMerge) {
3037  unsigned TailMergeSize = TailDupSize + 1;
3038  BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI,
3039  *MBPI, TailMergeSize);
3040 
3041  if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(),
3042  getAnalysisIfAvailable<MachineModuleInfo>(), MLI,
3043  /*AfterPlacement=*/true)) {
3044  // Redo the layout if tail merging creates/removes/moves blocks.
3045  BlockToChain.clear();
3046  ComputedEdges.clear();
3047  // Must redo the post-dominator tree if blocks were changed.
3048  if (MPDT)
3049  MPDT->runOnMachineFunction(MF);
3050  ChainAllocator.DestroyAll();
3051  buildCFGChains();
3052  }
3053  }
3054 
3055  optimizeBranches();
3056  alignBlocks();
3057 
3058  BlockToChain.clear();
3059  ComputedEdges.clear();
3060  ChainAllocator.DestroyAll();
3061 
3062  if (AlignAllBlock)
3063  // Align all of the blocks in the function to a specific alignment.
3064  for (MachineBasicBlock &MBB : MF)
3066  else if (AlignAllNonFallThruBlocks) {
3067  // Align all of the blocks that have no fall-through predecessors to a
3068  // specific alignment.
3069  for (auto MBI = std::next(MF.begin()), MBE = MF.end(); MBI != MBE; ++MBI) {
3070  auto LayoutPred = std::prev(MBI);
3071  if (!LayoutPred->isSuccessor(&*MBI))
3072  MBI->setAlignment(AlignAllNonFallThruBlocks);
3073  }
3074  }
3075  if (ViewBlockLayoutWithBFI != GVDT_None &&
3076  (ViewBlockFreqFuncName.empty() ||
3077  F->getFunction().getName().equals(ViewBlockFreqFuncName))) {
3078  MBFI->view("MBP." + MF.getName(), false);
3079  }
3080 
3081 
3082  // We always return true as we have no way to track whether the final order
3083  // differs from the original order.
3084  return true;
3085 }
3086 
3087 namespace {
3088 
3089 /// A pass to compute block placement statistics.
3090 ///
3091 /// A separate pass to compute interesting statistics for evaluating block
3092 /// placement. This is separate from the actual placement pass so that they can
3093 /// be computed in the absence of any placement transformations or when using
3094 /// alternative placement strategies.
3095 class MachineBlockPlacementStats : public MachineFunctionPass {
3096  /// A handle to the branch probability pass.
3097  const MachineBranchProbabilityInfo *MBPI;
3098 
3099  /// A handle to the function-wide block frequency pass.
3100  const MachineBlockFrequencyInfo *MBFI;
3101 
3102 public:
3103  static char ID; // Pass identification, replacement for typeid
3104 
3105  MachineBlockPlacementStats() : MachineFunctionPass(ID) {
3107  }
3108 
3109  bool runOnMachineFunction(MachineFunction &F) override;
3110 
3111  void getAnalysisUsage(AnalysisUsage &AU) const override {
3114  AU.setPreservesAll();
3116  }
3117 };
3118 
3119 } // end anonymous namespace
3120 
3122 
3124 
3125 INITIALIZE_PASS_BEGIN(MachineBlockPlacementStats, "block-placement-stats",
3126  "Basic Block Placement Stats", false, false)
3129 INITIALIZE_PASS_END(MachineBlockPlacementStats, "block-placement-stats",
3130  "Basic Block Placement Stats", false, false)
3131 
3132 bool MachineBlockPlacementStats::runOnMachineFunction(MachineFunction &F) {
3133  // Check for single-block functions and skip them.
3134  if (std::next(F.begin()) == F.end())
3135  return false;
3136 
3137  MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
3138  MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
3139 
3140  for (MachineBasicBlock &MBB : F) {
3141  BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
3142  Statistic &NumBranches =
3143  (MBB.succ_size() > 1) ? NumCondBranches : NumUncondBranches;
3144  Statistic &BranchTakenFreq =
3145  (MBB.succ_size() > 1) ? CondBranchTakenFreq : UncondBranchTakenFreq;
3146  for (MachineBasicBlock *Succ : MBB.successors()) {
3147  // Skip if this successor is a fallthrough.
3148  if (MBB.isLayoutSuccessor(Succ))
3149  continue;
3150 
3151  BlockFrequency EdgeFreq =
3152  BlockFreq * MBPI->getEdgeProbability(&MBB, Succ);
3153  ++NumBranches;
3154  BranchTakenFreq += EdgeFreq.getFrequency();
3155  }
3156  }
3157 
3158  return false;
3159 }
bool shouldTailDuplicate(bool IsSimple, MachineBasicBlock &TailBB)
Determine if it is profitable to duplicate this block.
static cl::opt< unsigned > TailDupPlacementThreshold("tail-dup-placement-threshold", cl::desc("Instruction cutoff for tail duplication during layout. " "Tail merging during layout is forced to have a threshold " "that won't conflict."), cl::init(2), cl::Hidden)
const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:233
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:641
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2)
BranchProbability getCompl() const
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:224
typename SuperClass::const_iterator const_iterator
Definition: SmallVector.h:320
This class represents lattice values for constants.
Definition: AllocatorList.h:23
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds...
Definition: Compiler.h:473
static cl::opt< unsigned > MisfetchCost("misfetch-cost", cl::desc("Cost that models the probabilistic risk of an instruction " "misfetch due to a jump comparing to falling through, whose cost " "is zero."), cl::init(1), cl::Hidden)
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:622
static BranchProbability getAdjustedProbability(BranchProbability OrigProb, BranchProbability AdjustedSumProb)
The helper function returns the branch probability that is adjusted or normalized over the new total ...
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
unsigned size() const
bool requiresStructuredCFG() const
virtual const TargetLowering * getTargetLowering() const
void initializeMachineBlockPlacementStatsPass(PassRegistry &)
virtual unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const
Insert branch code into the end of the specified MachineBasicBlock.
An efficient, type-erasing, non-owning reference to a callable.
Definition: STLExtras.h:116
uint64_t getFrequency() const
Returns the frequency as a fixpoint number scaled by the entry frequency.
MachineBlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate machine basic b...
STATISTIC(NumFunctions, "Total number of functions")
A debug info location.
Definition: DebugLoc.h:33
F(f)
static BranchProbability getLayoutSuccessorProbThreshold(const MachineBasicBlock *BB)
static BranchProbability getOne()
virtual unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const
Remove the branching code at the end of the specific MBB.
char & MachineBlockPlacementStatsID
MachineBlockPlacementStats - This pass collects statistics about the basic block placement using bran...
void setAlignment(unsigned Align)
Set alignment of the basic block.
CodeGenOpt::Level getOptLevel() const
This file defines the MallocAllocator and BumpPtrAllocator interfaces.
iterator_range< succ_iterator > successors()
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:343
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:221
AnalysisUsage & addRequired()
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:50
void append(SmallVectorImpl< char > &path, const Twine &a, const Twine &b="", const Twine &c="", const Twine &d="")
Append to path.
Definition: Path.cpp:455
static cl::opt< bool > TailDupPlacement("tail-dup-placement", cl::desc("Perform tail duplication during placement. " "Creates more fallthrough opportunites in " "outline branches."), cl::init(true), cl::Hidden)
bool OptimizeFunction(MachineFunction &MF, const TargetInstrInfo *tii, const TargetRegisterInfo *tri, MachineModuleInfo *mmi, MachineLoopInfo *mli=nullptr, bool AfterPlacement=false)
Perhaps branch folding, tail merging and other CFG optimizations on the given function.
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
const HexagonInstrInfo * TII
Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: APFloat.h:41
void removeBlock(MachineBasicBlock *BB)
This method completely removes BB from all data structures, including all of the Loop objects it is n...
block placement Basic Block Placement Stats
Target-Independent Code Generator Pass Configuration Options.
BlockT * getHeader() const
Definition: LoopInfo.h:102
auto reverse(ContainerTy &&C, typename std::enable_if< has_rbegin< ContainerTy >::value >::type *=nullptr) -> decltype(make_range(C.rbegin(), C.rend()))
Definition: STLExtras.h:273
bool canFallThrough()
Return true if the block can implicitly transfer control to the block after it by falling off the end...
void DestroyAll()
Call the destructor of each allocated object and deallocate all but the current slab and reset the cu...
Definition: Allocator.h:462
void initMF(MachineFunction &MF, bool PreRegAlloc, const MachineBranchProbabilityInfo *MBPI, bool LayoutMode, unsigned TailDupSize=0)
Prepare to run on a specific machine function.
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they&#39;re not in a MachineFuncti...
virtual const TargetInstrInfo * getInstrInfo() const
cl::opt< std::string > ViewBlockFreqFuncName
auto count(R &&Range, const E &Element) -> typename std::iterator_traits< decltype(adl_begin(Range))>::difference_type
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1258
TargetInstrInfo - Interface to description of machine instruction set.
static bool isSimpleBB(MachineBasicBlock *TailBB)
True if this BB has only one unconditional jump.
cl::opt< unsigned > ProfileLikelyProb
bool getEnableTailMerge() const
virtual bool alignLoopsWithOptSize() const
Should loops be aligned even when the function is marked OptSize (but not MinSize).
#define P(N)
bool tailDuplicateAndUpdate(bool IsSimple, MachineBasicBlock *MBB, MachineBasicBlock *ForcedLayoutPred, SmallVectorImpl< MachineBasicBlock *> *DuplicatedPreds=nullptr, function_ref< void(MachineBasicBlock *)> *RemovalCallback=nullptr)
Tail duplicate a single basic block into its predecessors, and then clean up.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:432
cl::opt< GVDAGType > ViewBlockLayoutWithBFI
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
StringRef getName() const
Return the name of the corresponding LLVM basic block, or an empty string.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:290
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
succ_reverse_iterator succ_rbegin()
static cl::opt< bool > BranchFoldPlacement("branch-fold-placement", cl::desc("Perform branch folding during placement. " "Reduces code size."), cl::init(true), cl::Hidden)
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
LLVM_NODISCARD bool empty() const
Definition: SmallPtrSet.h:91
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:370
virtual unsigned getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
bool canTailDuplicate(MachineBasicBlock *TailBB, MachineBasicBlock *PredBB)
Returns true if TailBB can successfully be duplicated into PredBB.
static cl::opt< unsigned > TriangleChainCount("triangle-chain-count", cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the " "triangle tail duplication heuristic to kick in. 0 to disable."), cl::init(2), cl::Hidden)
bool dominates(const MachineDomTreeNode *A, const MachineDomTreeNode *B) const
Represent the analysis usage information of a pass.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:381
static bool greaterWithBias(BlockFrequency A, BlockFrequency B, uint64_t EntryFreq)
Compare 2 BlockFrequency&#39;s with a small penalty for A.
iterator_range< pred_iterator > predecessors()
bool hasProfileData(bool IncludeSynthetic=false) const
Return true if the function is annotated with profile data.
Definition: Function.h:308
T * Allocate(size_t num=1)
Allocate space for an array of objects without constructing them.
Definition: Allocator.h:490
auto remove_if(R &&Range, UnaryPredicate P) -> decltype(adl_begin(Range))
Provide wrappers to std::remove_if which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1232
virtual bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const
Analyze the branching code at the end of MBB, returning true if it cannot be understood (e...
iterator erase(const_iterator CI)
Definition: SmallVector.h:434
const MachineBasicBlock & front() const
size_t size() const
Definition: SmallVector.h:52
void initializeMachineBlockPlacementPass(PassRegistry &)
static cl::opt< bool > ForcePreciseRotationCost("force-precise-rotation-cost", cl::desc("Force the use of precise cost " "loop rotation strategy."), cl::init(false), cl::Hidden)
auto find(R &&Range, const T &Val) -> decltype(adl_begin(Range))
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1213
static cl::opt< unsigned > TailMergeSize("tail-merge-size", cl::desc("Min number of instructions to consider tail merging"), cl::init(3), cl::Hidden)
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
Branch Probability Basic Block Placement
This base class for TargetLowering contains the SelectionDAG-independent parts that can be used from ...
size_type size() const
Definition: SmallPtrSet.h:92
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
Definition: LoopInfo.h:112
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:297
Iterator for intrusive lists based on ilist_node.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements...
Definition: SmallPtrSet.h:417
static cl::opt< bool > ForceLoopColdBlock("force-loop-cold-block", cl::desc("Force outlining cold blocks from loops."), cl::init(false), cl::Hidden)
PostDominatorTree Class - Concrete subclass of DominatorTree that is used to compute the post-dominat...
loop rotate
This is a &#39;vector&#39; (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:837
void updateTerminator()
Update the terminator instructions in block to account for changes to the layout. ...
static cl::opt< unsigned > JumpInstCost("jump-inst-cost", cl::desc("Cost of jump instructions."), cl::init(1), cl::Hidden)
static cl::opt< unsigned > TailDupPlacementAggressiveThreshold("tail-dup-placement-aggressive-threshold", cl::desc("Instruction cutoff for aggressive tail duplication during " "layout. Used at -O3. Tail merging during layout is forced to " "have a threshold that won't conflict."), cl::init(4), cl::Hidden)
INITIALIZE_PASS_BEGIN(MachineBlockPlacement, DEBUG_TYPE, "Branch Probability Basic Block Placement", false, false) INITIALIZE_PASS_END(MachineBlockPlacement
A BumpPtrAllocator that allows only elements of a specific type to be allocated.
Definition: Allocator.h:441
unsigned pred_size() const
bool isLayoutSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB will be emitted immediately after this block, such that if this bloc...
const Function & getFunction() const
Return the LLVM function that this machine code represents.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:940
static cl::opt< unsigned > TailDupPlacementPenalty("tail-dup-placement-penalty", cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. " "Copying can increase fallthrough, but it also increases icache " "pressure. This parameter controls the penalty to account for that. " "Percent as integer."), cl::init(2), cl::Hidden)
void setPreservesAll()
Set by analyses that do not transform their input at all.
typename SuperClass::iterator iterator
Definition: SmallVector.h:319
unsigned succ_size() const
Branch Probability Basic Block static false std::string getBlockName(const MachineBasicBlock *BB)
Helper to print the name of a MBB.
cl::opt< unsigned > StaticLikelyProb
LLVM_NODISCARD bool equals(StringRef RHS) const
equals - Check for string equality, this is more efficient than compare() when the relative ordering ...
Definition: StringRef.h:160
BranchProbability getEdgeProbability(const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const
unsigned getNumBlocks() const
Get the number of blocks in this loop in constant time.
Definition: LoopInfo.h:165
unsigned succ_size(const Instruction *I)
Definition: CFG.h:256
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
static cl::opt< unsigned > AlignAllBlock("align-all-blocks", cl::desc("Force the alignment of all " "blocks in the function."), cl::init(0), cl::Hidden)
static uint64_t getMaxFrequency()
Returns the maximum possible frequency, the saturation value.
bool isEHPad() const
Returns true if the block is a landing pad.
LLVM_NODISCARD bool empty() const
Definition: SmallVector.h:55
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:151
StringRef getName() const
Return a constant reference to the value&#39;s name.
Definition: Value.cpp:214
static bool hasSameSuccessors(MachineBasicBlock &BB, SmallPtrSetImpl< const MachineBasicBlock *> &Successors)
Check if BB has exactly the successors in Successors.
#define I(x, y, z)
Definition: MD5.cpp:58
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:619
char & MachineBlockPlacementID
MachineBlockPlacement - This pass places basic blocks based on branch probabilities.
block_iterator block_end() const
Definition: LoopInfo.h:157
bool isSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB is a successor of this block.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:171
static cl::opt< unsigned > LoopToColdBlockRatio("loop-to-cold-block-ratio", cl::desc("Outline loop blocks from loop chain if (frequency of loop) / " "(frequency of block) is greater than this ratio"), cl::init(5), cl::Hidden)
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:211
virtual bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const
Reverses the branch condition of the specified condition list, returning false on success and true if...
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
void stable_sort(R &&Range)
Definition: STLExtras.h:1316
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:482
#define DEBUG_TYPE
MachineLoop * getLoopFor(const MachineBasicBlock *BB) const
Return the innermost loop that BB lives in.
static BranchProbability getZero()
Utility class to perform tail duplication.
static cl::opt< bool > PreciseRotationCost("precise-rotation-cost", cl::desc("Model the cost of loop rotation more " "precisely by using profile data."), cl::init(false), cl::Hidden)
#define LLVM_DEBUG(X)
Definition: Debug.h:122
for(unsigned i=Desc.getNumOperands(), e=OldMI.getNumOperands();i !=e;++i)
block_iterator block_begin() const
Definition: LoopInfo.h:156
static cl::opt< unsigned > AlignAllNonFallThruBlocks("align-all-nofallthru-blocks", cl::desc("Force the alignment of all " "blocks that have no fall-through predecessors (i.e. don't add " "nops that are executed)."), cl::init(0), cl::Hidden)
uint32_t getNumerator() const
static cl::opt< unsigned > ExitBlockBias("block-placement-exit-block-bias", cl::desc("Block frequency percentage a loop exit block needs " "over the original exit to be considered the new exit."), cl::init(0), cl::Hidden)
This file describes how to lower LLVM code to machine code.