LLVM  9.0.0svn
MachineBlockPlacement.cpp
Go to the documentation of this file.
1 //===- MachineBlockPlacement.cpp - Basic Block Code Layout optimization ---===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements basic block placement transformations using the CFG
10 // structure and branch probability estimates.
11 //
12 // The pass strives to preserve the structure of the CFG (that is, retain
13 // a topological ordering of basic blocks) in the absence of a *strong* signal
14 // to the contrary from probabilities. However, within the CFG structure, it
15 // attempts to choose an ordering which favors placing more likely sequences of
16 // blocks adjacent to each other.
17 //
18 // The algorithm works from the inner-most loop within a function outward, and
19 // at each stage walks through the basic blocks, trying to coalesce them into
20 // sequential chains where allowed by the CFG (or demanded by heavy
21 // probabilities). Finally, it walks the blocks in topological order, and the
22 // first time it reaches a chain of basic blocks, it schedules them in the
23 // function in-order.
24 //
25 //===----------------------------------------------------------------------===//
26 
27 #include "BranchFolding.h"
28 #include "llvm/ADT/ArrayRef.h"
29 #include "llvm/ADT/DenseMap.h"
30 #include "llvm/ADT/STLExtras.h"
31 #include "llvm/ADT/SetVector.h"
32 #include "llvm/ADT/SmallPtrSet.h"
33 #include "llvm/ADT/SmallVector.h"
34 #include "llvm/ADT/Statistic.h"
49 #include "llvm/IR/DebugLoc.h"
50 #include "llvm/IR/Function.h"
51 #include "llvm/Pass.h"
52 #include "llvm/Support/Allocator.h"
55 #include "llvm/Support/CodeGen.h"
57 #include "llvm/Support/Compiler.h"
58 #include "llvm/Support/Debug.h"
61 #include <algorithm>
62 #include <cassert>
63 #include <cstdint>
64 #include <iterator>
65 #include <memory>
66 #include <string>
67 #include <tuple>
68 #include <utility>
69 #include <vector>
70 
71 using namespace llvm;
72 
73 #define DEBUG_TYPE "block-placement"
74 
75 STATISTIC(NumCondBranches, "Number of conditional branches");
76 STATISTIC(NumUncondBranches, "Number of unconditional branches");
77 STATISTIC(CondBranchTakenFreq,
78  "Potential frequency of taking conditional branches");
79 STATISTIC(UncondBranchTakenFreq,
80  "Potential frequency of taking unconditional branches");
81 
82 static cl::opt<unsigned> AlignAllBlock("align-all-blocks",
83  cl::desc("Force the alignment of all "
84  "blocks in the function."),
85  cl::init(0), cl::Hidden);
86 
88  "align-all-nofallthru-blocks",
89  cl::desc("Force the alignment of all "
90  "blocks that have no fall-through predecessors (i.e. don't add "
91  "nops that are executed)."),
92  cl::init(0), cl::Hidden);
93 
94 // FIXME: Find a good default for this flag and remove the flag.
96  "block-placement-exit-block-bias",
97  cl::desc("Block frequency percentage a loop exit block needs "
98  "over the original exit to be considered the new exit."),
99  cl::init(0), cl::Hidden);
100 
101 // Definition:
102 // - Outlining: placement of a basic block outside the chain or hot path.
103 
105  "loop-to-cold-block-ratio",
106  cl::desc("Outline loop blocks from loop chain if (frequency of loop) / "
107  "(frequency of block) is greater than this ratio"),
108  cl::init(5), cl::Hidden);
109 
111  "force-loop-cold-block",
112  cl::desc("Force outlining cold blocks from loops."),
113  cl::init(false), cl::Hidden);
114 
115 static cl::opt<bool>
116  PreciseRotationCost("precise-rotation-cost",
117  cl::desc("Model the cost of loop rotation more "
118  "precisely by using profile data."),
119  cl::init(false), cl::Hidden);
120 
121 static cl::opt<bool>
122  ForcePreciseRotationCost("force-precise-rotation-cost",
123  cl::desc("Force the use of precise cost "
124  "loop rotation strategy."),
125  cl::init(false), cl::Hidden);
126 
128  "misfetch-cost",
129  cl::desc("Cost that models the probabilistic risk of an instruction "
130  "misfetch due to a jump comparing to falling through, whose cost "
131  "is zero."),
132  cl::init(1), cl::Hidden);
133 
134 static cl::opt<unsigned> JumpInstCost("jump-inst-cost",
135  cl::desc("Cost of jump instructions."),
136  cl::init(1), cl::Hidden);
137 static cl::opt<bool>
138 TailDupPlacement("tail-dup-placement",
139  cl::desc("Perform tail duplication during placement. "
140  "Creates more fallthrough opportunites in "
141  "outline branches."),
142  cl::init(true), cl::Hidden);
143 
144 static cl::opt<bool>
145 BranchFoldPlacement("branch-fold-placement",
146  cl::desc("Perform branch folding during placement. "
147  "Reduces code size."),
148  cl::init(true), cl::Hidden);
149 
150 // Heuristic for tail duplication.
152  "tail-dup-placement-threshold",
153  cl::desc("Instruction cutoff for tail duplication during layout. "
154  "Tail merging during layout is forced to have a threshold "
155  "that won't conflict."), cl::init(2),
156  cl::Hidden);
157 
158 // Heuristic for aggressive tail duplication.
160  "tail-dup-placement-aggressive-threshold",
161  cl::desc("Instruction cutoff for aggressive tail duplication during "
162  "layout. Used at -O3. Tail merging during layout is forced to "
163  "have a threshold that won't conflict."), cl::init(4),
164  cl::Hidden);
165 
166 // Heuristic for tail duplication.
168  "tail-dup-placement-penalty",
169  cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. "
170  "Copying can increase fallthrough, but it also increases icache "
171  "pressure. This parameter controls the penalty to account for that. "
172  "Percent as integer."),
173  cl::init(2),
174  cl::Hidden);
175 
176 // Heuristic for triangle chains.
178  "triangle-chain-count",
179  cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the "
180  "triangle tail duplication heuristic to kick in. 0 to disable."),
181  cl::init(2),
182  cl::Hidden);
183 
186 
187 // Internal option used to control BFI display only after MBP pass.
188 // Defined in CodeGen/MachineBlockFrequencyInfo.cpp:
189 // -view-block-layout-with-bfi=
191 
192 // Command line option to specify the name of the function for CFG dump
193 // Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name=
195 
196 namespace {
197 
198 class BlockChain;
199 
200 /// Type for our function-wide basic block -> block chain mapping.
201 using BlockToChainMapType = DenseMap<const MachineBasicBlock *, BlockChain *>;
202 
203 /// A chain of blocks which will be laid out contiguously.
204 ///
205 /// This is the datastructure representing a chain of consecutive blocks that
206 /// are profitable to layout together in order to maximize fallthrough
207 /// probabilities and code locality. We also can use a block chain to represent
208 /// a sequence of basic blocks which have some external (correctness)
209 /// requirement for sequential layout.
210 ///
211 /// Chains can be built around a single basic block and can be merged to grow
212 /// them. They participate in a block-to-chain mapping, which is updated
213 /// automatically as chains are merged together.
214 class BlockChain {
215  /// The sequence of blocks belonging to this chain.
216  ///
217  /// This is the sequence of blocks for a particular chain. These will be laid
218  /// out in-order within the function.
220 
221  /// A handle to the function-wide basic block to block chain mapping.
222  ///
223  /// This is retained in each block chain to simplify the computation of child
224  /// block chains for SCC-formation and iteration. We store the edges to child
225  /// basic blocks, and map them back to their associated chains using this
226  /// structure.
227  BlockToChainMapType &BlockToChain;
228 
229 public:
230  /// Construct a new BlockChain.
231  ///
232  /// This builds a new block chain representing a single basic block in the
233  /// function. It also registers itself as the chain that block participates
234  /// in with the BlockToChain mapping.
235  BlockChain(BlockToChainMapType &BlockToChain, MachineBasicBlock *BB)
236  : Blocks(1, BB), BlockToChain(BlockToChain) {
237  assert(BB && "Cannot create a chain with a null basic block");
238  BlockToChain[BB] = this;
239  }
240 
241  /// Iterator over blocks within the chain.
244 
245  /// Beginning of blocks within the chain.
246  iterator begin() { return Blocks.begin(); }
247  const_iterator begin() const { return Blocks.begin(); }
248 
249  /// End of blocks within the chain.
250  iterator end() { return Blocks.end(); }
251  const_iterator end() const { return Blocks.end(); }
252 
253  bool remove(MachineBasicBlock* BB) {
254  for(iterator i = begin(); i != end(); ++i) {
255  if (*i == BB) {
256  Blocks.erase(i);
257  return true;
258  }
259  }
260  return false;
261  }
262 
263  /// Merge a block chain into this one.
264  ///
265  /// This routine merges a block chain into this one. It takes care of forming
266  /// a contiguous sequence of basic blocks, updating the edge list, and
267  /// updating the block -> chain mapping. It does not free or tear down the
268  /// old chain, but the old chain's block list is no longer valid.
269  void merge(MachineBasicBlock *BB, BlockChain *Chain) {
270  assert(BB && "Can't merge a null block.");
271  assert(!Blocks.empty() && "Can't merge into an empty chain.");
272 
273  // Fast path in case we don't have a chain already.
274  if (!Chain) {
275  assert(!BlockToChain[BB] &&
276  "Passed chain is null, but BB has entry in BlockToChain.");
277  Blocks.push_back(BB);
278  BlockToChain[BB] = this;
279  return;
280  }
281 
282  assert(BB == *Chain->begin() && "Passed BB is not head of Chain.");
283  assert(Chain->begin() != Chain->end());
284 
285  // Update the incoming blocks to point to this chain, and add them to the
286  // chain structure.
287  for (MachineBasicBlock *ChainBB : *Chain) {
288  Blocks.push_back(ChainBB);
289  assert(BlockToChain[ChainBB] == Chain && "Incoming blocks not in chain.");
290  BlockToChain[ChainBB] = this;
291  }
292  }
293 
294 #ifndef NDEBUG
295  /// Dump the blocks in this chain.
296  LLVM_DUMP_METHOD void dump() {
297  for (MachineBasicBlock *MBB : *this)
298  MBB->dump();
299  }
300 #endif // NDEBUG
301 
302  /// Count of predecessors of any block within the chain which have not
303  /// yet been scheduled. In general, we will delay scheduling this chain
304  /// until those predecessors are scheduled (or we find a sufficiently good
305  /// reason to override this heuristic.) Note that when forming loop chains,
306  /// blocks outside the loop are ignored and treated as if they were already
307  /// scheduled.
308  ///
309  /// Note: This field is reinitialized multiple times - once for each loop,
310  /// and then once for the function as a whole.
311  unsigned UnscheduledPredecessors = 0;
312 };
313 
314 class MachineBlockPlacement : public MachineFunctionPass {
315  /// A type for a block filter set.
316  using BlockFilterSet = SmallSetVector<const MachineBasicBlock *, 16>;
317 
318  /// Pair struct containing basic block and taildup profitability
319  struct BlockAndTailDupResult {
320  MachineBasicBlock *BB;
321  bool ShouldTailDup;
322  };
323 
324  /// Triple struct containing edge weight and the edge.
325  struct WeightedEdge {
326  BlockFrequency Weight;
327  MachineBasicBlock *Src;
328  MachineBasicBlock *Dest;
329  };
330 
331  /// work lists of blocks that are ready to be laid out
334 
335  /// Edges that have already been computed as optimal.
337 
338  /// Machine Function
340 
341  /// A handle to the branch probability pass.
342  const MachineBranchProbabilityInfo *MBPI;
343 
344  /// A handle to the function-wide block frequency pass.
345  std::unique_ptr<BranchFolder::MBFIWrapper> MBFI;
346 
347  /// A handle to the loop info.
348  MachineLoopInfo *MLI;
349 
350  /// Preferred loop exit.
351  /// Member variable for convenience. It may be removed by duplication deep
352  /// in the call stack.
353  MachineBasicBlock *PreferredLoopExit;
354 
355  /// A handle to the target's instruction info.
356  const TargetInstrInfo *TII;
357 
358  /// A handle to the target's lowering info.
359  const TargetLoweringBase *TLI;
360 
361  /// A handle to the post dominator tree.
363 
364  /// Duplicator used to duplicate tails during placement.
365  ///
366  /// Placement decisions can open up new tail duplication opportunities, but
367  /// since tail duplication affects placement decisions of later blocks, it
368  /// must be done inline.
369  TailDuplicator TailDup;
370 
371  /// Allocator and owner of BlockChain structures.
372  ///
373  /// We build BlockChains lazily while processing the loop structure of
374  /// a function. To reduce malloc traffic, we allocate them using this
375  /// slab-like allocator, and destroy them after the pass completes. An
376  /// important guarantee is that this allocator produces stable pointers to
377  /// the chains.
379 
380  /// Function wide BasicBlock to BlockChain mapping.
381  ///
382  /// This mapping allows efficiently moving from any given basic block to the
383  /// BlockChain it participates in, if any. We use it to, among other things,
384  /// allow implicitly defining edges between chains as the existing edges
385  /// between basic blocks.
387 
388 #ifndef NDEBUG
389  /// The set of basic blocks that have terminators that cannot be fully
390  /// analyzed. These basic blocks cannot be re-ordered safely by
391  /// MachineBlockPlacement, and we must preserve physical layout of these
392  /// blocks and their successors through the pass.
393  SmallPtrSet<MachineBasicBlock *, 4> BlocksWithUnanalyzableExits;
394 #endif
395 
396  /// Decrease the UnscheduledPredecessors count for all blocks in chain, and
397  /// if the count goes to 0, add them to the appropriate work list.
398  void markChainSuccessors(
399  const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
400  const BlockFilterSet *BlockFilter = nullptr);
401 
402  /// Decrease the UnscheduledPredecessors count for a single block, and
403  /// if the count goes to 0, add them to the appropriate work list.
404  void markBlockSuccessors(
405  const BlockChain &Chain, const MachineBasicBlock *BB,
406  const MachineBasicBlock *LoopHeaderBB,
407  const BlockFilterSet *BlockFilter = nullptr);
408 
410  collectViableSuccessors(
411  const MachineBasicBlock *BB, const BlockChain &Chain,
412  const BlockFilterSet *BlockFilter,
414  bool shouldPredBlockBeOutlined(
415  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
416  const BlockChain &Chain, const BlockFilterSet *BlockFilter,
417  BranchProbability SuccProb, BranchProbability HotProb);
418  bool repeatedlyTailDuplicateBlock(
420  const MachineBasicBlock *LoopHeaderBB,
421  BlockChain &Chain, BlockFilterSet *BlockFilter,
422  MachineFunction::iterator &PrevUnplacedBlockIt);
423  bool maybeTailDuplicateBlock(
425  BlockChain &Chain, BlockFilterSet *BlockFilter,
426  MachineFunction::iterator &PrevUnplacedBlockIt,
427  bool &DuplicatedToLPred);
428  bool hasBetterLayoutPredecessor(
429  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
430  const BlockChain &SuccChain, BranchProbability SuccProb,
431  BranchProbability RealSuccProb, const BlockChain &Chain,
432  const BlockFilterSet *BlockFilter);
433  BlockAndTailDupResult selectBestSuccessor(
434  const MachineBasicBlock *BB, const BlockChain &Chain,
435  const BlockFilterSet *BlockFilter);
436  MachineBasicBlock *selectBestCandidateBlock(
437  const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList);
438  MachineBasicBlock *getFirstUnplacedBlock(
439  const BlockChain &PlacedChain,
440  MachineFunction::iterator &PrevUnplacedBlockIt,
441  const BlockFilterSet *BlockFilter);
442 
443  /// Add a basic block to the work list if it is appropriate.
444  ///
445  /// If the optional parameter BlockFilter is provided, only MBB
446  /// present in the set will be added to the worklist. If nullptr
447  /// is provided, no filtering occurs.
448  void fillWorkLists(const MachineBasicBlock *MBB,
449  SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
450  const BlockFilterSet *BlockFilter);
451 
452  void buildChain(const MachineBasicBlock *BB, BlockChain &Chain,
453  BlockFilterSet *BlockFilter = nullptr);
454  bool canMoveBottomBlockToTop(const MachineBasicBlock *BottomBlock,
455  const MachineBasicBlock *OldTop);
456  bool hasViableTopFallthrough(const MachineBasicBlock *Top,
457  const BlockFilterSet &LoopBlockSet);
458  MachineBasicBlock *findBestLoopTop(
459  const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
460  MachineBasicBlock *findBestLoopExit(
461  const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
462  BlockFilterSet collectLoopBlockSet(const MachineLoop &L);
463  void buildLoopChains(const MachineLoop &L);
464  void rotateLoop(
465  BlockChain &LoopChain, const MachineBasicBlock *ExitingBB,
466  const BlockFilterSet &LoopBlockSet);
467  void rotateLoopWithProfile(
468  BlockChain &LoopChain, const MachineLoop &L,
469  const BlockFilterSet &LoopBlockSet);
470  void buildCFGChains();
471  void optimizeBranches();
472  void alignBlocks();
473  /// Returns true if a block should be tail-duplicated to increase fallthrough
474  /// opportunities.
475  bool shouldTailDuplicate(MachineBasicBlock *BB);
476  /// Check the edge frequencies to see if tail duplication will increase
477  /// fallthroughs.
478  bool isProfitableToTailDup(
479  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
480  BranchProbability QProb,
481  const BlockChain &Chain, const BlockFilterSet *BlockFilter);
482 
483  /// Check for a trellis layout.
484  bool isTrellis(const MachineBasicBlock *BB,
485  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
486  const BlockChain &Chain, const BlockFilterSet *BlockFilter);
487 
488  /// Get the best successor given a trellis layout.
489  BlockAndTailDupResult getBestTrellisSuccessor(
490  const MachineBasicBlock *BB,
491  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
492  BranchProbability AdjustedSumProb, const BlockChain &Chain,
493  const BlockFilterSet *BlockFilter);
494 
495  /// Get the best pair of non-conflicting edges.
496  static std::pair<WeightedEdge, WeightedEdge> getBestNonConflictingEdges(
497  const MachineBasicBlock *BB,
499 
500  /// Returns true if a block can tail duplicate into all unplaced
501  /// predecessors. Filters based on loop.
502  bool canTailDuplicateUnplacedPreds(
503  const MachineBasicBlock *BB, MachineBasicBlock *Succ,
504  const BlockChain &Chain, const BlockFilterSet *BlockFilter);
505 
506  /// Find chains of triangles to tail-duplicate where a global analysis works,
507  /// but a local analysis would not find them.
508  void precomputeTriangleChains();
509 
510 public:
511  static char ID; // Pass identification, replacement for typeid
512 
513  MachineBlockPlacement() : MachineFunctionPass(ID) {
515  }
516 
517  bool runOnMachineFunction(MachineFunction &F) override;
518 
519  bool allowTailDupPlacement() const {
520  assert(F);
522  }
523 
524  void getAnalysisUsage(AnalysisUsage &AU) const override {
527  if (TailDupPlacement)
532  }
533 };
534 
535 } // end anonymous namespace
536 
538 
540 
541 INITIALIZE_PASS_BEGIN(MachineBlockPlacement, DEBUG_TYPE,
542  "Branch Probability Basic Block Placement", false, false)
547 INITIALIZE_PASS_END(MachineBlockPlacement, DEBUG_TYPE,
548  "Branch Probability Basic Block Placement", false, false)
549 
550 #ifndef NDEBUG
551 /// Helper to print the name of a MBB.
552 ///
553 /// Only used by debug logging.
554 static std::string getBlockName(const MachineBasicBlock *BB) {
555  std::string Result;
556  raw_string_ostream OS(Result);
557  OS << printMBBReference(*BB);
558  OS << " ('" << BB->getName() << "')";
559  OS.flush();
560  return Result;
561 }
562 #endif
563 
564 /// Mark a chain's successors as having one fewer preds.
565 ///
566 /// When a chain is being merged into the "placed" chain, this routine will
567 /// quickly walk the successors of each block in the chain and mark them as
568 /// having one fewer active predecessor. It also adds any successors of this
569 /// chain which reach the zero-predecessor state to the appropriate worklist.
570 void MachineBlockPlacement::markChainSuccessors(
571  const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
572  const BlockFilterSet *BlockFilter) {
573  // Walk all the blocks in this chain, marking their successors as having
574  // a predecessor placed.
575  for (MachineBasicBlock *MBB : Chain) {
576  markBlockSuccessors(Chain, MBB, LoopHeaderBB, BlockFilter);
577  }
578 }
579 
580 /// Mark a single block's successors as having one fewer preds.
581 ///
582 /// Under normal circumstances, this is only called by markChainSuccessors,
583 /// but if a block that was to be placed is completely tail-duplicated away,
584 /// and was duplicated into the chain end, we need to redo markBlockSuccessors
585 /// for just that block.
586 void MachineBlockPlacement::markBlockSuccessors(
587  const BlockChain &Chain, const MachineBasicBlock *MBB,
588  const MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter) {
589  // Add any successors for which this is the only un-placed in-loop
590  // predecessor to the worklist as a viable candidate for CFG-neutral
591  // placement. No subsequent placement of this block will violate the CFG
592  // shape, so we get to use heuristics to choose a favorable placement.
593  for (MachineBasicBlock *Succ : MBB->successors()) {
594  if (BlockFilter && !BlockFilter->count(Succ))
595  continue;
596  BlockChain &SuccChain = *BlockToChain[Succ];
597  // Disregard edges within a fixed chain, or edges to the loop header.
598  if (&Chain == &SuccChain || Succ == LoopHeaderBB)
599  continue;
600 
601  // This is a cross-chain edge that is within the loop, so decrement the
602  // loop predecessor count of the destination chain.
603  if (SuccChain.UnscheduledPredecessors == 0 ||
604  --SuccChain.UnscheduledPredecessors > 0)
605  continue;
606 
607  auto *NewBB = *SuccChain.begin();
608  if (NewBB->isEHPad())
609  EHPadWorkList.push_back(NewBB);
610  else
611  BlockWorkList.push_back(NewBB);
612  }
613 }
614 
615 /// This helper function collects the set of successors of block
616 /// \p BB that are allowed to be its layout successors, and return
617 /// the total branch probability of edges from \p BB to those
618 /// blocks.
619 BranchProbability MachineBlockPlacement::collectViableSuccessors(
620  const MachineBasicBlock *BB, const BlockChain &Chain,
621  const BlockFilterSet *BlockFilter,
623  // Adjust edge probabilities by excluding edges pointing to blocks that is
624  // either not in BlockFilter or is already in the current chain. Consider the
625  // following CFG:
626  //
627  // --->A
628  // | / \
629  // | B C
630  // | \ / \
631  // ----D E
632  //
633  // Assume A->C is very hot (>90%), and C->D has a 50% probability, then after
634  // A->C is chosen as a fall-through, D won't be selected as a successor of C
635  // due to CFG constraint (the probability of C->D is not greater than
636  // HotProb to break topo-order). If we exclude E that is not in BlockFilter
637  // when calculating the probability of C->D, D will be selected and we
638  // will get A C D B as the layout of this loop.
639  auto AdjustedSumProb = BranchProbability::getOne();
640  for (MachineBasicBlock *Succ : BB->successors()) {
641  bool SkipSucc = false;
642  if (Succ->isEHPad() || (BlockFilter && !BlockFilter->count(Succ))) {
643  SkipSucc = true;
644  } else {
645  BlockChain *SuccChain = BlockToChain[Succ];
646  if (SuccChain == &Chain) {
647  SkipSucc = true;
648  } else if (Succ != *SuccChain->begin()) {
649  LLVM_DEBUG(dbgs() << " " << getBlockName(Succ)
650  << " -> Mid chain!\n");
651  continue;
652  }
653  }
654  if (SkipSucc)
655  AdjustedSumProb -= MBPI->getEdgeProbability(BB, Succ);
656  else
657  Successors.push_back(Succ);
658  }
659 
660  return AdjustedSumProb;
661 }
662 
663 /// The helper function returns the branch probability that is adjusted
664 /// or normalized over the new total \p AdjustedSumProb.
665 static BranchProbability
667  BranchProbability AdjustedSumProb) {
668  BranchProbability SuccProb;
669  uint32_t SuccProbN = OrigProb.getNumerator();
670  uint32_t SuccProbD = AdjustedSumProb.getNumerator();
671  if (SuccProbN >= SuccProbD)
672  SuccProb = BranchProbability::getOne();
673  else
674  SuccProb = BranchProbability(SuccProbN, SuccProbD);
675 
676  return SuccProb;
677 }
678 
679 /// Check if \p BB has exactly the successors in \p Successors.
680 static bool
683  if (BB.succ_size() != Successors.size())
684  return false;
685  // We don't want to count self-loops
686  if (Successors.count(&BB))
687  return false;
688  for (MachineBasicBlock *Succ : BB.successors())
689  if (!Successors.count(Succ))
690  return false;
691  return true;
692 }
693 
694 /// Check if a block should be tail duplicated to increase fallthrough
695 /// opportunities.
696 /// \p BB Block to check.
697 bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
698  // Blocks with single successors don't create additional fallthrough
699  // opportunities. Don't duplicate them. TODO: When conditional exits are
700  // analyzable, allow them to be duplicated.
701  bool IsSimple = TailDup.isSimpleBB(BB);
702 
703  if (BB->succ_size() == 1)
704  return false;
705  return TailDup.shouldTailDuplicate(IsSimple, *BB);
706 }
707 
708 /// Compare 2 BlockFrequency's with a small penalty for \p A.
709 /// In order to be conservative, we apply a X% penalty to account for
710 /// increased icache pressure and static heuristics. For small frequencies
711 /// we use only the numerators to improve accuracy. For simplicity, we assume the
712 /// penalty is less than 100%
713 /// TODO(iteratee): Use 64-bit fixed point edge frequencies everywhere.
715  uint64_t EntryFreq) {
716  BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
717  BlockFrequency Gain = A - B;
718  return (Gain / ThresholdProb).getFrequency() >= EntryFreq;
719 }
720 
721 /// Check the edge frequencies to see if tail duplication will increase
722 /// fallthroughs. It only makes sense to call this function when
723 /// \p Succ would not be chosen otherwise. Tail duplication of \p Succ is
724 /// always locally profitable if we would have picked \p Succ without
725 /// considering duplication.
726 bool MachineBlockPlacement::isProfitableToTailDup(
727  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
728  BranchProbability QProb,
729  const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
730  // We need to do a probability calculation to make sure this is profitable.
731  // First: does succ have a successor that post-dominates? This affects the
732  // calculation. The 2 relevant cases are:
733  // BB BB
734  // | \Qout | \Qout
735  // P| C |P C
736  // = C' = C'
737  // | /Qin | /Qin
738  // | / | /
739  // Succ Succ
740  // / \ | \ V
741  // U/ =V |U \
742  // / \ = D
743  // D E | /
744  // | /
745  // |/
746  // PDom
747  // '=' : Branch taken for that CFG edge
748  // In the second case, Placing Succ while duplicating it into C prevents the
749  // fallthrough of Succ into either D or PDom, because they now have C as an
750  // unplaced predecessor
751 
752  // Start by figuring out which case we fall into
753  MachineBasicBlock *PDom = nullptr;
755  // Only scan the relevant successors
756  auto AdjustedSuccSumProb =
757  collectViableSuccessors(Succ, Chain, BlockFilter, SuccSuccs);
758  BranchProbability PProb = MBPI->getEdgeProbability(BB, Succ);
759  auto BBFreq = MBFI->getBlockFreq(BB);
760  auto SuccFreq = MBFI->getBlockFreq(Succ);
761  BlockFrequency P = BBFreq * PProb;
762  BlockFrequency Qout = BBFreq * QProb;
763  uint64_t EntryFreq = MBFI->getEntryFreq();
764  // If there are no more successors, it is profitable to copy, as it strictly
765  // increases fallthrough.
766  if (SuccSuccs.size() == 0)
767  return greaterWithBias(P, Qout, EntryFreq);
768 
769  auto BestSuccSucc = BranchProbability::getZero();
770  // Find the PDom or the best Succ if no PDom exists.
771  for (MachineBasicBlock *SuccSucc : SuccSuccs) {
772  auto Prob = MBPI->getEdgeProbability(Succ, SuccSucc);
773  if (Prob > BestSuccSucc)
774  BestSuccSucc = Prob;
775  if (PDom == nullptr)
776  if (MPDT->dominates(SuccSucc, Succ)) {
777  PDom = SuccSucc;
778  break;
779  }
780  }
781  // For the comparisons, we need to know Succ's best incoming edge that isn't
782  // from BB.
783  auto SuccBestPred = BlockFrequency(0);
784  for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
785  if (SuccPred == Succ || SuccPred == BB
786  || BlockToChain[SuccPred] == &Chain
787  || (BlockFilter && !BlockFilter->count(SuccPred)))
788  continue;
789  auto Freq = MBFI->getBlockFreq(SuccPred)
790  * MBPI->getEdgeProbability(SuccPred, Succ);
791  if (Freq > SuccBestPred)
792  SuccBestPred = Freq;
793  }
794  // Qin is Succ's best unplaced incoming edge that isn't BB
795  BlockFrequency Qin = SuccBestPred;
796  // If it doesn't have a post-dominating successor, here is the calculation:
797  // BB BB
798  // | \Qout | \
799  // P| C | =
800  // = C' | C
801  // | /Qin | |
802  // | / | C' (+Succ)
803  // Succ Succ /|
804  // / \ | \/ |
805  // U/ =V | == |
806  // / \ | / \|
807  // D E D E
808  // '=' : Branch taken for that CFG edge
809  // Cost in the first case is: P + V
810  // For this calculation, we always assume P > Qout. If Qout > P
811  // The result of this function will be ignored at the caller.
812  // Let F = SuccFreq - Qin
813  // Cost in the second case is: Qout + min(Qin, F) * U + max(Qin, F) * V
814 
815  if (PDom == nullptr || !Succ->isSuccessor(PDom)) {
816  BranchProbability UProb = BestSuccSucc;
817  BranchProbability VProb = AdjustedSuccSumProb - UProb;
818  BlockFrequency F = SuccFreq - Qin;
819  BlockFrequency V = SuccFreq * VProb;
820  BlockFrequency QinU = std::min(Qin, F) * UProb;
821  BlockFrequency BaseCost = P + V;
822  BlockFrequency DupCost = Qout + QinU + std::max(Qin, F) * VProb;
823  return greaterWithBias(BaseCost, DupCost, EntryFreq);
824  }
825  BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom);
826  BranchProbability VProb = AdjustedSuccSumProb - UProb;
827  BlockFrequency U = SuccFreq * UProb;
828  BlockFrequency V = SuccFreq * VProb;
829  BlockFrequency F = SuccFreq - Qin;
830  // If there is a post-dominating successor, here is the calculation:
831  // BB BB BB BB
832  // | \Qout | \ | \Qout | \
833  // |P C | = |P C | =
834  // = C' |P C = C' |P C
835  // | /Qin | | | /Qin | |
836  // | / | C' (+Succ) | / | C' (+Succ)
837  // Succ Succ /| Succ Succ /|
838  // | \ V | \/ | | \ V | \/ |
839  // |U \ |U /\ =? |U = |U /\ |
840  // = D = = =?| | D | = =|
841  // | / |/ D | / |/ D
842  // | / | / | = | /
843  // |/ | / |/ | =
844  // Dom Dom Dom Dom
845  // '=' : Branch taken for that CFG edge
846  // The cost for taken branches in the first case is P + U
847  // Let F = SuccFreq - Qin
848  // The cost in the second case (assuming independence), given the layout:
849  // BB, Succ, (C+Succ), D, Dom or the layout:
850  // BB, Succ, D, Dom, (C+Succ)
851  // is Qout + max(F, Qin) * U + min(F, Qin)
852  // compare P + U vs Qout + P * U + Qin.
853  //
854  // The 3rd and 4th cases cover when Dom would be chosen to follow Succ.
855  //
856  // For the 3rd case, the cost is P + 2 * V
857  // For the 4th case, the cost is Qout + min(Qin, F) * U + max(Qin, F) * V + V
858  // We choose 4 over 3 when (P + V) > Qout + min(Qin, F) * U + max(Qin, F) * V
859  if (UProb > AdjustedSuccSumProb / 2 &&
860  !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom], UProb, UProb,
861  Chain, BlockFilter))
862  // Cases 3 & 4
863  return greaterWithBias(
864  (P + V), (Qout + std::max(Qin, F) * VProb + std::min(Qin, F) * UProb),
865  EntryFreq);
866  // Cases 1 & 2
867  return greaterWithBias((P + U),
868  (Qout + std::min(Qin, F) * AdjustedSuccSumProb +
869  std::max(Qin, F) * UProb),
870  EntryFreq);
871 }
872 
873 /// Check for a trellis layout. \p BB is the upper part of a trellis if its
874 /// successors form the lower part of a trellis. A successor set S forms the
875 /// lower part of a trellis if all of the predecessors of S are either in S or
876 /// have all of S as successors. We ignore trellises where BB doesn't have 2
877 /// successors because for fewer than 2, it's trivial, and for 3 or greater they
878 /// are very uncommon and complex to compute optimally. Allowing edges within S
879 /// is not strictly a trellis, but the same algorithm works, so we allow it.
880 bool MachineBlockPlacement::isTrellis(
881  const MachineBasicBlock *BB,
882  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
883  const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
884  // Technically BB could form a trellis with branching factor higher than 2.
885  // But that's extremely uncommon.
886  if (BB->succ_size() != 2 || ViableSuccs.size() != 2)
887  return false;
888 
890  BB->succ_end());
891  // To avoid reviewing the same predecessors twice.
893 
894  for (MachineBasicBlock *Succ : ViableSuccs) {
895  int PredCount = 0;
896  for (auto SuccPred : Succ->predecessors()) {
897  // Allow triangle successors, but don't count them.
898  if (Successors.count(SuccPred)) {
899  // Make sure that it is actually a triangle.
900  for (MachineBasicBlock *CheckSucc : SuccPred->successors())
901  if (!Successors.count(CheckSucc))
902  return false;
903  continue;
904  }
905  const BlockChain *PredChain = BlockToChain[SuccPred];
906  if (SuccPred == BB || (BlockFilter && !BlockFilter->count(SuccPred)) ||
907  PredChain == &Chain || PredChain == BlockToChain[Succ])
908  continue;
909  ++PredCount;
910  // Perform the successor check only once.
911  if (!SeenPreds.insert(SuccPred).second)
912  continue;
913  if (!hasSameSuccessors(*SuccPred, Successors))
914  return false;
915  }
916  // If one of the successors has only BB as a predecessor, it is not a
917  // trellis.
918  if (PredCount < 1)
919  return false;
920  }
921  return true;
922 }
923 
924 /// Pick the highest total weight pair of edges that can both be laid out.
925 /// The edges in \p Edges[0] are assumed to have a different destination than
926 /// the edges in \p Edges[1]. Simple counting shows that the best pair is either
927 /// the individual highest weight edges to the 2 different destinations, or in
928 /// case of a conflict, one of them should be replaced with a 2nd best edge.
929 std::pair<MachineBlockPlacement::WeightedEdge,
930  MachineBlockPlacement::WeightedEdge>
931 MachineBlockPlacement::getBestNonConflictingEdges(
932  const MachineBasicBlock *BB,
934  Edges) {
935  // Sort the edges, and then for each successor, find the best incoming
936  // predecessor. If the best incoming predecessors aren't the same,
937  // then that is clearly the best layout. If there is a conflict, one of the
938  // successors will have to fallthrough from the second best predecessor. We
939  // compare which combination is better overall.
940 
941  // Sort for highest frequency.
942  auto Cmp = [](WeightedEdge A, WeightedEdge B) { return A.Weight > B.Weight; };
943 
944  llvm::stable_sort(Edges[0], Cmp);
945  llvm::stable_sort(Edges[1], Cmp);
946  auto BestA = Edges[0].begin();
947  auto BestB = Edges[1].begin();
948  // Arrange for the correct answer to be in BestA and BestB
949  // If the 2 best edges don't conflict, the answer is already there.
950  if (BestA->Src == BestB->Src) {
951  // Compare the total fallthrough of (Best + Second Best) for both pairs
952  auto SecondBestA = std::next(BestA);
953  auto SecondBestB = std::next(BestB);
954  BlockFrequency BestAScore = BestA->Weight + SecondBestB->Weight;
955  BlockFrequency BestBScore = BestB->Weight + SecondBestA->Weight;
956  if (BestAScore < BestBScore)
957  BestA = SecondBestA;
958  else
959  BestB = SecondBestB;
960  }
961  // Arrange for the BB edge to be in BestA if it exists.
962  if (BestB->Src == BB)
963  std::swap(BestA, BestB);
964  return std::make_pair(*BestA, *BestB);
965 }
966 
967 /// Get the best successor from \p BB based on \p BB being part of a trellis.
968 /// We only handle trellises with 2 successors, so the algorithm is
969 /// straightforward: Find the best pair of edges that don't conflict. We find
970 /// the best incoming edge for each successor in the trellis. If those conflict,
971 /// we consider which of them should be replaced with the second best.
972 /// Upon return the two best edges will be in \p BestEdges. If one of the edges
973 /// comes from \p BB, it will be in \p BestEdges[0]
974 MachineBlockPlacement::BlockAndTailDupResult
975 MachineBlockPlacement::getBestTrellisSuccessor(
976  const MachineBasicBlock *BB,
977  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
978  BranchProbability AdjustedSumProb, const BlockChain &Chain,
979  const BlockFilterSet *BlockFilter) {
980 
981  BlockAndTailDupResult Result = {nullptr, false};
983  BB->succ_end());
984 
985  // We assume size 2 because it's common. For general n, we would have to do
986  // the Hungarian algorithm, but it's not worth the complexity because more
987  // than 2 successors is fairly uncommon, and a trellis even more so.
988  if (Successors.size() != 2 || ViableSuccs.size() != 2)
989  return Result;
990 
991  // Collect the edge frequencies of all edges that form the trellis.
993  int SuccIndex = 0;
994  for (auto Succ : ViableSuccs) {
995  for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
996  // Skip any placed predecessors that are not BB
997  if (SuccPred != BB)
998  if ((BlockFilter && !BlockFilter->count(SuccPred)) ||
999  BlockToChain[SuccPred] == &Chain ||
1000  BlockToChain[SuccPred] == BlockToChain[Succ])
1001  continue;
1002  BlockFrequency EdgeFreq = MBFI->getBlockFreq(SuccPred) *
1003  MBPI->getEdgeProbability(SuccPred, Succ);
1004  Edges[SuccIndex].push_back({EdgeFreq, SuccPred, Succ});
1005  }
1006  ++SuccIndex;
1007  }
1008 
1009  // Pick the best combination of 2 edges from all the edges in the trellis.
1010  WeightedEdge BestA, BestB;
1011  std::tie(BestA, BestB) = getBestNonConflictingEdges(BB, Edges);
1012 
1013  if (BestA.Src != BB) {
1014  // If we have a trellis, and BB doesn't have the best fallthrough edges,
1015  // we shouldn't choose any successor. We've already looked and there's a
1016  // better fallthrough edge for all the successors.
1017  LLVM_DEBUG(dbgs() << "Trellis, but not one of the chosen edges.\n");
1018  return Result;
1019  }
1020 
1021  // Did we pick the triangle edge? If tail-duplication is profitable, do
1022  // that instead. Otherwise merge the triangle edge now while we know it is
1023  // optimal.
1024  if (BestA.Dest == BestB.Src) {
1025  // The edges are BB->Succ1->Succ2, and we're looking to see if BB->Succ2
1026  // would be better.
1027  MachineBasicBlock *Succ1 = BestA.Dest;
1028  MachineBasicBlock *Succ2 = BestB.Dest;
1029  // Check to see if tail-duplication would be profitable.
1030  if (allowTailDupPlacement() && shouldTailDuplicate(Succ2) &&
1031  canTailDuplicateUnplacedPreds(BB, Succ2, Chain, BlockFilter) &&
1032  isProfitableToTailDup(BB, Succ2, MBPI->getEdgeProbability(BB, Succ1),
1033  Chain, BlockFilter)) {
1035  MBPI->getEdgeProbability(BB, Succ2), AdjustedSumProb);
1036  dbgs() << " Selected: " << getBlockName(Succ2)
1037  << ", probability: " << Succ2Prob
1038  << " (Tail Duplicate)\n");
1039  Result.BB = Succ2;
1040  Result.ShouldTailDup = true;
1041  return Result;
1042  }
1043  }
1044  // We have already computed the optimal edge for the other side of the
1045  // trellis.
1046  ComputedEdges[BestB.Src] = { BestB.Dest, false };
1047 
1048  auto TrellisSucc = BestA.Dest;
1050  MBPI->getEdgeProbability(BB, TrellisSucc), AdjustedSumProb);
1051  dbgs() << " Selected: " << getBlockName(TrellisSucc)
1052  << ", probability: " << SuccProb << " (Trellis)\n");
1053  Result.BB = TrellisSucc;
1054  return Result;
1055 }
1056 
1057 /// When the option allowTailDupPlacement() is on, this method checks if the
1058 /// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated
1059 /// into all of its unplaced, unfiltered predecessors, that are not BB.
1060 bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
1061  const MachineBasicBlock *BB, MachineBasicBlock *Succ,
1062  const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
1063  if (!shouldTailDuplicate(Succ))
1064  return false;
1065 
1066  // For CFG checking.
1068  BB->succ_end());
1069  for (MachineBasicBlock *Pred : Succ->predecessors()) {
1070  // Make sure all unplaced and unfiltered predecessors can be
1071  // tail-duplicated into.
1072  // Skip any blocks that are already placed or not in this loop.
1073  if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred))
1074  || BlockToChain[Pred] == &Chain)
1075  continue;
1076  if (!TailDup.canTailDuplicate(Succ, Pred)) {
1077  if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors))
1078  // This will result in a trellis after tail duplication, so we don't
1079  // need to copy Succ into this predecessor. In the presence
1080  // of a trellis tail duplication can continue to be profitable.
1081  // For example:
1082  // A A
1083  // |\ |\
1084  // | \ | \
1085  // | C | C+BB
1086  // | / | |
1087  // |/ | |
1088  // BB => BB |
1089  // |\ |\/|
1090  // | \ |/\|
1091  // | D | D
1092  // | / | /
1093  // |/ |/
1094  // Succ Succ
1095  //
1096  // After BB was duplicated into C, the layout looks like the one on the
1097  // right. BB and C now have the same successors. When considering
1098  // whether Succ can be duplicated into all its unplaced predecessors, we
1099  // ignore C.
1100  // We can do this because C already has a profitable fallthrough, namely
1101  // D. TODO(iteratee): ignore sufficiently cold predecessors for
1102  // duplication and for this test.
1103  //
1104  // This allows trellises to be laid out in 2 separate chains
1105  // (A,B,Succ,...) and later (C,D,...) This is a reasonable heuristic
1106  // because it allows the creation of 2 fallthrough paths with links
1107  // between them, and we correctly identify the best layout for these
1108  // CFGs. We want to extend trellises that the user created in addition
1109  // to trellises created by tail-duplication, so we just look for the
1110  // CFG.
1111  continue;
1112  return false;
1113  }
1114  }
1115  return true;
1116 }
1117 
1118 /// Find chains of triangles where we believe it would be profitable to
1119 /// tail-duplicate them all, but a local analysis would not find them.
1120 /// There are 3 ways this can be profitable:
1121 /// 1) The post-dominators marked 50% are actually taken 55% (This shrinks with
1122 /// longer chains)
1123 /// 2) The chains are statically correlated. Branch probabilities have a very
1124 /// U-shaped distribution.
1125 /// [http://nrs.harvard.edu/urn-3:HUL.InstRepos:24015805]
1126 /// If the branches in a chain are likely to be from the same side of the
1127 /// distribution as their predecessor, but are independent at runtime, this
1128 /// transformation is profitable. (Because the cost of being wrong is a small
1129 /// fixed cost, unlike the standard triangle layout where the cost of being
1130 /// wrong scales with the # of triangles.)
1131 /// 3) The chains are dynamically correlated. If the probability that a previous
1132 /// branch was taken positively influences whether the next branch will be
1133 /// taken
1134 /// We believe that 2 and 3 are common enough to justify the small margin in 1.
1135 void MachineBlockPlacement::precomputeTriangleChains() {
1136  struct TriangleChain {
1137  std::vector<MachineBasicBlock *> Edges;
1138 
1139  TriangleChain(MachineBasicBlock *src, MachineBasicBlock *dst)
1140  : Edges({src, dst}) {}
1141 
1142  void append(MachineBasicBlock *dst) {
1143  assert(getKey()->isSuccessor(dst) &&
1144  "Attempting to append a block that is not a successor.");
1145  Edges.push_back(dst);
1146  }
1147 
1148  unsigned count() const { return Edges.size() - 1; }
1149 
1150  MachineBasicBlock *getKey() const {
1151  return Edges.back();
1152  }
1153  };
1154 
1155  if (TriangleChainCount == 0)
1156  return;
1157 
1158  LLVM_DEBUG(dbgs() << "Pre-computing triangle chains.\n");
1159  // Map from last block to the chain that contains it. This allows us to extend
1160  // chains as we find new triangles.
1162  for (MachineBasicBlock &BB : *F) {
1163  // If BB doesn't have 2 successors, it doesn't start a triangle.
1164  if (BB.succ_size() != 2)
1165  continue;
1166  MachineBasicBlock *PDom = nullptr;
1167  for (MachineBasicBlock *Succ : BB.successors()) {
1168  if (!MPDT->dominates(Succ, &BB))
1169  continue;
1170  PDom = Succ;
1171  break;
1172  }
1173  // If BB doesn't have a post-dominating successor, it doesn't form a
1174  // triangle.
1175  if (PDom == nullptr)
1176  continue;
1177  // If PDom has a hint that it is low probability, skip this triangle.
1178  if (MBPI->getEdgeProbability(&BB, PDom) < BranchProbability(50, 100))
1179  continue;
1180  // If PDom isn't eligible for duplication, this isn't the kind of triangle
1181  // we're looking for.
1182  if (!shouldTailDuplicate(PDom))
1183  continue;
1184  bool CanTailDuplicate = true;
1185  // If PDom can't tail-duplicate into it's non-BB predecessors, then this
1186  // isn't the kind of triangle we're looking for.
1187  for (MachineBasicBlock* Pred : PDom->predecessors()) {
1188  if (Pred == &BB)
1189  continue;
1190  if (!TailDup.canTailDuplicate(PDom, Pred)) {
1191  CanTailDuplicate = false;
1192  break;
1193  }
1194  }
1195  // If we can't tail-duplicate PDom to its predecessors, then skip this
1196  // triangle.
1197  if (!CanTailDuplicate)
1198  continue;
1199 
1200  // Now we have an interesting triangle. Insert it if it's not part of an
1201  // existing chain.
1202  // Note: This cannot be replaced with a call insert() or emplace() because
1203  // the find key is BB, but the insert/emplace key is PDom.
1204  auto Found = TriangleChainMap.find(&BB);
1205  // If it is, remove the chain from the map, grow it, and put it back in the
1206  // map with the end as the new key.
1207  if (Found != TriangleChainMap.end()) {
1208  TriangleChain Chain = std::move(Found->second);
1209  TriangleChainMap.erase(Found);
1210  Chain.append(PDom);
1211  TriangleChainMap.insert(std::make_pair(Chain.getKey(), std::move(Chain)));
1212  } else {
1213  auto InsertResult = TriangleChainMap.try_emplace(PDom, &BB, PDom);
1214  assert(InsertResult.second && "Block seen twice.");
1215  (void)InsertResult;
1216  }
1217  }
1218 
1219  // Iterating over a DenseMap is safe here, because the only thing in the body
1220  // of the loop is inserting into another DenseMap (ComputedEdges).
1221  // ComputedEdges is never iterated, so this doesn't lead to non-determinism.
1222  for (auto &ChainPair : TriangleChainMap) {
1223  TriangleChain &Chain = ChainPair.second;
1224  // Benchmarking has shown that due to branch correlation duplicating 2 or
1225  // more triangles is profitable, despite the calculations assuming
1226  // independence.
1227  if (Chain.count() < TriangleChainCount)
1228  continue;
1229  MachineBasicBlock *dst = Chain.Edges.back();
1230  Chain.Edges.pop_back();
1231  for (MachineBasicBlock *src : reverse(Chain.Edges)) {
1232  LLVM_DEBUG(dbgs() << "Marking edge: " << getBlockName(src) << "->"
1233  << getBlockName(dst)
1234  << " as pre-computed based on triangles.\n");
1235 
1236  auto InsertResult = ComputedEdges.insert({src, {dst, true}});
1237  assert(InsertResult.second && "Block seen twice.");
1238  (void)InsertResult;
1239 
1240  dst = src;
1241  }
1242  }
1243 }
1244 
1245 // When profile is not present, return the StaticLikelyProb.
1246 // When profile is available, we need to handle the triangle-shape CFG.
1248  const MachineBasicBlock *BB) {
1249  if (!BB->getParent()->getFunction().hasProfileData())
1250  return BranchProbability(StaticLikelyProb, 100);
1251  if (BB->succ_size() == 2) {
1252  const MachineBasicBlock *Succ1 = *BB->succ_begin();
1253  const MachineBasicBlock *Succ2 = *(BB->succ_begin() + 1);
1254  if (Succ1->isSuccessor(Succ2) || Succ2->isSuccessor(Succ1)) {
1255  /* See case 1 below for the cost analysis. For BB->Succ to
1256  * be taken with smaller cost, the following needs to hold:
1257  * Prob(BB->Succ) > 2 * Prob(BB->Pred)
1258  * So the threshold T in the calculation below
1259  * (1-T) * Prob(BB->Succ) > T * Prob(BB->Pred)
1260  * So T / (1 - T) = 2, Yielding T = 2/3
1261  * Also adding user specified branch bias, we have
1262  * T = (2/3)*(ProfileLikelyProb/50)
1263  * = (2*ProfileLikelyProb)/150)
1264  */
1265  return BranchProbability(2 * ProfileLikelyProb, 150);
1266  }
1267  }
1268  return BranchProbability(ProfileLikelyProb, 100);
1269 }
1270 
1271 /// Checks to see if the layout candidate block \p Succ has a better layout
1272 /// predecessor than \c BB. If yes, returns true.
1273 /// \p SuccProb: The probability adjusted for only remaining blocks.
1274 /// Only used for logging
1275 /// \p RealSuccProb: The un-adjusted probability.
1276 /// \p Chain: The chain that BB belongs to and Succ is being considered for.
1277 /// \p BlockFilter: if non-null, the set of blocks that make up the loop being
1278 /// considered
1279 bool MachineBlockPlacement::hasBetterLayoutPredecessor(
1280  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
1281  const BlockChain &SuccChain, BranchProbability SuccProb,
1282  BranchProbability RealSuccProb, const BlockChain &Chain,
1283  const BlockFilterSet *BlockFilter) {
1284 
1285  // There isn't a better layout when there are no unscheduled predecessors.
1286  if (SuccChain.UnscheduledPredecessors == 0)
1287  return false;
1288 
1289  // There are two basic scenarios here:
1290  // -------------------------------------
1291  // Case 1: triangular shape CFG (if-then):
1292  // BB
1293  // | \
1294  // | \
1295  // | Pred
1296  // | /
1297  // Succ
1298  // In this case, we are evaluating whether to select edge -> Succ, e.g.
1299  // set Succ as the layout successor of BB. Picking Succ as BB's
1300  // successor breaks the CFG constraints (FIXME: define these constraints).
1301  // With this layout, Pred BB
1302  // is forced to be outlined, so the overall cost will be cost of the
1303  // branch taken from BB to Pred, plus the cost of back taken branch
1304  // from Pred to Succ, as well as the additional cost associated
1305  // with the needed unconditional jump instruction from Pred To Succ.
1306 
1307  // The cost of the topological order layout is the taken branch cost
1308  // from BB to Succ, so to make BB->Succ a viable candidate, the following
1309  // must hold:
1310  // 2 * freq(BB->Pred) * taken_branch_cost + unconditional_jump_cost
1311  // < freq(BB->Succ) * taken_branch_cost.
1312  // Ignoring unconditional jump cost, we get
1313  // freq(BB->Succ) > 2 * freq(BB->Pred), i.e.,
1314  // prob(BB->Succ) > 2 * prob(BB->Pred)
1315  //
1316  // When real profile data is available, we can precisely compute the
1317  // probability threshold that is needed for edge BB->Succ to be considered.
1318  // Without profile data, the heuristic requires the branch bias to be
1319  // a lot larger to make sure the signal is very strong (e.g. 80% default).
1320  // -----------------------------------------------------------------
1321  // Case 2: diamond like CFG (if-then-else):
1322  // S
1323  // / \
1324  // | \
1325  // BB Pred
1326  // \ /
1327  // Succ
1328  // ..
1329  //
1330  // The current block is BB and edge BB->Succ is now being evaluated.
1331  // Note that edge S->BB was previously already selected because
1332  // prob(S->BB) > prob(S->Pred).
1333  // At this point, 2 blocks can be placed after BB: Pred or Succ. If we
1334  // choose Pred, we will have a topological ordering as shown on the left
1335  // in the picture below. If we choose Succ, we have the solution as shown
1336  // on the right:
1337  //
1338  // topo-order:
1339  //
1340  // S----- ---S
1341  // | | | |
1342  // ---BB | | BB
1343  // | | | |
1344  // | Pred-- | Succ--
1345  // | | | |
1346  // ---Succ ---Pred--
1347  //
1348  // cost = freq(S->Pred) + freq(BB->Succ) cost = 2 * freq (S->Pred)
1349  // = freq(S->Pred) + freq(S->BB)
1350  //
1351  // If we have profile data (i.e, branch probabilities can be trusted), the
1352  // cost (number of taken branches) with layout S->BB->Succ->Pred is 2 *
1353  // freq(S->Pred) while the cost of topo order is freq(S->Pred) + freq(S->BB).
1354  // We know Prob(S->BB) > Prob(S->Pred), so freq(S->BB) > freq(S->Pred), which
1355  // means the cost of topological order is greater.
1356  // When profile data is not available, however, we need to be more
1357  // conservative. If the branch prediction is wrong, breaking the topo-order
1358  // will actually yield a layout with large cost. For this reason, we need
1359  // strong biased branch at block S with Prob(S->BB) in order to select
1360  // BB->Succ. This is equivalent to looking the CFG backward with backward
1361  // edge: Prob(Succ->BB) needs to >= HotProb in order to be selected (without
1362  // profile data).
1363  // --------------------------------------------------------------------------
1364  // Case 3: forked diamond
1365  // S
1366  // / \
1367  // / \
1368  // BB Pred
1369  // | \ / |
1370  // | \ / |
1371  // | X |
1372  // | / \ |
1373  // | / \ |
1374  // S1 S2
1375  //
1376  // The current block is BB and edge BB->S1 is now being evaluated.
1377  // As above S->BB was already selected because
1378  // prob(S->BB) > prob(S->Pred). Assume that prob(BB->S1) >= prob(BB->S2).
1379  //
1380  // topo-order:
1381  //
1382  // S-------| ---S
1383  // | | | |
1384  // ---BB | | BB
1385  // | | | |
1386  // | Pred----| | S1----
1387  // | | | |
1388  // --(S1 or S2) ---Pred--
1389  // |
1390  // S2
1391  //
1392  // topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2)
1393  // + min(freq(Pred->S1), freq(Pred->S2))
1394  // Non-topo-order cost:
1395  // non-topo-cost = 2 * freq(S->Pred) + freq(BB->S2).
1396  // To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2))
1397  // is 0. Then the non topo layout is better when
1398  // freq(S->Pred) < freq(BB->S1).
1399  // This is exactly what is checked below.
1400  // Note there are other shapes that apply (Pred may not be a single block,
1401  // but they all fit this general pattern.)
1403 
1404  // Make sure that a hot successor doesn't have a globally more
1405  // important predecessor.
1406  BlockFrequency CandidateEdgeFreq = MBFI->getBlockFreq(BB) * RealSuccProb;
1407  bool BadCFGConflict = false;
1408 
1409  for (MachineBasicBlock *Pred : Succ->predecessors()) {
1410  if (Pred == Succ || BlockToChain[Pred] == &SuccChain ||
1411  (BlockFilter && !BlockFilter->count(Pred)) ||
1412  BlockToChain[Pred] == &Chain ||
1413  // This check is redundant except for look ahead. This function is
1414  // called for lookahead by isProfitableToTailDup when BB hasn't been
1415  // placed yet.
1416  (Pred == BB))
1417  continue;
1418  // Do backward checking.
1419  // For all cases above, we need a backward checking to filter out edges that
1420  // are not 'strongly' biased.
1421  // BB Pred
1422  // \ /
1423  // Succ
1424  // We select edge BB->Succ if
1425  // freq(BB->Succ) > freq(Succ) * HotProb
1426  // i.e. freq(BB->Succ) > freq(BB->Succ) * HotProb + freq(Pred->Succ) *
1427  // HotProb
1428  // i.e. freq((BB->Succ) * (1 - HotProb) > freq(Pred->Succ) * HotProb
1429  // Case 1 is covered too, because the first equation reduces to:
1430  // prob(BB->Succ) > HotProb. (freq(Succ) = freq(BB) for a triangle)
1431  BlockFrequency PredEdgeFreq =
1432  MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Succ);
1433  if (PredEdgeFreq * HotProb >= CandidateEdgeFreq * HotProb.getCompl()) {
1434  BadCFGConflict = true;
1435  break;
1436  }
1437  }
1438 
1439  if (BadCFGConflict) {
1440  LLVM_DEBUG(dbgs() << " Not a candidate: " << getBlockName(Succ) << " -> "
1441  << SuccProb << " (prob) (non-cold CFG conflict)\n");
1442  return true;
1443  }
1444 
1445  return false;
1446 }
1447 
1448 /// Select the best successor for a block.
1449 ///
1450 /// This looks across all successors of a particular block and attempts to
1451 /// select the "best" one to be the layout successor. It only considers direct
1452 /// successors which also pass the block filter. It will attempt to avoid
1453 /// breaking CFG structure, but cave and break such structures in the case of
1454 /// very hot successor edges.
1455 ///
1456 /// \returns The best successor block found, or null if none are viable, along
1457 /// with a boolean indicating if tail duplication is necessary.
1458 MachineBlockPlacement::BlockAndTailDupResult
1459 MachineBlockPlacement::selectBestSuccessor(
1460  const MachineBasicBlock *BB, const BlockChain &Chain,
1461  const BlockFilterSet *BlockFilter) {
1462  const BranchProbability HotProb(StaticLikelyProb, 100);
1463 
1464  BlockAndTailDupResult BestSucc = { nullptr, false };
1465  auto BestProb = BranchProbability::getZero();
1466 
1468  auto AdjustedSumProb =
1469  collectViableSuccessors(BB, Chain, BlockFilter, Successors);
1470 
1471  LLVM_DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB)
1472  << "\n");
1473 
1474  // if we already precomputed the best successor for BB, return that if still
1475  // applicable.
1476  auto FoundEdge = ComputedEdges.find(BB);
1477  if (FoundEdge != ComputedEdges.end()) {
1478  MachineBasicBlock *Succ = FoundEdge->second.BB;
1479  ComputedEdges.erase(FoundEdge);
1480  BlockChain *SuccChain = BlockToChain[Succ];
1481  if (BB->isSuccessor(Succ) && (!BlockFilter || BlockFilter->count(Succ)) &&
1482  SuccChain != &Chain && Succ == *SuccChain->begin())
1483  return FoundEdge->second;
1484  }
1485 
1486  // if BB is part of a trellis, Use the trellis to determine the optimal
1487  // fallthrough edges
1488  if (isTrellis(BB, Successors, Chain, BlockFilter))
1489  return getBestTrellisSuccessor(BB, Successors, AdjustedSumProb, Chain,
1490  BlockFilter);
1491 
1492  // For blocks with CFG violations, we may be able to lay them out anyway with
1493  // tail-duplication. We keep this vector so we can perform the probability
1494  // calculations the minimum number of times.
1496  DupCandidates;
1497  for (MachineBasicBlock *Succ : Successors) {
1498  auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);
1499  BranchProbability SuccProb =
1500  getAdjustedProbability(RealSuccProb, AdjustedSumProb);
1501 
1502  BlockChain &SuccChain = *BlockToChain[Succ];
1503  // Skip the edge \c BB->Succ if block \c Succ has a better layout
1504  // predecessor that yields lower global cost.
1505  if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb,
1506  Chain, BlockFilter)) {
1507  // If tail duplication would make Succ profitable, place it.
1508  if (allowTailDupPlacement() && shouldTailDuplicate(Succ))
1509  DupCandidates.push_back(std::make_tuple(SuccProb, Succ));
1510  continue;
1511  }
1512 
1513  LLVM_DEBUG(
1514  dbgs() << " Candidate: " << getBlockName(Succ)
1515  << ", probability: " << SuccProb
1516  << (SuccChain.UnscheduledPredecessors != 0 ? " (CFG break)" : "")
1517  << "\n");
1518 
1519  if (BestSucc.BB && BestProb >= SuccProb) {
1520  LLVM_DEBUG(dbgs() << " Not the best candidate, continuing\n");
1521  continue;
1522  }
1523 
1524  LLVM_DEBUG(dbgs() << " Setting it as best candidate\n");
1525  BestSucc.BB = Succ;
1526  BestProb = SuccProb;
1527  }
1528  // Handle the tail duplication candidates in order of decreasing probability.
1529  // Stop at the first one that is profitable. Also stop if they are less
1530  // profitable than BestSucc. Position is important because we preserve it and
1531  // prefer first best match. Here we aren't comparing in order, so we capture
1532  // the position instead.
1533  llvm::stable_sort(DupCandidates,
1534  [](std::tuple<BranchProbability, MachineBasicBlock *> L,
1535  std::tuple<BranchProbability, MachineBasicBlock *> R) {
1536  return std::get<0>(L) > std::get<0>(R);
1537  });
1538  for (auto &Tup : DupCandidates) {
1539  BranchProbability DupProb;
1540  MachineBasicBlock *Succ;
1541  std::tie(DupProb, Succ) = Tup;
1542  if (DupProb < BestProb)
1543  break;
1544  if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter)
1545  && (isProfitableToTailDup(BB, Succ, BestProb, Chain, BlockFilter))) {
1546  LLVM_DEBUG(dbgs() << " Candidate: " << getBlockName(Succ)
1547  << ", probability: " << DupProb
1548  << " (Tail Duplicate)\n");
1549  BestSucc.BB = Succ;
1550  BestSucc.ShouldTailDup = true;
1551  break;
1552  }
1553  }
1554 
1555  if (BestSucc.BB)
1556  LLVM_DEBUG(dbgs() << " Selected: " << getBlockName(BestSucc.BB) << "\n");
1557 
1558  return BestSucc;
1559 }
1560 
1561 /// Select the best block from a worklist.
1562 ///
1563 /// This looks through the provided worklist as a list of candidate basic
1564 /// blocks and select the most profitable one to place. The definition of
1565 /// profitable only really makes sense in the context of a loop. This returns
1566 /// the most frequently visited block in the worklist, which in the case of
1567 /// a loop, is the one most desirable to be physically close to the rest of the
1568 /// loop body in order to improve i-cache behavior.
1569 ///
1570 /// \returns The best block found, or null if none are viable.
1571 MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
1572  const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) {
1573  // Once we need to walk the worklist looking for a candidate, cleanup the
1574  // worklist of already placed entries.
1575  // FIXME: If this shows up on profiles, it could be folded (at the cost of
1576  // some code complexity) into the loop below.
1577  WorkList.erase(llvm::remove_if(WorkList,
1578  [&](MachineBasicBlock *BB) {
1579  return BlockToChain.lookup(BB) == &Chain;
1580  }),
1581  WorkList.end());
1582 
1583  if (WorkList.empty())
1584  return nullptr;
1585 
1586  bool IsEHPad = WorkList[0]->isEHPad();
1587 
1588  MachineBasicBlock *BestBlock = nullptr;
1589  BlockFrequency BestFreq;
1590  for (MachineBasicBlock *MBB : WorkList) {
1591  assert(MBB->isEHPad() == IsEHPad &&
1592  "EHPad mismatch between block and work list.");
1593 
1594  BlockChain &SuccChain = *BlockToChain[MBB];
1595  if (&SuccChain == &Chain)
1596  continue;
1597 
1598  assert(SuccChain.UnscheduledPredecessors == 0 &&
1599  "Found CFG-violating block");
1600 
1601  BlockFrequency CandidateFreq = MBFI->getBlockFreq(MBB);
1602  LLVM_DEBUG(dbgs() << " " << getBlockName(MBB) << " -> ";
1603  MBFI->printBlockFreq(dbgs(), CandidateFreq) << " (freq)\n");
1604 
1605  // For ehpad, we layout the least probable first as to avoid jumping back
1606  // from least probable landingpads to more probable ones.
1607  //
1608  // FIXME: Using probability is probably (!) not the best way to achieve
1609  // this. We should probably have a more principled approach to layout
1610  // cleanup code.
1611  //
1612  // The goal is to get:
1613  //
1614  // +--------------------------+
1615  // | V
1616  // InnerLp -> InnerCleanup OuterLp -> OuterCleanup -> Resume
1617  //
1618  // Rather than:
1619  //
1620  // +-------------------------------------+
1621  // V |
1622  // OuterLp -> OuterCleanup -> Resume InnerLp -> InnerCleanup
1623  if (BestBlock && (IsEHPad ^ (BestFreq >= CandidateFreq)))
1624  continue;
1625 
1626  BestBlock = MBB;
1627  BestFreq = CandidateFreq;
1628  }
1629 
1630  return BestBlock;
1631 }
1632 
1633 /// Retrieve the first unplaced basic block.
1634 ///
1635 /// This routine is called when we are unable to use the CFG to walk through
1636 /// all of the basic blocks and form a chain due to unnatural loops in the CFG.
1637 /// We walk through the function's blocks in order, starting from the
1638 /// LastUnplacedBlockIt. We update this iterator on each call to avoid
1639 /// re-scanning the entire sequence on repeated calls to this routine.
1640 MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
1641  const BlockChain &PlacedChain,
1642  MachineFunction::iterator &PrevUnplacedBlockIt,
1643  const BlockFilterSet *BlockFilter) {
1644  for (MachineFunction::iterator I = PrevUnplacedBlockIt, E = F->end(); I != E;
1645  ++I) {
1646  if (BlockFilter && !BlockFilter->count(&*I))
1647  continue;
1648  if (BlockToChain[&*I] != &PlacedChain) {
1649  PrevUnplacedBlockIt = I;
1650  // Now select the head of the chain to which the unplaced block belongs
1651  // as the block to place. This will force the entire chain to be placed,
1652  // and satisfies the requirements of merging chains.
1653  return *BlockToChain[&*I]->begin();
1654  }
1655  }
1656  return nullptr;
1657 }
1658 
1659 void MachineBlockPlacement::fillWorkLists(
1660  const MachineBasicBlock *MBB,
1661  SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
1662  const BlockFilterSet *BlockFilter = nullptr) {
1663  BlockChain &Chain = *BlockToChain[MBB];
1664  if (!UpdatedPreds.insert(&Chain).second)
1665  return;
1666 
1667  assert(
1668  Chain.UnscheduledPredecessors == 0 &&
1669  "Attempting to place block with unscheduled predecessors in worklist.");
1670  for (MachineBasicBlock *ChainBB : Chain) {
1671  assert(BlockToChain[ChainBB] == &Chain &&
1672  "Block in chain doesn't match BlockToChain map.");
1673  for (MachineBasicBlock *Pred : ChainBB->predecessors()) {
1674  if (BlockFilter && !BlockFilter->count(Pred))
1675  continue;
1676  if (BlockToChain[Pred] == &Chain)
1677  continue;
1678  ++Chain.UnscheduledPredecessors;
1679  }
1680  }
1681 
1682  if (Chain.UnscheduledPredecessors != 0)
1683  return;
1684 
1685  MachineBasicBlock *BB = *Chain.begin();
1686  if (BB->isEHPad())
1687  EHPadWorkList.push_back(BB);
1688  else
1689  BlockWorkList.push_back(BB);
1690 }
1691 
1692 void MachineBlockPlacement::buildChain(
1693  const MachineBasicBlock *HeadBB, BlockChain &Chain,
1694  BlockFilterSet *BlockFilter) {
1695  assert(HeadBB && "BB must not be null.\n");
1696  assert(BlockToChain[HeadBB] == &Chain && "BlockToChainMap mis-match.\n");
1697  MachineFunction::iterator PrevUnplacedBlockIt = F->begin();
1698 
1699  const MachineBasicBlock *LoopHeaderBB = HeadBB;
1700  markChainSuccessors(Chain, LoopHeaderBB, BlockFilter);
1701  MachineBasicBlock *BB = *std::prev(Chain.end());
1702  while (true) {
1703  assert(BB && "null block found at end of chain in loop.");
1704  assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match in loop.");
1705  assert(*std::prev(Chain.end()) == BB && "BB Not found at end of chain.");
1706 
1707 
1708  // Look for the best viable successor if there is one to place immediately
1709  // after this block.
1710  auto Result = selectBestSuccessor(BB, Chain, BlockFilter);
1711  MachineBasicBlock* BestSucc = Result.BB;
1712  bool ShouldTailDup = Result.ShouldTailDup;
1713  if (allowTailDupPlacement())
1714  ShouldTailDup |= (BestSucc && shouldTailDuplicate(BestSucc));
1715 
1716  // If an immediate successor isn't available, look for the best viable
1717  // block among those we've identified as not violating the loop's CFG at
1718  // this point. This won't be a fallthrough, but it will increase locality.
1719  if (!BestSucc)
1720  BestSucc = selectBestCandidateBlock(Chain, BlockWorkList);
1721  if (!BestSucc)
1722  BestSucc = selectBestCandidateBlock(Chain, EHPadWorkList);
1723 
1724  if (!BestSucc) {
1725  BestSucc = getFirstUnplacedBlock(Chain, PrevUnplacedBlockIt, BlockFilter);
1726  if (!BestSucc)
1727  break;
1728 
1729  LLVM_DEBUG(dbgs() << "Unnatural loop CFG detected, forcibly merging the "
1730  "layout successor until the CFG reduces\n");
1731  }
1732 
1733  // Placement may have changed tail duplication opportunities.
1734  // Check for that now.
1735  if (allowTailDupPlacement() && BestSucc && ShouldTailDup) {
1736  // If the chosen successor was duplicated into all its predecessors,
1737  // don't bother laying it out, just go round the loop again with BB as
1738  // the chain end.
1739  if (repeatedlyTailDuplicateBlock(BestSucc, BB, LoopHeaderBB, Chain,
1740  BlockFilter, PrevUnplacedBlockIt))
1741  continue;
1742  }
1743 
1744  // Place this block, updating the datastructures to reflect its placement.
1745  BlockChain &SuccChain = *BlockToChain[BestSucc];
1746  // Zero out UnscheduledPredecessors for the successor we're about to merge in case
1747  // we selected a successor that didn't fit naturally into the CFG.
1748  SuccChain.UnscheduledPredecessors = 0;
1749  LLVM_DEBUG(dbgs() << "Merging from " << getBlockName(BB) << " to "
1750  << getBlockName(BestSucc) << "\n");
1751  markChainSuccessors(SuccChain, LoopHeaderBB, BlockFilter);
1752  Chain.merge(BestSucc, &SuccChain);
1753  BB = *std::prev(Chain.end());
1754  }
1755 
1756  LLVM_DEBUG(dbgs() << "Finished forming chain for header block "
1757  << getBlockName(*Chain.begin()) << "\n");
1758 }
1759 
1760 // If bottom of block BB has only one successor OldTop, in most cases it is
1761 // profitable to move it before OldTop, except the following case:
1762 //
1763 // -->OldTop<-
1764 // | . |
1765 // | . |
1766 // | . |
1767 // ---Pred |
1768 // | |
1769 // BB-----
1770 //
1771 // If BB is moved before OldTop, Pred needs a taken branch to BB, and it can't
1772 // layout the other successor below it, so it can't reduce taken branch.
1773 // In this case we keep its original layout.
1774 bool
1775 MachineBlockPlacement::canMoveBottomBlockToTop(
1776  const MachineBasicBlock *BottomBlock,
1777  const MachineBasicBlock *OldTop) {
1778  if (BottomBlock->pred_size() != 1)
1779  return true;
1780  MachineBasicBlock *Pred = *BottomBlock->pred_begin();
1781  if (Pred->succ_size() != 2)
1782  return true;
1783 
1784  MachineBasicBlock *OtherBB = *Pred->succ_begin();
1785  if (OtherBB == BottomBlock)
1786  OtherBB = *Pred->succ_rbegin();
1787  if (OtherBB == OldTop)
1788  return false;
1789 
1790  return true;
1791 }
1792 
1793 /// Find the best loop top block for layout.
1794 ///
1795 /// Look for a block which is strictly better than the loop header for laying
1796 /// out at the top of the loop. This looks for one and only one pattern:
1797 /// a latch block with no conditional exit. This block will cause a conditional
1798 /// jump around it or will be the bottom of the loop if we lay it out in place,
1799 /// but if it it doesn't end up at the bottom of the loop for any reason,
1800 /// rotation alone won't fix it. Because such a block will always result in an
1801 /// unconditional jump (for the backedge) rotating it in front of the loop
1802 /// header is always profitable.
1804 MachineBlockPlacement::findBestLoopTop(const MachineLoop &L,
1805  const BlockFilterSet &LoopBlockSet) {
1806  // Placing the latch block before the header may introduce an extra branch
1807  // that skips this block the first time the loop is executed, which we want
1808  // to avoid when optimising for size.
1809  // FIXME: in theory there is a case that does not introduce a new branch,
1810  // i.e. when the layout predecessor does not fallthrough to the loop header.
1811  // In practice this never happens though: there always seems to be a preheader
1812  // that can fallthrough and that is also placed before the header.
1813  if (F->getFunction().hasOptSize())
1814  return L.getHeader();
1815 
1816  // Check that the header hasn't been fused with a preheader block due to
1817  // crazy branches. If it has, we need to start with the header at the top to
1818  // prevent pulling the preheader into the loop body.
1819  BlockChain &HeaderChain = *BlockToChain[L.getHeader()];
1820  if (!LoopBlockSet.count(*HeaderChain.begin()))
1821  return L.getHeader();
1822 
1823  LLVM_DEBUG(dbgs() << "Finding best loop top for: "
1824  << getBlockName(L.getHeader()) << "\n");
1825 
1826  BlockFrequency BestPredFreq;
1827  MachineBasicBlock *BestPred = nullptr;
1828  for (MachineBasicBlock *Pred : L.getHeader()->predecessors()) {
1829  if (!LoopBlockSet.count(Pred))
1830  continue;
1831  LLVM_DEBUG(dbgs() << " header pred: " << getBlockName(Pred) << ", has "
1832  << Pred->succ_size() << " successors, ";
1833  MBFI->printBlockFreq(dbgs(), Pred) << " freq\n");
1834  if (Pred->succ_size() > 1)
1835  continue;
1836 
1837  if (!canMoveBottomBlockToTop(Pred, L.getHeader()))
1838  continue;
1839 
1840  BlockFrequency PredFreq = MBFI->getBlockFreq(Pred);
1841  if (!BestPred || PredFreq > BestPredFreq ||
1842  (!(PredFreq < BestPredFreq) &&
1843  Pred->isLayoutSuccessor(L.getHeader()))) {
1844  BestPred = Pred;
1845  BestPredFreq = PredFreq;
1846  }
1847  }
1848 
1849  // If no direct predecessor is fine, just use the loop header.
1850  if (!BestPred) {
1851  LLVM_DEBUG(dbgs() << " final top unchanged\n");
1852  return L.getHeader();
1853  }
1854 
1855  // Walk backwards through any straight line of predecessors.
1856  while (BestPred->pred_size() == 1 &&
1857  (*BestPred->pred_begin())->succ_size() == 1 &&
1858  *BestPred->pred_begin() != L.getHeader())
1859  BestPred = *BestPred->pred_begin();
1860 
1861  LLVM_DEBUG(dbgs() << " final top: " << getBlockName(BestPred) << "\n");
1862  return BestPred;
1863 }
1864 
1865 /// Find the best loop exiting block for layout.
1866 ///
1867 /// This routine implements the logic to analyze the loop looking for the best
1868 /// block to layout at the top of the loop. Typically this is done to maximize
1869 /// fallthrough opportunities.
1871 MachineBlockPlacement::findBestLoopExit(const MachineLoop &L,
1872  const BlockFilterSet &LoopBlockSet) {
1873  // We don't want to layout the loop linearly in all cases. If the loop header
1874  // is just a normal basic block in the loop, we want to look for what block
1875  // within the loop is the best one to layout at the top. However, if the loop
1876  // header has be pre-merged into a chain due to predecessors not having
1877  // analyzable branches, *and* the predecessor it is merged with is *not* part
1878  // of the loop, rotating the header into the middle of the loop will create
1879  // a non-contiguous range of blocks which is Very Bad. So start with the
1880  // header and only rotate if safe.
1881  BlockChain &HeaderChain = *BlockToChain[L.getHeader()];
1882  if (!LoopBlockSet.count(*HeaderChain.begin()))
1883  return nullptr;
1884 
1885  BlockFrequency BestExitEdgeFreq;
1886  unsigned BestExitLoopDepth = 0;
1887  MachineBasicBlock *ExitingBB = nullptr;
1888  // If there are exits to outer loops, loop rotation can severely limit
1889  // fallthrough opportunities unless it selects such an exit. Keep a set of
1890  // blocks where rotating to exit with that block will reach an outer loop.
1891  SmallPtrSet<MachineBasicBlock *, 4> BlocksExitingToOuterLoop;
1892 
1893  LLVM_DEBUG(dbgs() << "Finding best loop exit for: "
1894  << getBlockName(L.getHeader()) << "\n");
1895  for (MachineBasicBlock *MBB : L.getBlocks()) {
1896  BlockChain &Chain = *BlockToChain[MBB];
1897  // Ensure that this block is at the end of a chain; otherwise it could be
1898  // mid-way through an inner loop or a successor of an unanalyzable branch.
1899  if (MBB != *std::prev(Chain.end()))
1900  continue;
1901 
1902  // Now walk the successors. We need to establish whether this has a viable
1903  // exiting successor and whether it has a viable non-exiting successor.
1904  // We store the old exiting state and restore it if a viable looping
1905  // successor isn't found.
1906  MachineBasicBlock *OldExitingBB = ExitingBB;
1907  BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq;
1908  bool HasLoopingSucc = false;
1909  for (MachineBasicBlock *Succ : MBB->successors()) {
1910  if (Succ->isEHPad())
1911  continue;
1912  if (Succ == MBB)
1913  continue;
1914  BlockChain &SuccChain = *BlockToChain[Succ];
1915  // Don't split chains, either this chain or the successor's chain.
1916  if (&Chain == &SuccChain) {
1917  LLVM_DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
1918  << getBlockName(Succ) << " (chain conflict)\n");
1919  continue;
1920  }
1921 
1922  auto SuccProb = MBPI->getEdgeProbability(MBB, Succ);
1923  if (LoopBlockSet.count(Succ)) {
1924  LLVM_DEBUG(dbgs() << " looping: " << getBlockName(MBB) << " -> "
1925  << getBlockName(Succ) << " (" << SuccProb << ")\n");
1926  HasLoopingSucc = true;
1927  continue;
1928  }
1929 
1930  unsigned SuccLoopDepth = 0;
1931  if (MachineLoop *ExitLoop = MLI->getLoopFor(Succ)) {
1932  SuccLoopDepth = ExitLoop->getLoopDepth();
1933  if (ExitLoop->contains(&L))
1934  BlocksExitingToOuterLoop.insert(MBB);
1935  }
1936 
1937  BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(MBB) * SuccProb;
1938  LLVM_DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
1939  << getBlockName(Succ) << " [L:" << SuccLoopDepth
1940  << "] (";
1941  MBFI->printBlockFreq(dbgs(), ExitEdgeFreq) << ")\n");
1942  // Note that we bias this toward an existing layout successor to retain
1943  // incoming order in the absence of better information. The exit must have
1944  // a frequency higher than the current exit before we consider breaking
1945  // the layout.
1946  BranchProbability Bias(100 - ExitBlockBias, 100);
1947  if (!ExitingBB || SuccLoopDepth > BestExitLoopDepth ||
1948  ExitEdgeFreq > BestExitEdgeFreq ||
1949  (MBB->isLayoutSuccessor(Succ) &&
1950  !(ExitEdgeFreq < BestExitEdgeFreq * Bias))) {
1951  BestExitEdgeFreq = ExitEdgeFreq;
1952  ExitingBB = MBB;
1953  }
1954  }
1955 
1956  if (!HasLoopingSucc) {
1957  // Restore the old exiting state, no viable looping successor was found.
1958  ExitingBB = OldExitingBB;
1959  BestExitEdgeFreq = OldBestExitEdgeFreq;
1960  }
1961  }
1962  // Without a candidate exiting block or with only a single block in the
1963  // loop, just use the loop header to layout the loop.
1964  if (!ExitingBB) {
1965  LLVM_DEBUG(
1966  dbgs() << " No other candidate exit blocks, using loop header\n");
1967  return nullptr;
1968  }
1969  if (L.getNumBlocks() == 1) {
1970  LLVM_DEBUG(dbgs() << " Loop has 1 block, using loop header as exit\n");
1971  return nullptr;
1972  }
1973 
1974  // Also, if we have exit blocks which lead to outer loops but didn't select
1975  // one of them as the exiting block we are rotating toward, disable loop
1976  // rotation altogether.
1977  if (!BlocksExitingToOuterLoop.empty() &&
1978  !BlocksExitingToOuterLoop.count(ExitingBB))
1979  return nullptr;
1980 
1981  LLVM_DEBUG(dbgs() << " Best exiting block: " << getBlockName(ExitingBB)
1982  << "\n");
1983  return ExitingBB;
1984 }
1985 
1986 /// Check if there is a fallthrough to loop header Top.
1987 ///
1988 /// 1. Look for a Pred that can be layout before Top.
1989 /// 2. Check if Top is the most possible successor of Pred.
1990 bool
1991 MachineBlockPlacement::hasViableTopFallthrough(
1992  const MachineBasicBlock *Top,
1993  const BlockFilterSet &LoopBlockSet) {
1994  for (MachineBasicBlock *Pred : Top->predecessors()) {
1995  BlockChain *PredChain = BlockToChain[Pred];
1996  if (!LoopBlockSet.count(Pred) &&
1997  (!PredChain || Pred == *std::prev(PredChain->end()))) {
1998  // Found a Pred block can be placed before Top.
1999  // Check if Top is the best successor of Pred.
2000  auto TopProb = MBPI->getEdgeProbability(Pred, Top);
2001  bool TopOK = true;
2002  for (MachineBasicBlock *Succ : Pred->successors()) {
2003  auto SuccProb = MBPI->getEdgeProbability(Pred, Succ);
2004  BlockChain *SuccChain = BlockToChain[Succ];
2005  // Check if Succ can be placed after Pred.
2006  // Succ should not be in any chain, or it is the head of some chain.
2007  if ((!SuccChain || Succ == *SuccChain->begin()) && SuccProb > TopProb) {
2008  TopOK = false;
2009  break;
2010  }
2011  }
2012  if (TopOK)
2013  return true;
2014  }
2015  }
2016  return false;
2017 }
2018 
2019 /// Attempt to rotate an exiting block to the bottom of the loop.
2020 ///
2021 /// Once we have built a chain, try to rotate it to line up the hot exit block
2022 /// with fallthrough out of the loop if doing so doesn't introduce unnecessary
2023 /// branches. For example, if the loop has fallthrough into its header and out
2024 /// of its bottom already, don't rotate it.
2025 void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
2026  const MachineBasicBlock *ExitingBB,
2027  const BlockFilterSet &LoopBlockSet) {
2028  if (!ExitingBB)
2029  return;
2030 
2031  MachineBasicBlock *Top = *LoopChain.begin();
2032  MachineBasicBlock *Bottom = *std::prev(LoopChain.end());
2033 
2034  // If ExitingBB is already the last one in a chain then nothing to do.
2035  if (Bottom == ExitingBB)
2036  return;
2037 
2038  bool ViableTopFallthrough = hasViableTopFallthrough(Top, LoopBlockSet);
2039 
2040  // If the header has viable fallthrough, check whether the current loop
2041  // bottom is a viable exiting block. If so, bail out as rotating will
2042  // introduce an unnecessary branch.
2043  if (ViableTopFallthrough) {
2044  for (MachineBasicBlock *Succ : Bottom->successors()) {
2045  BlockChain *SuccChain = BlockToChain[Succ];
2046  if (!LoopBlockSet.count(Succ) &&
2047  (!SuccChain || Succ == *SuccChain->begin()))
2048  return;
2049  }
2050  }
2051 
2052  BlockChain::iterator ExitIt = llvm::find(LoopChain, ExitingBB);
2053  if (ExitIt == LoopChain.end())
2054  return;
2055 
2056  // Rotating a loop exit to the bottom when there is a fallthrough to top
2057  // trades the entry fallthrough for an exit fallthrough.
2058  // If there is no bottom->top edge, but the chosen exit block does have
2059  // a fallthrough, we break that fallthrough for nothing in return.
2060 
2061  // Let's consider an example. We have a built chain of basic blocks
2062  // B1, B2, ..., Bn, where Bk is a ExitingBB - chosen exit block.
2063  // By doing a rotation we get
2064  // Bk+1, ..., Bn, B1, ..., Bk
2065  // Break of fallthrough to B1 is compensated by a fallthrough from Bk.
2066  // If we had a fallthrough Bk -> Bk+1 it is broken now.
2067  // It might be compensated by fallthrough Bn -> B1.
2068  // So we have a condition to avoid creation of extra branch by loop rotation.
2069  // All below must be true to avoid loop rotation:
2070  // If there is a fallthrough to top (B1)
2071  // There was fallthrough from chosen exit block (Bk) to next one (Bk+1)
2072  // There is no fallthrough from bottom (Bn) to top (B1).
2073  // Please note that there is no exit fallthrough from Bn because we checked it
2074  // above.
2075  if (ViableTopFallthrough) {
2076  assert(std::next(ExitIt) != LoopChain.end() &&
2077  "Exit should not be last BB");
2078  MachineBasicBlock *NextBlockInChain = *std::next(ExitIt);
2079  if (ExitingBB->isSuccessor(NextBlockInChain))
2080  if (!Bottom->isSuccessor(Top))
2081  return;
2082  }
2083 
2084  LLVM_DEBUG(dbgs() << "Rotating loop to put exit " << getBlockName(ExitingBB)
2085  << " at bottom\n");
2086  std::rotate(LoopChain.begin(), std::next(ExitIt), LoopChain.end());
2087 }
2088 
2089 /// Attempt to rotate a loop based on profile data to reduce branch cost.
2090 ///
2091 /// With profile data, we can determine the cost in terms of missed fall through
2092 /// opportunities when rotating a loop chain and select the best rotation.
2093 /// Basically, there are three kinds of cost to consider for each rotation:
2094 /// 1. The possibly missed fall through edge (if it exists) from BB out of
2095 /// the loop to the loop header.
2096 /// 2. The possibly missed fall through edges (if they exist) from the loop
2097 /// exits to BB out of the loop.
2098 /// 3. The missed fall through edge (if it exists) from the last BB to the
2099 /// first BB in the loop chain.
2100 /// Therefore, the cost for a given rotation is the sum of costs listed above.
2101 /// We select the best rotation with the smallest cost.
2102 void MachineBlockPlacement::rotateLoopWithProfile(
2103  BlockChain &LoopChain, const MachineLoop &L,
2104  const BlockFilterSet &LoopBlockSet) {
2105  auto HeaderBB = L.getHeader();
2106  auto HeaderIter = llvm::find(LoopChain, HeaderBB);
2107  auto RotationPos = LoopChain.end();
2108 
2109  BlockFrequency SmallestRotationCost = BlockFrequency::getMaxFrequency();
2110 
2111  // A utility lambda that scales up a block frequency by dividing it by a
2112  // branch probability which is the reciprocal of the scale.
2113  auto ScaleBlockFrequency = [](BlockFrequency Freq,
2114  unsigned Scale) -> BlockFrequency {
2115  if (Scale == 0)
2116  return 0;
2117  // Use operator / between BlockFrequency and BranchProbability to implement
2118  // saturating multiplication.
2119  return Freq / BranchProbability(1, Scale);
2120  };
2121 
2122  // Compute the cost of the missed fall-through edge to the loop header if the
2123  // chain head is not the loop header. As we only consider natural loops with
2124  // single header, this computation can be done only once.
2125  BlockFrequency HeaderFallThroughCost(0);
2126  for (auto *Pred : HeaderBB->predecessors()) {
2127  BlockChain *PredChain = BlockToChain[Pred];
2128  if (!LoopBlockSet.count(Pred) &&
2129  (!PredChain || Pred == *std::prev(PredChain->end()))) {
2130  auto EdgeFreq =
2131  MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, HeaderBB);
2132  auto FallThruCost = ScaleBlockFrequency(EdgeFreq, MisfetchCost);
2133  // If the predecessor has only an unconditional jump to the header, we
2134  // need to consider the cost of this jump.
2135  if (Pred->succ_size() == 1)
2136  FallThruCost += ScaleBlockFrequency(EdgeFreq, JumpInstCost);
2137  HeaderFallThroughCost = std::max(HeaderFallThroughCost, FallThruCost);
2138  }
2139  }
2140 
2141  // Here we collect all exit blocks in the loop, and for each exit we find out
2142  // its hottest exit edge. For each loop rotation, we define the loop exit cost
2143  // as the sum of frequencies of exit edges we collect here, excluding the exit
2144  // edge from the tail of the loop chain.
2146  for (auto BB : LoopChain) {
2147  auto LargestExitEdgeProb = BranchProbability::getZero();
2148  for (auto *Succ : BB->successors()) {
2149  BlockChain *SuccChain = BlockToChain[Succ];
2150  if (!LoopBlockSet.count(Succ) &&
2151  (!SuccChain || Succ == *SuccChain->begin())) {
2152  auto SuccProb = MBPI->getEdgeProbability(BB, Succ);
2153  LargestExitEdgeProb = std::max(LargestExitEdgeProb, SuccProb);
2154  }
2155  }
2156  if (LargestExitEdgeProb > BranchProbability::getZero()) {
2157  auto ExitFreq = MBFI->getBlockFreq(BB) * LargestExitEdgeProb;
2158  ExitsWithFreq.emplace_back(BB, ExitFreq);
2159  }
2160  }
2161 
2162  // In this loop we iterate every block in the loop chain and calculate the
2163  // cost assuming the block is the head of the loop chain. When the loop ends,
2164  // we should have found the best candidate as the loop chain's head.
2165  for (auto Iter = LoopChain.begin(), TailIter = std::prev(LoopChain.end()),
2166  EndIter = LoopChain.end();
2167  Iter != EndIter; Iter++, TailIter++) {
2168  // TailIter is used to track the tail of the loop chain if the block we are
2169  // checking (pointed by Iter) is the head of the chain.
2170  if (TailIter == LoopChain.end())
2171  TailIter = LoopChain.begin();
2172 
2173  auto TailBB = *TailIter;
2174 
2175  // Calculate the cost by putting this BB to the top.
2176  BlockFrequency Cost = 0;
2177 
2178  // If the current BB is the loop header, we need to take into account the
2179  // cost of the missed fall through edge from outside of the loop to the
2180  // header.
2181  if (Iter != HeaderIter)
2182  Cost += HeaderFallThroughCost;
2183 
2184  // Collect the loop exit cost by summing up frequencies of all exit edges
2185  // except the one from the chain tail.
2186  for (auto &ExitWithFreq : ExitsWithFreq)
2187  if (TailBB != ExitWithFreq.first)
2188  Cost += ExitWithFreq.second;
2189 
2190  // The cost of breaking the once fall-through edge from the tail to the top
2191  // of the loop chain. Here we need to consider three cases:
2192  // 1. If the tail node has only one successor, then we will get an
2193  // additional jmp instruction. So the cost here is (MisfetchCost +
2194  // JumpInstCost) * tail node frequency.
2195  // 2. If the tail node has two successors, then we may still get an
2196  // additional jmp instruction if the layout successor after the loop
2197  // chain is not its CFG successor. Note that the more frequently executed
2198  // jmp instruction will be put ahead of the other one. Assume the
2199  // frequency of those two branches are x and y, where x is the frequency
2200  // of the edge to the chain head, then the cost will be
2201  // (x * MisfetechCost + min(x, y) * JumpInstCost) * tail node frequency.
2202  // 3. If the tail node has more than two successors (this rarely happens),
2203  // we won't consider any additional cost.
2204  if (TailBB->isSuccessor(*Iter)) {
2205  auto TailBBFreq = MBFI->getBlockFreq(TailBB);
2206  if (TailBB->succ_size() == 1)
2207  Cost += ScaleBlockFrequency(TailBBFreq.getFrequency(),
2209  else if (TailBB->succ_size() == 2) {
2210  auto TailToHeadProb = MBPI->getEdgeProbability(TailBB, *Iter);
2211  auto TailToHeadFreq = TailBBFreq * TailToHeadProb;
2212  auto ColderEdgeFreq = TailToHeadProb > BranchProbability(1, 2)
2213  ? TailBBFreq * TailToHeadProb.getCompl()
2214  : TailToHeadFreq;
2215  Cost += ScaleBlockFrequency(TailToHeadFreq, MisfetchCost) +
2216  ScaleBlockFrequency(ColderEdgeFreq, JumpInstCost);
2217  }
2218  }
2219 
2220  LLVM_DEBUG(dbgs() << "The cost of loop rotation by making "
2221  << getBlockName(*Iter)
2222  << " to the top: " << Cost.getFrequency() << "\n");
2223 
2224  if (Cost < SmallestRotationCost) {
2225  SmallestRotationCost = Cost;
2226  RotationPos = Iter;
2227  }
2228  }
2229 
2230  if (RotationPos != LoopChain.end()) {
2231  LLVM_DEBUG(dbgs() << "Rotate loop by making " << getBlockName(*RotationPos)
2232  << " to the top\n");
2233  std::rotate(LoopChain.begin(), RotationPos, LoopChain.end());
2234  }
2235 }
2236 
2237 /// Collect blocks in the given loop that are to be placed.
2238 ///
2239 /// When profile data is available, exclude cold blocks from the returned set;
2240 /// otherwise, collect all blocks in the loop.
2242 MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) {
2243  BlockFilterSet LoopBlockSet;
2244 
2245  // Filter cold blocks off from LoopBlockSet when profile data is available.
2246  // Collect the sum of frequencies of incoming edges to the loop header from
2247  // outside. If we treat the loop as a super block, this is the frequency of
2248  // the loop. Then for each block in the loop, we calculate the ratio between
2249  // its frequency and the frequency of the loop block. When it is too small,
2250  // don't add it to the loop chain. If there are outer loops, then this block
2251  // will be merged into the first outer loop chain for which this block is not
2252  // cold anymore. This needs precise profile data and we only do this when
2253  // profile data is available.
2255  BlockFrequency LoopFreq(0);
2256  for (auto LoopPred : L.getHeader()->predecessors())
2257  if (!L.contains(LoopPred))
2258  LoopFreq += MBFI->getBlockFreq(LoopPred) *
2259  MBPI->getEdgeProbability(LoopPred, L.getHeader());
2260 
2261  for (MachineBasicBlock *LoopBB : L.getBlocks()) {
2262  auto Freq = MBFI->getBlockFreq(LoopBB).getFrequency();
2263  if (Freq == 0 || LoopFreq.getFrequency() / Freq > LoopToColdBlockRatio)
2264  continue;
2265  LoopBlockSet.insert(LoopBB);
2266  }
2267  } else
2268  LoopBlockSet.insert(L.block_begin(), L.block_end());
2269 
2270  return LoopBlockSet;
2271 }
2272 
2273 /// Forms basic block chains from the natural loop structures.
2274 ///
2275 /// These chains are designed to preserve the existing *structure* of the code
2276 /// as much as possible. We can then stitch the chains together in a way which
2277 /// both preserves the topological structure and minimizes taken conditional
2278 /// branches.
2279 void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {
2280  // First recurse through any nested loops, building chains for those inner
2281  // loops.
2282  for (const MachineLoop *InnerLoop : L)
2283  buildLoopChains(*InnerLoop);
2284 
2285  assert(BlockWorkList.empty() &&
2286  "BlockWorkList not empty when starting to build loop chains.");
2287  assert(EHPadWorkList.empty() &&
2288  "EHPadWorkList not empty when starting to build loop chains.");
2289  BlockFilterSet LoopBlockSet = collectLoopBlockSet(L);
2290 
2291  // Check if we have profile data for this function. If yes, we will rotate
2292  // this loop by modeling costs more precisely which requires the profile data
2293  // for better layout.
2294  bool RotateLoopWithProfile =
2297 
2298  // First check to see if there is an obviously preferable top block for the
2299  // loop. This will default to the header, but may end up as one of the
2300  // predecessors to the header if there is one which will result in strictly
2301  // fewer branches in the loop body.
2302  // When we use profile data to rotate the loop, this is unnecessary.
2303  MachineBasicBlock *LoopTop =
2304  RotateLoopWithProfile ? L.getHeader() : findBestLoopTop(L, LoopBlockSet);
2305 
2306  // If we selected just the header for the loop top, look for a potentially
2307  // profitable exit block in the event that rotating the loop can eliminate
2308  // branches by placing an exit edge at the bottom.
2309  //
2310  // Loops are processed innermost to uttermost, make sure we clear
2311  // PreferredLoopExit before processing a new loop.
2312  PreferredLoopExit = nullptr;
2313  if (!RotateLoopWithProfile && LoopTop == L.getHeader())
2314  PreferredLoopExit = findBestLoopExit(L, LoopBlockSet);
2315 
2316  BlockChain &LoopChain = *BlockToChain[LoopTop];
2317 
2318  // FIXME: This is a really lame way of walking the chains in the loop: we
2319  // walk the blocks, and use a set to prevent visiting a particular chain
2320  // twice.
2321  SmallPtrSet<BlockChain *, 4> UpdatedPreds;
2322  assert(LoopChain.UnscheduledPredecessors == 0 &&
2323  "LoopChain should not have unscheduled predecessors.");
2324  UpdatedPreds.insert(&LoopChain);
2325 
2326  for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2327  fillWorkLists(LoopBB, UpdatedPreds, &LoopBlockSet);
2328 
2329  buildChain(LoopTop, LoopChain, &LoopBlockSet);
2330 
2331  if (RotateLoopWithProfile)
2332  rotateLoopWithProfile(LoopChain, L, LoopBlockSet);
2333  else
2334  rotateLoop(LoopChain, PreferredLoopExit, LoopBlockSet);
2335 
2336  LLVM_DEBUG({
2337  // Crash at the end so we get all of the debugging output first.
2338  bool BadLoop = false;
2339  if (LoopChain.UnscheduledPredecessors) {
2340  BadLoop = true;
2341  dbgs() << "Loop chain contains a block without its preds placed!\n"
2342  << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2343  << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n";
2344  }
2345  for (MachineBasicBlock *ChainBB : LoopChain) {
2346  dbgs() << " ... " << getBlockName(ChainBB) << "\n";
2347  if (!LoopBlockSet.remove(ChainBB)) {
2348  // We don't mark the loop as bad here because there are real situations
2349  // where this can occur. For example, with an unanalyzable fallthrough
2350  // from a loop block to a non-loop block or vice versa.
2351  dbgs() << "Loop chain contains a block not contained by the loop!\n"
2352  << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2353  << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2354  << " Bad block: " << getBlockName(ChainBB) << "\n";
2355  }
2356  }
2357 
2358  if (!LoopBlockSet.empty()) {
2359  BadLoop = true;
2360  for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2361  dbgs() << "Loop contains blocks never placed into a chain!\n"
2362  << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2363  << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2364  << " Bad block: " << getBlockName(LoopBB) << "\n";
2365  }
2366  assert(!BadLoop && "Detected problems with the placement of this loop.");
2367  });
2368 
2369  BlockWorkList.clear();
2370  EHPadWorkList.clear();
2371 }
2372 
2373 void MachineBlockPlacement::buildCFGChains() {
2374  // Ensure that every BB in the function has an associated chain to simplify
2375  // the assumptions of the remaining algorithm.
2376  SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch.
2377  for (MachineFunction::iterator FI = F->begin(), FE = F->end(); FI != FE;
2378  ++FI) {
2379  MachineBasicBlock *BB = &*FI;
2380  BlockChain *Chain =
2381  new (ChainAllocator.Allocate()) BlockChain(BlockToChain, BB);
2382  // Also, merge any blocks which we cannot reason about and must preserve
2383  // the exact fallthrough behavior for.
2384  while (true) {
2385  Cond.clear();
2386  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
2387  if (!TII->analyzeBranch(*BB, TBB, FBB, Cond) || !FI->canFallThrough())
2388  break;
2389 
2390  MachineFunction::iterator NextFI = std::next(FI);
2391  MachineBasicBlock *NextBB = &*NextFI;
2392  // Ensure that the layout successor is a viable block, as we know that
2393  // fallthrough is a possibility.
2394  assert(NextFI != FE && "Can't fallthrough past the last block.");
2395  LLVM_DEBUG(dbgs() << "Pre-merging due to unanalyzable fallthrough: "
2396  << getBlockName(BB) << " -> " << getBlockName(NextBB)
2397  << "\n");
2398  Chain->merge(NextBB, nullptr);
2399 #ifndef NDEBUG
2400  BlocksWithUnanalyzableExits.insert(&*BB);
2401 #endif
2402  FI = NextFI;
2403  BB = NextBB;
2404  }
2405  }
2406 
2407  // Build any loop-based chains.
2408  PreferredLoopExit = nullptr;
2409  for (MachineLoop *L : *MLI)
2410  buildLoopChains(*L);
2411 
2412  assert(BlockWorkList.empty() &&
2413  "BlockWorkList should be empty before building final chain.");
2414  assert(EHPadWorkList.empty() &&
2415  "EHPadWorkList should be empty before building final chain.");
2416 
2417  SmallPtrSet<BlockChain *, 4> UpdatedPreds;
2418  for (MachineBasicBlock &MBB : *F)
2419  fillWorkLists(&MBB, UpdatedPreds);
2420 
2421  BlockChain &FunctionChain = *BlockToChain[&F->front()];
2422  buildChain(&F->front(), FunctionChain);
2423 
2424 #ifndef NDEBUG
2425  using FunctionBlockSetType = SmallPtrSet<MachineBasicBlock *, 16>;
2426 #endif
2427  LLVM_DEBUG({
2428  // Crash at the end so we get all of the debugging output first.
2429  bool BadFunc = false;
2430  FunctionBlockSetType FunctionBlockSet;
2431  for (MachineBasicBlock &MBB : *F)
2432  FunctionBlockSet.insert(&MBB);
2433 
2434  for (MachineBasicBlock *ChainBB : FunctionChain)
2435  if (!FunctionBlockSet.erase(ChainBB)) {
2436  BadFunc = true;
2437  dbgs() << "Function chain contains a block not in the function!\n"
2438  << " Bad block: " << getBlockName(ChainBB) << "\n";
2439  }
2440 
2441  if (!FunctionBlockSet.empty()) {
2442  BadFunc = true;
2443  for (MachineBasicBlock *RemainingBB : FunctionBlockSet)
2444  dbgs() << "Function contains blocks never placed into a chain!\n"
2445  << " Bad block: " << getBlockName(RemainingBB) << "\n";
2446  }
2447  assert(!BadFunc && "Detected problems with the block placement.");
2448  });
2449 
2450  // Splice the blocks into place.
2451  MachineFunction::iterator InsertPos = F->begin();
2452  LLVM_DEBUG(dbgs() << "[MBP] Function: " << F->getName() << "\n");
2453  for (MachineBasicBlock *ChainBB : FunctionChain) {
2454  LLVM_DEBUG(dbgs() << (ChainBB == *FunctionChain.begin() ? "Placing chain "
2455  : " ... ")
2456  << getBlockName(ChainBB) << "\n");
2457  if (InsertPos != MachineFunction::iterator(ChainBB))
2458  F->splice(InsertPos, ChainBB);
2459  else
2460  ++InsertPos;
2461 
2462  // Update the terminator of the previous block.
2463  if (ChainBB == *FunctionChain.begin())
2464  continue;
2465  MachineBasicBlock *PrevBB = &*std::prev(MachineFunction::iterator(ChainBB));
2466 
2467  // FIXME: It would be awesome of updateTerminator would just return rather
2468  // than assert when the branch cannot be analyzed in order to remove this
2469  // boiler plate.
2470  Cond.clear();
2471  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
2472 
2473 #ifndef NDEBUG
2474  if (!BlocksWithUnanalyzableExits.count(PrevBB)) {
2475  // Given the exact block placement we chose, we may actually not _need_ to
2476  // be able to edit PrevBB's terminator sequence, but not being _able_ to
2477  // do that at this point is a bug.
2478  assert((!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond) ||
2479  !PrevBB->canFallThrough()) &&
2480  "Unexpected block with un-analyzable fallthrough!");
2481  Cond.clear();
2482  TBB = FBB = nullptr;
2483  }
2484 #endif
2485 
2486  // The "PrevBB" is not yet updated to reflect current code layout, so,
2487  // o. it may fall-through to a block without explicit "goto" instruction
2488  // before layout, and no longer fall-through it after layout; or
2489  // o. just opposite.
2490  //
2491  // analyzeBranch() may return erroneous value for FBB when these two
2492  // situations take place. For the first scenario FBB is mistakenly set NULL;
2493  // for the 2nd scenario, the FBB, which is expected to be NULL, is
2494  // mistakenly pointing to "*BI".
2495  // Thus, if the future change needs to use FBB before the layout is set, it
2496  // has to correct FBB first by using the code similar to the following:
2497  //
2498  // if (!Cond.empty() && (!FBB || FBB == ChainBB)) {
2499  // PrevBB->updateTerminator();
2500  // Cond.clear();
2501  // TBB = FBB = nullptr;
2502  // if (TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) {
2503  // // FIXME: This should never take place.
2504  // TBB = FBB = nullptr;
2505  // }
2506  // }
2507  if (!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond))
2508  PrevBB->updateTerminator();
2509  }
2510 
2511  // Fixup the last block.
2512  Cond.clear();
2513  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
2514  if (!TII->analyzeBranch(F->back(), TBB, FBB, Cond))
2515  F->back().updateTerminator();
2516 
2517  BlockWorkList.clear();
2518  EHPadWorkList.clear();
2519 }
2520 
2521 void MachineBlockPlacement::optimizeBranches() {
2522  BlockChain &FunctionChain = *BlockToChain[&F->front()];
2523  SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch.
2524 
2525  // Now that all the basic blocks in the chain have the proper layout,
2526  // make a final call to AnalyzeBranch with AllowModify set.
2527  // Indeed, the target may be able to optimize the branches in a way we
2528  // cannot because all branches may not be analyzable.
2529  // E.g., the target may be able to remove an unconditional branch to
2530  // a fallthrough when it occurs after predicated terminators.
2531  for (MachineBasicBlock *ChainBB : FunctionChain) {
2532  Cond.clear();
2533  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
2534  if (!TII->analyzeBranch(*ChainBB, TBB, FBB, Cond, /*AllowModify*/ true)) {
2535  // If PrevBB has a two-way branch, try to re-order the branches
2536  // such that we branch to the successor with higher probability first.
2537  if (TBB && !Cond.empty() && FBB &&
2538  MBPI->getEdgeProbability(ChainBB, FBB) >
2539  MBPI->getEdgeProbability(ChainBB, TBB) &&
2540  !TII->reverseBranchCondition(Cond)) {
2541  LLVM_DEBUG(dbgs() << "Reverse order of the two branches: "
2542  << getBlockName(ChainBB) << "\n");
2543  LLVM_DEBUG(dbgs() << " Edge probability: "
2544  << MBPI->getEdgeProbability(ChainBB, FBB) << " vs "
2545  << MBPI->getEdgeProbability(ChainBB, TBB) << "\n");
2546  DebugLoc dl; // FIXME: this is nowhere
2547  TII->removeBranch(*ChainBB);
2548  TII->insertBranch(*ChainBB, FBB, TBB, Cond, dl);
2549  ChainBB->updateTerminator();
2550  }
2551  }
2552  }
2553 }
2554 
2555 void MachineBlockPlacement::alignBlocks() {
2556  // Walk through the backedges of the function now that we have fully laid out
2557  // the basic blocks and align the destination of each backedge. We don't rely
2558  // exclusively on the loop info here so that we can align backedges in
2559  // unnatural CFGs and backedges that were introduced purely because of the
2560  // loop rotations done during this layout pass.
2561  if (F->getFunction().hasMinSize() ||
2562  (F->getFunction().hasOptSize() && !TLI->alignLoopsWithOptSize()))
2563  return;
2564  BlockChain &FunctionChain = *BlockToChain[&F->front()];
2565  if (FunctionChain.begin() == FunctionChain.end())
2566  return; // Empty chain.
2567 
2568  const BranchProbability ColdProb(1, 5); // 20%
2569  BlockFrequency EntryFreq = MBFI->getBlockFreq(&F->front());
2570  BlockFrequency WeightedEntryFreq = EntryFreq * ColdProb;
2571  for (MachineBasicBlock *ChainBB : FunctionChain) {
2572  if (ChainBB == *FunctionChain.begin())
2573  continue;
2574 
2575  // Don't align non-looping basic blocks. These are unlikely to execute
2576  // enough times to matter in practice. Note that we'll still handle
2577  // unnatural CFGs inside of a natural outer loop (the common case) and
2578  // rotated loops.
2579  MachineLoop *L = MLI->getLoopFor(ChainBB);
2580  if (!L)
2581  continue;
2582 
2583  unsigned Align = TLI->getPrefLoopAlignment(L);
2584  if (!Align)
2585  continue; // Don't care about loop alignment.
2586 
2587  // If the block is cold relative to the function entry don't waste space
2588  // aligning it.
2589  BlockFrequency Freq = MBFI->getBlockFreq(ChainBB);
2590  if (Freq < WeightedEntryFreq)
2591  continue;
2592 
2593  // If the block is cold relative to its loop header, don't align it
2594  // regardless of what edges into the block exist.
2595  MachineBasicBlock *LoopHeader = L->getHeader();
2596  BlockFrequency LoopHeaderFreq = MBFI->getBlockFreq(LoopHeader);
2597  if (Freq < (LoopHeaderFreq * ColdProb))
2598  continue;
2599 
2600  // Check for the existence of a non-layout predecessor which would benefit
2601  // from aligning this block.
2602  MachineBasicBlock *LayoutPred =
2603  &*std::prev(MachineFunction::iterator(ChainBB));
2604 
2605  // Force alignment if all the predecessors are jumps. We already checked
2606  // that the block isn't cold above.
2607  if (!LayoutPred->isSuccessor(ChainBB)) {
2608  ChainBB->setAlignment(Align);
2609  continue;
2610  }
2611 
2612  // Align this block if the layout predecessor's edge into this block is
2613  // cold relative to the block. When this is true, other predecessors make up
2614  // all of the hot entries into the block and thus alignment is likely to be
2615  // important.
2616  BranchProbability LayoutProb =
2617  MBPI->getEdgeProbability(LayoutPred, ChainBB);
2618  BlockFrequency LayoutEdgeFreq = MBFI->getBlockFreq(LayoutPred) * LayoutProb;
2619  if (LayoutEdgeFreq <= (Freq * ColdProb))
2620  ChainBB->setAlignment(Align);
2621  }
2622 }
2623 
2624 /// Tail duplicate \p BB into (some) predecessors if profitable, repeating if
2625 /// it was duplicated into its chain predecessor and removed.
2626 /// \p BB - Basic block that may be duplicated.
2627 ///
2628 /// \p LPred - Chosen layout predecessor of \p BB.
2629 /// Updated to be the chain end if LPred is removed.
2630 /// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
2631 /// \p BlockFilter - Set of blocks that belong to the loop being laid out.
2632 /// Used to identify which blocks to update predecessor
2633 /// counts.
2634 /// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
2635 /// chosen in the given order due to unnatural CFG
2636 /// only needed if \p BB is removed and
2637 /// \p PrevUnplacedBlockIt pointed to \p BB.
2638 /// @return true if \p BB was removed.
2639 bool MachineBlockPlacement::repeatedlyTailDuplicateBlock(
2640  MachineBasicBlock *BB, MachineBasicBlock *&LPred,
2641  const MachineBasicBlock *LoopHeaderBB,
2642  BlockChain &Chain, BlockFilterSet *BlockFilter,
2643  MachineFunction::iterator &PrevUnplacedBlockIt) {
2644  bool Removed, DuplicatedToLPred;
2645  bool DuplicatedToOriginalLPred;
2646  Removed = maybeTailDuplicateBlock(BB, LPred, Chain, BlockFilter,
2647  PrevUnplacedBlockIt,
2648  DuplicatedToLPred);
2649  if (!Removed)
2650  return false;
2651  DuplicatedToOriginalLPred = DuplicatedToLPred;
2652  // Iteratively try to duplicate again. It can happen that a block that is
2653  // duplicated into is still small enough to be duplicated again.
2654  // No need to call markBlockSuccessors in this case, as the blocks being
2655  // duplicated from here on are already scheduled.
2656  // Note that DuplicatedToLPred always implies Removed.
2657  while (DuplicatedToLPred) {
2658  assert(Removed && "Block must have been removed to be duplicated into its "
2659  "layout predecessor.");
2660  MachineBasicBlock *DupBB, *DupPred;
2661  // The removal callback causes Chain.end() to be updated when a block is
2662  // removed. On the first pass through the loop, the chain end should be the
2663  // same as it was on function entry. On subsequent passes, because we are
2664  // duplicating the block at the end of the chain, if it is removed the
2665  // chain will have shrunk by one block.
2666  BlockChain::iterator ChainEnd = Chain.end();
2667  DupBB = *(--ChainEnd);
2668  // Now try to duplicate again.
2669  if (ChainEnd == Chain.begin())
2670  break;
2671  DupPred = *std::prev(ChainEnd);
2672  Removed = maybeTailDuplicateBlock(DupBB, DupPred, Chain, BlockFilter,
2673  PrevUnplacedBlockIt,
2674  DuplicatedToLPred);
2675  }
2676  // If BB was duplicated into LPred, it is now scheduled. But because it was
2677  // removed, markChainSuccessors won't be called for its chain. Instead we
2678  // call markBlockSuccessors for LPred to achieve the same effect. This must go
2679  // at the end because repeating the tail duplication can increase the number
2680  // of unscheduled predecessors.
2681  LPred = *std::prev(Chain.end());
2682  if (DuplicatedToOriginalLPred)
2683  markBlockSuccessors(Chain, LPred, LoopHeaderBB, BlockFilter);
2684  return true;
2685 }
2686 
2687 /// Tail duplicate \p BB into (some) predecessors if profitable.
2688 /// \p BB - Basic block that may be duplicated
2689 /// \p LPred - Chosen layout predecessor of \p BB
2690 /// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
2691 /// \p BlockFilter - Set of blocks that belong to the loop being laid out.
2692 /// Used to identify which blocks to update predecessor
2693 /// counts.
2694 /// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
2695 /// chosen in the given order due to unnatural CFG
2696 /// only needed if \p BB is removed and
2697 /// \p PrevUnplacedBlockIt pointed to \p BB.
2698 /// \p DuplicatedToLPred - True if the block was duplicated into LPred. Will
2699 /// only be true if the block was removed.
2700 /// \return - True if the block was duplicated into all preds and removed.
2701 bool MachineBlockPlacement::maybeTailDuplicateBlock(
2703  BlockChain &Chain, BlockFilterSet *BlockFilter,
2704  MachineFunction::iterator &PrevUnplacedBlockIt,
2705  bool &DuplicatedToLPred) {
2706  DuplicatedToLPred = false;
2707  if (!shouldTailDuplicate(BB))
2708  return false;
2709 
2710  LLVM_DEBUG(dbgs() << "Redoing tail duplication for Succ#" << BB->getNumber()
2711  << "\n");
2712 
2713  // This has to be a callback because none of it can be done after
2714  // BB is deleted.
2715  bool Removed = false;
2716  auto RemovalCallback =
2717  [&](MachineBasicBlock *RemBB) {
2718  // Signal to outer function
2719  Removed = true;
2720 
2721  // Conservative default.
2722  bool InWorkList = true;
2723  // Remove from the Chain and Chain Map
2724  if (BlockToChain.count(RemBB)) {
2725  BlockChain *Chain = BlockToChain[RemBB];
2726  InWorkList = Chain->UnscheduledPredecessors == 0;
2727  Chain->remove(RemBB);
2728  BlockToChain.erase(RemBB);
2729  }
2730 
2731  // Handle the unplaced block iterator
2732  if (&(*PrevUnplacedBlockIt) == RemBB) {
2733  PrevUnplacedBlockIt++;
2734  }
2735 
2736  // Handle the Work Lists
2737  if (InWorkList) {
2738  SmallVectorImpl<MachineBasicBlock *> &RemoveList = BlockWorkList;
2739  if (RemBB->isEHPad())
2740  RemoveList = EHPadWorkList;
2741  RemoveList.erase(
2742  llvm::remove_if(RemoveList,
2743  [RemBB](MachineBasicBlock *BB) {
2744  return BB == RemBB;
2745  }),
2746  RemoveList.end());
2747  }
2748 
2749  // Handle the filter set
2750  if (BlockFilter) {
2751  BlockFilter->remove(RemBB);
2752  }
2753 
2754  // Remove the block from loop info.
2755  MLI->removeBlock(RemBB);
2756  if (RemBB == PreferredLoopExit)
2757  PreferredLoopExit = nullptr;
2758 
2759  LLVM_DEBUG(dbgs() << "TailDuplicator deleted block: "
2760  << getBlockName(RemBB) << "\n");
2761  };
2762  auto RemovalCallbackRef =
2763  function_ref<void(MachineBasicBlock*)>(RemovalCallback);
2764 
2765  SmallVector<MachineBasicBlock *, 8> DuplicatedPreds;
2766  bool IsSimple = TailDup.isSimpleBB(BB);
2767  TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred,
2768  &DuplicatedPreds, &RemovalCallbackRef);
2769 
2770  // Update UnscheduledPredecessors to reflect tail-duplication.
2771  DuplicatedToLPred = false;
2772  for (MachineBasicBlock *Pred : DuplicatedPreds) {
2773  // We're only looking for unscheduled predecessors that match the filter.
2774  BlockChain* PredChain = BlockToChain[Pred];
2775  if (Pred == LPred)
2776  DuplicatedToLPred = true;
2777  if (Pred == LPred || (BlockFilter && !BlockFilter->count(Pred))
2778  || PredChain == &Chain)
2779  continue;
2780  for (MachineBasicBlock *NewSucc : Pred->successors()) {
2781  if (BlockFilter && !BlockFilter->count(NewSucc))
2782  continue;
2783  BlockChain *NewChain = BlockToChain[NewSucc];
2784  if (NewChain != &Chain && NewChain != PredChain)
2785  NewChain->UnscheduledPredecessors++;
2786  }
2787  }
2788  return Removed;
2789 }
2790 
2791 bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
2792  if (skipFunction(MF.getFunction()))
2793  return false;
2794 
2795  // Check for single-block functions and skip them.
2796  if (std::next(MF.begin()) == MF.end())
2797  return false;
2798 
2799  F = &MF;
2800  MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
2801  MBFI = llvm::make_unique<BranchFolder::MBFIWrapper>(
2802  getAnalysis<MachineBlockFrequencyInfo>());
2803  MLI = &getAnalysis<MachineLoopInfo>();
2804  TII = MF.getSubtarget().getInstrInfo();
2805  TLI = MF.getSubtarget().getTargetLowering();
2806  MPDT = nullptr;
2807 
2808  // Initialize PreferredLoopExit to nullptr here since it may never be set if
2809  // there are no MachineLoops.
2810  PreferredLoopExit = nullptr;
2811 
2812  assert(BlockToChain.empty() &&
2813  "BlockToChain map should be empty before starting placement.");
2814  assert(ComputedEdges.empty() &&
2815  "Computed Edge map should be empty before starting placement.");
2816 
2817  unsigned TailDupSize = TailDupPlacementThreshold;
2818  // If only the aggressive threshold is explicitly set, use it.
2819  if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 &&
2820  TailDupPlacementThreshold.getNumOccurrences() == 0)
2822 
2823  TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
2824  // For aggressive optimization, we can adjust some thresholds to be less
2825  // conservative.
2826  if (PassConfig->getOptLevel() >= CodeGenOpt::Aggressive) {
2827  // At O3 we should be more willing to copy blocks for tail duplication. This
2828  // increases size pressure, so we only do it at O3
2829  // Do this unless only the regular threshold is explicitly set.
2830  if (TailDupPlacementThreshold.getNumOccurrences() == 0 ||
2831  TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0)
2833  }
2834 
2835  if (allowTailDupPlacement()) {
2836  MPDT = &getAnalysis<MachinePostDominatorTree>();
2837  if (MF.getFunction().hasOptSize())
2838  TailDupSize = 1;
2839  bool PreRegAlloc = false;
2840  TailDup.initMF(MF, PreRegAlloc, MBPI, /* LayoutMode */ true, TailDupSize);
2841  precomputeTriangleChains();
2842  }
2843 
2844  buildCFGChains();
2845 
2846  // Changing the layout can create new tail merging opportunities.
2847  // TailMerge can create jump into if branches that make CFG irreducible for
2848  // HW that requires structured CFG.
2849  bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
2850  PassConfig->getEnableTailMerge() &&
2852  // No tail merging opportunities if the block number is less than four.
2853  if (MF.size() > 3 && EnableTailMerge) {
2854  unsigned TailMergeSize = TailDupSize + 1;
2855  BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI,
2856  *MBPI, TailMergeSize);
2857 
2858  if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(),
2859  getAnalysisIfAvailable<MachineModuleInfo>(), MLI,
2860  /*AfterBlockPlacement=*/true)) {
2861  // Redo the layout if tail merging creates/removes/moves blocks.
2862  BlockToChain.clear();
2863  ComputedEdges.clear();
2864  // Must redo the post-dominator tree if blocks were changed.
2865  if (MPDT)
2866  MPDT->runOnMachineFunction(MF);
2867  ChainAllocator.DestroyAll();
2868  buildCFGChains();
2869  }
2870  }
2871 
2872  optimizeBranches();
2873  alignBlocks();
2874 
2875  BlockToChain.clear();
2876  ComputedEdges.clear();
2877  ChainAllocator.DestroyAll();
2878 
2879  if (AlignAllBlock)
2880  // Align all of the blocks in the function to a specific alignment.
2881  for (MachineBasicBlock &MBB : MF)
2883  else if (AlignAllNonFallThruBlocks) {
2884  // Align all of the blocks that have no fall-through predecessors to a
2885  // specific alignment.
2886  for (auto MBI = std::next(MF.begin()), MBE = MF.end(); MBI != MBE; ++MBI) {
2887  auto LayoutPred = std::prev(MBI);
2888  if (!LayoutPred->isSuccessor(&*MBI))
2889  MBI->setAlignment(AlignAllNonFallThruBlocks);
2890  }
2891  }
2892  if (ViewBlockLayoutWithBFI != GVDT_None &&
2893  (ViewBlockFreqFuncName.empty() ||
2894  F->getFunction().getName().equals(ViewBlockFreqFuncName))) {
2895  MBFI->view("MBP." + MF.getName(), false);
2896  }
2897 
2898 
2899  // We always return true as we have no way to track whether the final order
2900  // differs from the original order.
2901  return true;
2902 }
2903 
2904 namespace {
2905 
2906 /// A pass to compute block placement statistics.
2907 ///
2908 /// A separate pass to compute interesting statistics for evaluating block
2909 /// placement. This is separate from the actual placement pass so that they can
2910 /// be computed in the absence of any placement transformations or when using
2911 /// alternative placement strategies.
2912 class MachineBlockPlacementStats : public MachineFunctionPass {
2913  /// A handle to the branch probability pass.
2914  const MachineBranchProbabilityInfo *MBPI;
2915 
2916  /// A handle to the function-wide block frequency pass.
2917  const MachineBlockFrequencyInfo *MBFI;
2918 
2919 public:
2920  static char ID; // Pass identification, replacement for typeid
2921 
2922  MachineBlockPlacementStats() : MachineFunctionPass(ID) {
2924  }
2925 
2926  bool runOnMachineFunction(MachineFunction &F) override;
2927 
2928  void getAnalysisUsage(AnalysisUsage &AU) const override {
2931  AU.setPreservesAll();
2933  }
2934 };
2935 
2936 } // end anonymous namespace
2937 
2939 
2941 
2942 INITIALIZE_PASS_BEGIN(MachineBlockPlacementStats, "block-placement-stats",
2943  "Basic Block Placement Stats", false, false)
2946 INITIALIZE_PASS_END(MachineBlockPlacementStats, "block-placement-stats",
2947  "Basic Block Placement Stats", false, false)
2948 
2949 bool MachineBlockPlacementStats::runOnMachineFunction(MachineFunction &F) {
2950  // Check for single-block functions and skip them.
2951  if (std::next(F.begin()) == F.end())
2952  return false;
2953 
2954  MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
2955  MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
2956 
2957  for (MachineBasicBlock &MBB : F) {
2958  BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
2959  Statistic &NumBranches =
2960  (MBB.succ_size() > 1) ? NumCondBranches : NumUncondBranches;
2961  Statistic &BranchTakenFreq =
2962  (MBB.succ_size() > 1) ? CondBranchTakenFreq : UncondBranchTakenFreq;
2963  for (MachineBasicBlock *Succ : MBB.successors()) {
2964  // Skip if this successor is a fallthrough.
2965  if (MBB.isLayoutSuccessor(Succ))
2966  continue;
2967 
2968  BlockFrequency EdgeFreq =
2969  BlockFreq * MBPI->getEdgeProbability(&MBB, Succ);
2970  ++NumBranches;
2971  BranchTakenFreq += EdgeFreq.getFrequency();
2972  }
2973  }
2974 
2975  return false;
2976 }
bool shouldTailDuplicate(bool IsSimple, MachineBasicBlock &TailBB)
Determine if it is profitable to duplicate this block.
static cl::opt< unsigned > TailDupPlacementThreshold("tail-dup-placement-threshold", cl::desc("Instruction cutoff for tail duplication during layout. " "Tail merging during layout is forced to have a threshold " "that won't conflict."), cl::init(2), cl::Hidden)
const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:233
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:645
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2)
BranchProbability getCompl() const
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:224
typename SuperClass::const_iterator const_iterator
Definition: SmallVector.h:320
This class represents lattice values for constants.
Definition: AllocatorList.h:23
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds...
Definition: Compiler.h:473
static cl::opt< unsigned > MisfetchCost("misfetch-cost", cl::desc("Cost that models the probabilistic risk of an instruction " "misfetch due to a jump comparing to falling through, whose cost " "is zero."), cl::init(1), cl::Hidden)
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:603
static BranchProbability getAdjustedProbability(BranchProbability OrigProb, BranchProbability AdjustedSumProb)
The helper function returns the branch probability that is adjusted or normalized over the new total ...
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
unsigned size() const
bool requiresStructuredCFG() const
virtual const TargetLowering * getTargetLowering() const
void initializeMachineBlockPlacementStatsPass(PassRegistry &)
virtual unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const
Insert branch code into the end of the specified MachineBasicBlock.
An efficient, type-erasing, non-owning reference to a callable.
Definition: STLExtras.h:116
uint64_t getFrequency() const
Returns the frequency as a fixpoint number scaled by the entry frequency.
MachineBlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate machine basic b...
STATISTIC(NumFunctions, "Total number of functions")
A debug info location.
Definition: DebugLoc.h:33
F(f)
static BranchProbability getLayoutSuccessorProbThreshold(const MachineBasicBlock *BB)
static BranchProbability getOne()
virtual unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const
Remove the branching code at the end of the specific MBB.
char & MachineBlockPlacementStatsID
MachineBlockPlacementStats - This pass collects statistics about the basic block placement using bran...
void setAlignment(unsigned Align)
Set alignment of the basic block.
CodeGenOpt::Level getOptLevel() const
This file defines the MallocAllocator and BumpPtrAllocator interfaces.
iterator_range< succ_iterator > successors()
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:343
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:221
AnalysisUsage & addRequired()
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:50
void append(SmallVectorImpl< char > &path, const Twine &a, const Twine &b="", const Twine &c="", const Twine &d="")
Append to path.
Definition: Path.cpp:455
static cl::opt< bool > TailDupPlacement("tail-dup-placement", cl::desc("Perform tail duplication during placement. " "Creates more fallthrough opportunites in " "outline branches."), cl::init(true), cl::Hidden)
bool OptimizeFunction(MachineFunction &MF, const TargetInstrInfo *tii, const TargetRegisterInfo *tri, MachineModuleInfo *mmi, MachineLoopInfo *mli=nullptr, bool AfterPlacement=false)
Perhaps branch folding, tail merging and other CFG optimizations on the given function.
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
const HexagonInstrInfo * TII
Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: APFloat.h:41
void removeBlock(MachineBasicBlock *BB)
This method completely removes BB from all data structures, including all of the Loop objects it is n...
block placement Basic Block Placement Stats
Target-Independent Code Generator Pass Configuration Options.
BlockT * getHeader() const
Definition: LoopInfo.h:100
auto reverse(ContainerTy &&C, typename std::enable_if< has_rbegin< ContainerTy >::value >::type *=nullptr) -> decltype(make_range(C.rbegin(), C.rend()))
Definition: STLExtras.h:266
bool canFallThrough()
Return true if the block can implicitly transfer control to the block after it by falling off the end...
void DestroyAll()
Call the destructor of each allocated object and deallocate all but the current slab and reset the cu...
Definition: Allocator.h:462
void initMF(MachineFunction &MF, bool PreRegAlloc, const MachineBranchProbabilityInfo *MBPI, bool LayoutMode, unsigned TailDupSize=0)
Prepare to run on a specific machine function.
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they&#39;re not in a MachineFuncti...
virtual const TargetInstrInfo * getInstrInfo() const
cl::opt< std::string > ViewBlockFreqFuncName
auto count(R &&Range, const E &Element) -> typename std::iterator_traits< decltype(adl_begin(Range))>::difference_type
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1251
TargetInstrInfo - Interface to description of machine instruction set.
static bool isSimpleBB(MachineBasicBlock *TailBB)
True if this BB has only one unconditional jump.
cl::opt< unsigned > ProfileLikelyProb
bool getEnableTailMerge() const
virtual bool alignLoopsWithOptSize() const
Should loops be aligned even when the function is marked OptSize (but not MinSize).
#define P(N)
bool tailDuplicateAndUpdate(bool IsSimple, MachineBasicBlock *MBB, MachineBasicBlock *ForcedLayoutPred, SmallVectorImpl< MachineBasicBlock *> *DuplicatedPreds=nullptr, function_ref< void(MachineBasicBlock *)> *RemovalCallback=nullptr)
Tail duplicate a single basic block into its predecessors, and then clean up.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:432
cl::opt< GVDAGType > ViewBlockLayoutWithBFI
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
StringRef getName() const
Return the name of the corresponding LLVM basic block, or an empty string.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:290
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
succ_reverse_iterator succ_rbegin()
static cl::opt< bool > BranchFoldPlacement("branch-fold-placement", cl::desc("Perform branch folding during placement. " "Reduces code size."), cl::init(true), cl::Hidden)
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
LLVM_NODISCARD bool empty() const
Definition: SmallPtrSet.h:91
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:370
virtual unsigned getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
bool canTailDuplicate(MachineBasicBlock *TailBB, MachineBasicBlock *PredBB)
Returns true if TailBB can successfully be duplicated into PredBB.
static cl::opt< unsigned > TriangleChainCount("triangle-chain-count", cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the " "triangle tail duplication heuristic to kick in. 0 to disable."), cl::init(2), cl::Hidden)
bool dominates(const MachineDomTreeNode *A, const MachineDomTreeNode *B) const
Represent the analysis usage information of a pass.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:381
static bool greaterWithBias(BlockFrequency A, BlockFrequency B, uint64_t EntryFreq)
Compare 2 BlockFrequency&#39;s with a small penalty for A.
iterator_range< pred_iterator > predecessors()
bool hasProfileData(bool IncludeSynthetic=false) const
Return true if the function is annotated with profile data.
Definition: Function.h:308
T * Allocate(size_t num=1)
Allocate space for an array of objects without constructing them.
Definition: Allocator.h:490
auto remove_if(R &&Range, UnaryPredicate P) -> decltype(adl_begin(Range))
Provide wrappers to std::remove_if which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1225
virtual bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const
Analyze the branching code at the end of MBB, returning true if it cannot be understood (e...
iterator erase(const_iterator CI)
Definition: SmallVector.h:438
const MachineBasicBlock & front() const
size_t size() const
Definition: SmallVector.h:52
void initializeMachineBlockPlacementPass(PassRegistry &)
static cl::opt< bool > ForcePreciseRotationCost("force-precise-rotation-cost", cl::desc("Force the use of precise cost " "loop rotation strategy."), cl::init(false), cl::Hidden)
auto find(R &&Range, const T &Val) -> decltype(adl_begin(Range))
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1206
static cl::opt< unsigned > TailMergeSize("tail-merge-size", cl::desc("Min number of instructions to consider tail merging"), cl::init(3), cl::Hidden)
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
Branch Probability Basic Block Placement
This base class for TargetLowering contains the SelectionDAG-independent parts that can be used from ...
size_type size() const
Definition: SmallPtrSet.h:92
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
Definition: LoopInfo.h:110
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:297
Iterator for intrusive lists based on ilist_node.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements...
Definition: SmallPtrSet.h:417
static cl::opt< bool > ForceLoopColdBlock("force-loop-cold-block", cl::desc("Force outlining cold blocks from loops."), cl::init(false), cl::Hidden)
PostDominatorTree Class - Concrete subclass of DominatorTree that is used to compute the post-dominat...
loop rotate
This is a &#39;vector&#39; (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:841
void updateTerminator()
Update the terminator instructions in block to account for changes to the layout. ...
static cl::opt< unsigned > JumpInstCost("jump-inst-cost", cl::desc("Cost of jump instructions."), cl::init(1), cl::Hidden)
static cl::opt< unsigned > TailDupPlacementAggressiveThreshold("tail-dup-placement-aggressive-threshold", cl::desc("Instruction cutoff for aggressive tail duplication during " "layout. Used at -O3. Tail merging during layout is forced to " "have a threshold that won't conflict."), cl::init(4), cl::Hidden)
INITIALIZE_PASS_BEGIN(MachineBlockPlacement, DEBUG_TYPE, "Branch Probability Basic Block Placement", false, false) INITIALIZE_PASS_END(MachineBlockPlacement
A BumpPtrAllocator that allows only elements of a specific type to be allocated.
Definition: Allocator.h:441
unsigned pred_size() const
bool isLayoutSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB will be emitted immediately after this block, such that if this bloc...
const Function & getFunction() const
Return the LLVM function that this machine code represents.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:940
static cl::opt< unsigned > TailDupPlacementPenalty("tail-dup-placement-penalty", cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. " "Copying can increase fallthrough, but it also increases icache " "pressure. This parameter controls the penalty to account for that. " "Percent as integer."), cl::init(2), cl::Hidden)
void setPreservesAll()
Set by analyses that do not transform their input at all.
typename SuperClass::iterator iterator
Definition: SmallVector.h:319
unsigned succ_size() const
Branch Probability Basic Block static false std::string getBlockName(const MachineBasicBlock *BB)
Helper to print the name of a MBB.
cl::opt< unsigned > StaticLikelyProb
LLVM_NODISCARD bool equals(StringRef RHS) const
equals - Check for string equality, this is more efficient than compare() when the relative ordering ...
Definition: StringRef.h:160
BranchProbability getEdgeProbability(const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const
unsigned getNumBlocks() const
Get the number of blocks in this loop in constant time.
Definition: LoopInfo.h:163
unsigned succ_size(const Instruction *I)
Definition: CFG.h:256
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
static cl::opt< unsigned > AlignAllBlock("align-all-blocks", cl::desc("Force the alignment of all " "blocks in the function."), cl::init(0), cl::Hidden)
static uint64_t getMaxFrequency()
Returns the maximum possible frequency, the saturation value.
bool isEHPad() const
Returns true if the block is a landing pad.
LLVM_NODISCARD bool empty() const
Definition: SmallVector.h:55
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:149
StringRef getName() const
Return a constant reference to the value&#39;s name.
Definition: Value.cpp:214
static bool hasSameSuccessors(MachineBasicBlock &BB, SmallPtrSetImpl< const MachineBasicBlock *> &Successors)
Check if BB has exactly the successors in Successors.
#define I(x, y, z)
Definition: MD5.cpp:58
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:600
char & MachineBlockPlacementID
MachineBlockPlacement - This pass places basic blocks based on branch probabilities.
block_iterator block_end() const
Definition: LoopInfo.h:155
bool isSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB is a successor of this block.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:171
static cl::opt< unsigned > LoopToColdBlockRatio("loop-to-cold-block-ratio", cl::desc("Outline loop blocks from loop chain if (frequency of loop) / " "(frequency of block) is greater than this ratio"), cl::init(5), cl::Hidden)
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:211
virtual bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const
Reverses the branch condition of the specified condition list, returning false on success and true if...
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
void stable_sort(R &&Range)
Definition: STLExtras.h:1309
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:482
#define DEBUG_TYPE
MachineLoop * getLoopFor(const MachineBasicBlock *BB) const
Return the innermost loop that BB lives in.
static BranchProbability getZero()
Utility class to perform tail duplication.
static cl::opt< bool > PreciseRotationCost("precise-rotation-cost", cl::desc("Model the cost of loop rotation more " "precisely by using profile data."), cl::init(false), cl::Hidden)
#define LLVM_DEBUG(X)
Definition: Debug.h:122
for(unsigned i=Desc.getNumOperands(), e=OldMI.getNumOperands();i !=e;++i)
block_iterator block_begin() const
Definition: LoopInfo.h:154
static cl::opt< unsigned > AlignAllNonFallThruBlocks("align-all-nofallthru-blocks", cl::desc("Force the alignment of all " "blocks that have no fall-through predecessors (i.e. don't add " "nops that are executed)."), cl::init(0), cl::Hidden)
uint32_t getNumerator() const
static cl::opt< unsigned > ExitBlockBias("block-placement-exit-block-bias", cl::desc("Block frequency percentage a loop exit block needs " "over the original exit to be considered the new exit."), cl::init(0), cl::Hidden)
This file describes how to lower LLVM code to machine code.