LLVM  9.0.0svn
MachineBlockPlacement.cpp
Go to the documentation of this file.
1 //===- MachineBlockPlacement.cpp - Basic Block Code Layout optimization ---===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements basic block placement transformations using the CFG
10 // structure and branch probability estimates.
11 //
12 // The pass strives to preserve the structure of the CFG (that is, retain
13 // a topological ordering of basic blocks) in the absence of a *strong* signal
14 // to the contrary from probabilities. However, within the CFG structure, it
15 // attempts to choose an ordering which favors placing more likely sequences of
16 // blocks adjacent to each other.
17 //
18 // The algorithm works from the inner-most loop within a function outward, and
19 // at each stage walks through the basic blocks, trying to coalesce them into
20 // sequential chains where allowed by the CFG (or demanded by heavy
21 // probabilities). Finally, it walks the blocks in topological order, and the
22 // first time it reaches a chain of basic blocks, it schedules them in the
23 // function in-order.
24 //
25 //===----------------------------------------------------------------------===//
26 
27 #include "BranchFolding.h"
28 #include "llvm/ADT/ArrayRef.h"
29 #include "llvm/ADT/DenseMap.h"
30 #include "llvm/ADT/STLExtras.h"
31 #include "llvm/ADT/SetVector.h"
32 #include "llvm/ADT/SmallPtrSet.h"
33 #include "llvm/ADT/SmallVector.h"
34 #include "llvm/ADT/Statistic.h"
49 #include "llvm/IR/DebugLoc.h"
50 #include "llvm/IR/Function.h"
51 #include "llvm/Pass.h"
52 #include "llvm/Support/Allocator.h"
55 #include "llvm/Support/CodeGen.h"
57 #include "llvm/Support/Compiler.h"
58 #include "llvm/Support/Debug.h"
61 #include <algorithm>
62 #include <cassert>
63 #include <cstdint>
64 #include <iterator>
65 #include <memory>
66 #include <string>
67 #include <tuple>
68 #include <utility>
69 #include <vector>
70 
71 using namespace llvm;
72 
73 #define DEBUG_TYPE "block-placement"
74 
75 STATISTIC(NumCondBranches, "Number of conditional branches");
76 STATISTIC(NumUncondBranches, "Number of unconditional branches");
77 STATISTIC(CondBranchTakenFreq,
78  "Potential frequency of taking conditional branches");
79 STATISTIC(UncondBranchTakenFreq,
80  "Potential frequency of taking unconditional branches");
81 
82 static cl::opt<unsigned> AlignAllBlock("align-all-blocks",
83  cl::desc("Force the alignment of all "
84  "blocks in the function."),
85  cl::init(0), cl::Hidden);
86 
88  "align-all-nofallthru-blocks",
89  cl::desc("Force the alignment of all "
90  "blocks that have no fall-through predecessors (i.e. don't add "
91  "nops that are executed)."),
92  cl::init(0), cl::Hidden);
93 
94 // FIXME: Find a good default for this flag and remove the flag.
96  "block-placement-exit-block-bias",
97  cl::desc("Block frequency percentage a loop exit block needs "
98  "over the original exit to be considered the new exit."),
99  cl::init(0), cl::Hidden);
100 
101 // Definition:
102 // - Outlining: placement of a basic block outside the chain or hot path.
103 
105  "loop-to-cold-block-ratio",
106  cl::desc("Outline loop blocks from loop chain if (frequency of loop) / "
107  "(frequency of block) is greater than this ratio"),
108  cl::init(5), cl::Hidden);
109 
111  "force-loop-cold-block",
112  cl::desc("Force outlining cold blocks from loops."),
113  cl::init(false), cl::Hidden);
114 
115 static cl::opt<bool>
116  PreciseRotationCost("precise-rotation-cost",
117  cl::desc("Model the cost of loop rotation more "
118  "precisely by using profile data."),
119  cl::init(false), cl::Hidden);
120 
121 static cl::opt<bool>
122  ForcePreciseRotationCost("force-precise-rotation-cost",
123  cl::desc("Force the use of precise cost "
124  "loop rotation strategy."),
125  cl::init(false), cl::Hidden);
126 
128  "misfetch-cost",
129  cl::desc("Cost that models the probabilistic risk of an instruction "
130  "misfetch due to a jump comparing to falling through, whose cost "
131  "is zero."),
132  cl::init(1), cl::Hidden);
133 
134 static cl::opt<unsigned> JumpInstCost("jump-inst-cost",
135  cl::desc("Cost of jump instructions."),
136  cl::init(1), cl::Hidden);
137 static cl::opt<bool>
138 TailDupPlacement("tail-dup-placement",
139  cl::desc("Perform tail duplication during placement. "
140  "Creates more fallthrough opportunites in "
141  "outline branches."),
142  cl::init(true), cl::Hidden);
143 
144 static cl::opt<bool>
145 BranchFoldPlacement("branch-fold-placement",
146  cl::desc("Perform branch folding during placement. "
147  "Reduces code size."),
148  cl::init(true), cl::Hidden);
149 
150 // Heuristic for tail duplication.
152  "tail-dup-placement-threshold",
153  cl::desc("Instruction cutoff for tail duplication during layout. "
154  "Tail merging during layout is forced to have a threshold "
155  "that won't conflict."), cl::init(2),
156  cl::Hidden);
157 
158 // Heuristic for aggressive tail duplication.
160  "tail-dup-placement-aggressive-threshold",
161  cl::desc("Instruction cutoff for aggressive tail duplication during "
162  "layout. Used at -O3. Tail merging during layout is forced to "
163  "have a threshold that won't conflict."), cl::init(4),
164  cl::Hidden);
165 
166 // Heuristic for tail duplication.
168  "tail-dup-placement-penalty",
169  cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. "
170  "Copying can increase fallthrough, but it also increases icache "
171  "pressure. This parameter controls the penalty to account for that. "
172  "Percent as integer."),
173  cl::init(2),
174  cl::Hidden);
175 
176 // Heuristic for triangle chains.
178  "triangle-chain-count",
179  cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the "
180  "triangle tail duplication heuristic to kick in. 0 to disable."),
181  cl::init(2),
182  cl::Hidden);
183 
186 
187 // Internal option used to control BFI display only after MBP pass.
188 // Defined in CodeGen/MachineBlockFrequencyInfo.cpp:
189 // -view-block-layout-with-bfi=
191 
192 // Command line option to specify the name of the function for CFG dump
193 // Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name=
195 
196 namespace {
197 
198 class BlockChain;
199 
200 /// Type for our function-wide basic block -> block chain mapping.
201 using BlockToChainMapType = DenseMap<const MachineBasicBlock *, BlockChain *>;
202 
203 /// A chain of blocks which will be laid out contiguously.
204 ///
205 /// This is the datastructure representing a chain of consecutive blocks that
206 /// are profitable to layout together in order to maximize fallthrough
207 /// probabilities and code locality. We also can use a block chain to represent
208 /// a sequence of basic blocks which have some external (correctness)
209 /// requirement for sequential layout.
210 ///
211 /// Chains can be built around a single basic block and can be merged to grow
212 /// them. They participate in a block-to-chain mapping, which is updated
213 /// automatically as chains are merged together.
214 class BlockChain {
215  /// The sequence of blocks belonging to this chain.
216  ///
217  /// This is the sequence of blocks for a particular chain. These will be laid
218  /// out in-order within the function.
220 
221  /// A handle to the function-wide basic block to block chain mapping.
222  ///
223  /// This is retained in each block chain to simplify the computation of child
224  /// block chains for SCC-formation and iteration. We store the edges to child
225  /// basic blocks, and map them back to their associated chains using this
226  /// structure.
227  BlockToChainMapType &BlockToChain;
228 
229 public:
230  /// Construct a new BlockChain.
231  ///
232  /// This builds a new block chain representing a single basic block in the
233  /// function. It also registers itself as the chain that block participates
234  /// in with the BlockToChain mapping.
235  BlockChain(BlockToChainMapType &BlockToChain, MachineBasicBlock *BB)
236  : Blocks(1, BB), BlockToChain(BlockToChain) {
237  assert(BB && "Cannot create a chain with a null basic block");
238  BlockToChain[BB] = this;
239  }
240 
241  /// Iterator over blocks within the chain.
244 
245  /// Beginning of blocks within the chain.
246  iterator begin() { return Blocks.begin(); }
247  const_iterator begin() const { return Blocks.begin(); }
248 
249  /// End of blocks within the chain.
250  iterator end() { return Blocks.end(); }
251  const_iterator end() const { return Blocks.end(); }
252 
253  bool remove(MachineBasicBlock* BB) {
254  for(iterator i = begin(); i != end(); ++i) {
255  if (*i == BB) {
256  Blocks.erase(i);
257  return true;
258  }
259  }
260  return false;
261  }
262 
263  /// Merge a block chain into this one.
264  ///
265  /// This routine merges a block chain into this one. It takes care of forming
266  /// a contiguous sequence of basic blocks, updating the edge list, and
267  /// updating the block -> chain mapping. It does not free or tear down the
268  /// old chain, but the old chain's block list is no longer valid.
269  void merge(MachineBasicBlock *BB, BlockChain *Chain) {
270  assert(BB && "Can't merge a null block.");
271  assert(!Blocks.empty() && "Can't merge into an empty chain.");
272 
273  // Fast path in case we don't have a chain already.
274  if (!Chain) {
275  assert(!BlockToChain[BB] &&
276  "Passed chain is null, but BB has entry in BlockToChain.");
277  Blocks.push_back(BB);
278  BlockToChain[BB] = this;
279  return;
280  }
281 
282  assert(BB == *Chain->begin() && "Passed BB is not head of Chain.");
283  assert(Chain->begin() != Chain->end());
284 
285  // Update the incoming blocks to point to this chain, and add them to the
286  // chain structure.
287  for (MachineBasicBlock *ChainBB : *Chain) {
288  Blocks.push_back(ChainBB);
289  assert(BlockToChain[ChainBB] == Chain && "Incoming blocks not in chain.");
290  BlockToChain[ChainBB] = this;
291  }
292  }
293 
294 #ifndef NDEBUG
295  /// Dump the blocks in this chain.
296  LLVM_DUMP_METHOD void dump() {
297  for (MachineBasicBlock *MBB : *this)
298  MBB->dump();
299  }
300 #endif // NDEBUG
301 
302  /// Count of predecessors of any block within the chain which have not
303  /// yet been scheduled. In general, we will delay scheduling this chain
304  /// until those predecessors are scheduled (or we find a sufficiently good
305  /// reason to override this heuristic.) Note that when forming loop chains,
306  /// blocks outside the loop are ignored and treated as if they were already
307  /// scheduled.
308  ///
309  /// Note: This field is reinitialized multiple times - once for each loop,
310  /// and then once for the function as a whole.
311  unsigned UnscheduledPredecessors = 0;
312 };
313 
314 class MachineBlockPlacement : public MachineFunctionPass {
315  /// A type for a block filter set.
316  using BlockFilterSet = SmallSetVector<const MachineBasicBlock *, 16>;
317 
318  /// Pair struct containing basic block and taildup profitability
319  struct BlockAndTailDupResult {
320  MachineBasicBlock *BB;
321  bool ShouldTailDup;
322  };
323 
324  /// Triple struct containing edge weight and the edge.
325  struct WeightedEdge {
326  BlockFrequency Weight;
327  MachineBasicBlock *Src;
328  MachineBasicBlock *Dest;
329  };
330 
331  /// work lists of blocks that are ready to be laid out
334 
335  /// Edges that have already been computed as optimal.
337 
338  /// Machine Function
340 
341  /// A handle to the branch probability pass.
342  const MachineBranchProbabilityInfo *MBPI;
343 
344  /// A handle to the function-wide block frequency pass.
345  std::unique_ptr<BranchFolder::MBFIWrapper> MBFI;
346 
347  /// A handle to the loop info.
348  MachineLoopInfo *MLI;
349 
350  /// Preferred loop exit.
351  /// Member variable for convenience. It may be removed by duplication deep
352  /// in the call stack.
353  MachineBasicBlock *PreferredLoopExit;
354 
355  /// A handle to the target's instruction info.
356  const TargetInstrInfo *TII;
357 
358  /// A handle to the target's lowering info.
359  const TargetLoweringBase *TLI;
360 
361  /// A handle to the post dominator tree.
363 
364  /// Duplicator used to duplicate tails during placement.
365  ///
366  /// Placement decisions can open up new tail duplication opportunities, but
367  /// since tail duplication affects placement decisions of later blocks, it
368  /// must be done inline.
369  TailDuplicator TailDup;
370 
371  /// Allocator and owner of BlockChain structures.
372  ///
373  /// We build BlockChains lazily while processing the loop structure of
374  /// a function. To reduce malloc traffic, we allocate them using this
375  /// slab-like allocator, and destroy them after the pass completes. An
376  /// important guarantee is that this allocator produces stable pointers to
377  /// the chains.
379 
380  /// Function wide BasicBlock to BlockChain mapping.
381  ///
382  /// This mapping allows efficiently moving from any given basic block to the
383  /// BlockChain it participates in, if any. We use it to, among other things,
384  /// allow implicitly defining edges between chains as the existing edges
385  /// between basic blocks.
387 
388 #ifndef NDEBUG
389  /// The set of basic blocks that have terminators that cannot be fully
390  /// analyzed. These basic blocks cannot be re-ordered safely by
391  /// MachineBlockPlacement, and we must preserve physical layout of these
392  /// blocks and their successors through the pass.
393  SmallPtrSet<MachineBasicBlock *, 4> BlocksWithUnanalyzableExits;
394 #endif
395 
396  /// Decrease the UnscheduledPredecessors count for all blocks in chain, and
397  /// if the count goes to 0, add them to the appropriate work list.
398  void markChainSuccessors(
399  const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
400  const BlockFilterSet *BlockFilter = nullptr);
401 
402  /// Decrease the UnscheduledPredecessors count for a single block, and
403  /// if the count goes to 0, add them to the appropriate work list.
404  void markBlockSuccessors(
405  const BlockChain &Chain, const MachineBasicBlock *BB,
406  const MachineBasicBlock *LoopHeaderBB,
407  const BlockFilterSet *BlockFilter = nullptr);
408 
410  collectViableSuccessors(
411  const MachineBasicBlock *BB, const BlockChain &Chain,
412  const BlockFilterSet *BlockFilter,
414  bool shouldPredBlockBeOutlined(
415  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
416  const BlockChain &Chain, const BlockFilterSet *BlockFilter,
417  BranchProbability SuccProb, BranchProbability HotProb);
418  bool repeatedlyTailDuplicateBlock(
420  const MachineBasicBlock *LoopHeaderBB,
421  BlockChain &Chain, BlockFilterSet *BlockFilter,
422  MachineFunction::iterator &PrevUnplacedBlockIt);
423  bool maybeTailDuplicateBlock(
425  BlockChain &Chain, BlockFilterSet *BlockFilter,
426  MachineFunction::iterator &PrevUnplacedBlockIt,
427  bool &DuplicatedToLPred);
428  bool hasBetterLayoutPredecessor(
429  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
430  const BlockChain &SuccChain, BranchProbability SuccProb,
431  BranchProbability RealSuccProb, const BlockChain &Chain,
432  const BlockFilterSet *BlockFilter);
433  BlockAndTailDupResult selectBestSuccessor(
434  const MachineBasicBlock *BB, const BlockChain &Chain,
435  const BlockFilterSet *BlockFilter);
436  MachineBasicBlock *selectBestCandidateBlock(
437  const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList);
438  MachineBasicBlock *getFirstUnplacedBlock(
439  const BlockChain &PlacedChain,
440  MachineFunction::iterator &PrevUnplacedBlockIt,
441  const BlockFilterSet *BlockFilter);
442 
443  /// Add a basic block to the work list if it is appropriate.
444  ///
445  /// If the optional parameter BlockFilter is provided, only MBB
446  /// present in the set will be added to the worklist. If nullptr
447  /// is provided, no filtering occurs.
448  void fillWorkLists(const MachineBasicBlock *MBB,
449  SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
450  const BlockFilterSet *BlockFilter);
451 
452  void buildChain(const MachineBasicBlock *BB, BlockChain &Chain,
453  BlockFilterSet *BlockFilter = nullptr);
454  bool canMoveBottomBlockToTop(const MachineBasicBlock *BottomBlock,
455  const MachineBasicBlock *OldTop);
456  bool hasViableTopFallthrough(const MachineBasicBlock *Top,
457  const BlockFilterSet &LoopBlockSet);
458  MachineBasicBlock *findBestLoopTop(
459  const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
460  MachineBasicBlock *findBestLoopExit(
461  const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
462  BlockFilterSet collectLoopBlockSet(const MachineLoop &L);
463  void buildLoopChains(const MachineLoop &L);
464  void rotateLoop(
465  BlockChain &LoopChain, const MachineBasicBlock *ExitingBB,
466  const BlockFilterSet &LoopBlockSet);
467  void rotateLoopWithProfile(
468  BlockChain &LoopChain, const MachineLoop &L,
469  const BlockFilterSet &LoopBlockSet);
470  void buildCFGChains();
471  void optimizeBranches();
472  void alignBlocks();
473  /// Returns true if a block should be tail-duplicated to increase fallthrough
474  /// opportunities.
475  bool shouldTailDuplicate(MachineBasicBlock *BB);
476  /// Check the edge frequencies to see if tail duplication will increase
477  /// fallthroughs.
478  bool isProfitableToTailDup(
479  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
480  BranchProbability QProb,
481  const BlockChain &Chain, const BlockFilterSet *BlockFilter);
482 
483  /// Check for a trellis layout.
484  bool isTrellis(const MachineBasicBlock *BB,
485  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
486  const BlockChain &Chain, const BlockFilterSet *BlockFilter);
487 
488  /// Get the best successor given a trellis layout.
489  BlockAndTailDupResult getBestTrellisSuccessor(
490  const MachineBasicBlock *BB,
491  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
492  BranchProbability AdjustedSumProb, const BlockChain &Chain,
493  const BlockFilterSet *BlockFilter);
494 
495  /// Get the best pair of non-conflicting edges.
496  static std::pair<WeightedEdge, WeightedEdge> getBestNonConflictingEdges(
497  const MachineBasicBlock *BB,
499 
500  /// Returns true if a block can tail duplicate into all unplaced
501  /// predecessors. Filters based on loop.
502  bool canTailDuplicateUnplacedPreds(
503  const MachineBasicBlock *BB, MachineBasicBlock *Succ,
504  const BlockChain &Chain, const BlockFilterSet *BlockFilter);
505 
506  /// Find chains of triangles to tail-duplicate where a global analysis works,
507  /// but a local analysis would not find them.
508  void precomputeTriangleChains();
509 
510 public:
511  static char ID; // Pass identification, replacement for typeid
512 
513  MachineBlockPlacement() : MachineFunctionPass(ID) {
515  }
516 
517  bool runOnMachineFunction(MachineFunction &F) override;
518 
519  bool allowTailDupPlacement() const {
520  assert(F);
522  }
523 
524  void getAnalysisUsage(AnalysisUsage &AU) const override {
527  if (TailDupPlacement)
532  }
533 };
534 
535 } // end anonymous namespace
536 
538 
540 
541 INITIALIZE_PASS_BEGIN(MachineBlockPlacement, DEBUG_TYPE,
542  "Branch Probability Basic Block Placement", false, false)
547 INITIALIZE_PASS_END(MachineBlockPlacement, DEBUG_TYPE,
548  "Branch Probability Basic Block Placement", false, false)
549 
550 #ifndef NDEBUG
551 /// Helper to print the name of a MBB.
552 ///
553 /// Only used by debug logging.
554 static std::string getBlockName(const MachineBasicBlock *BB) {
555  std::string Result;
556  raw_string_ostream OS(Result);
557  OS << printMBBReference(*BB);
558  OS << " ('" << BB->getName() << "')";
559  OS.flush();
560  return Result;
561 }
562 #endif
563 
564 /// Mark a chain's successors as having one fewer preds.
565 ///
566 /// When a chain is being merged into the "placed" chain, this routine will
567 /// quickly walk the successors of each block in the chain and mark them as
568 /// having one fewer active predecessor. It also adds any successors of this
569 /// chain which reach the zero-predecessor state to the appropriate worklist.
570 void MachineBlockPlacement::markChainSuccessors(
571  const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
572  const BlockFilterSet *BlockFilter) {
573  // Walk all the blocks in this chain, marking their successors as having
574  // a predecessor placed.
575  for (MachineBasicBlock *MBB : Chain) {
576  markBlockSuccessors(Chain, MBB, LoopHeaderBB, BlockFilter);
577  }
578 }
579 
580 /// Mark a single block's successors as having one fewer preds.
581 ///
582 /// Under normal circumstances, this is only called by markChainSuccessors,
583 /// but if a block that was to be placed is completely tail-duplicated away,
584 /// and was duplicated into the chain end, we need to redo markBlockSuccessors
585 /// for just that block.
586 void MachineBlockPlacement::markBlockSuccessors(
587  const BlockChain &Chain, const MachineBasicBlock *MBB,
588  const MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter) {
589  // Add any successors for which this is the only un-placed in-loop
590  // predecessor to the worklist as a viable candidate for CFG-neutral
591  // placement. No subsequent placement of this block will violate the CFG
592  // shape, so we get to use heuristics to choose a favorable placement.
593  for (MachineBasicBlock *Succ : MBB->successors()) {
594  if (BlockFilter && !BlockFilter->count(Succ))
595  continue;
596  BlockChain &SuccChain = *BlockToChain[Succ];
597  // Disregard edges within a fixed chain, or edges to the loop header.
598  if (&Chain == &SuccChain || Succ == LoopHeaderBB)
599  continue;
600 
601  // This is a cross-chain edge that is within the loop, so decrement the
602  // loop predecessor count of the destination chain.
603  if (SuccChain.UnscheduledPredecessors == 0 ||
604  --SuccChain.UnscheduledPredecessors > 0)
605  continue;
606 
607  auto *NewBB = *SuccChain.begin();
608  if (NewBB->isEHPad())
609  EHPadWorkList.push_back(NewBB);
610  else
611  BlockWorkList.push_back(NewBB);
612  }
613 }
614 
615 /// This helper function collects the set of successors of block
616 /// \p BB that are allowed to be its layout successors, and return
617 /// the total branch probability of edges from \p BB to those
618 /// blocks.
619 BranchProbability MachineBlockPlacement::collectViableSuccessors(
620  const MachineBasicBlock *BB, const BlockChain &Chain,
621  const BlockFilterSet *BlockFilter,
623  // Adjust edge probabilities by excluding edges pointing to blocks that is
624  // either not in BlockFilter or is already in the current chain. Consider the
625  // following CFG:
626  //
627  // --->A
628  // | / \
629  // | B C
630  // | \ / \
631  // ----D E
632  //
633  // Assume A->C is very hot (>90%), and C->D has a 50% probability, then after
634  // A->C is chosen as a fall-through, D won't be selected as a successor of C
635  // due to CFG constraint (the probability of C->D is not greater than
636  // HotProb to break topo-order). If we exclude E that is not in BlockFilter
637  // when calculating the probability of C->D, D will be selected and we
638  // will get A C D B as the layout of this loop.
639  auto AdjustedSumProb = BranchProbability::getOne();
640  for (MachineBasicBlock *Succ : BB->successors()) {
641  bool SkipSucc = false;
642  if (Succ->isEHPad() || (BlockFilter && !BlockFilter->count(Succ))) {
643  SkipSucc = true;
644  } else {
645  BlockChain *SuccChain = BlockToChain[Succ];
646  if (SuccChain == &Chain) {
647  SkipSucc = true;
648  } else if (Succ != *SuccChain->begin()) {
649  LLVM_DEBUG(dbgs() << " " << getBlockName(Succ)
650  << " -> Mid chain!\n");
651  continue;
652  }
653  }
654  if (SkipSucc)
655  AdjustedSumProb -= MBPI->getEdgeProbability(BB, Succ);
656  else
657  Successors.push_back(Succ);
658  }
659 
660  return AdjustedSumProb;
661 }
662 
663 /// The helper function returns the branch probability that is adjusted
664 /// or normalized over the new total \p AdjustedSumProb.
665 static BranchProbability
667  BranchProbability AdjustedSumProb) {
668  BranchProbability SuccProb;
669  uint32_t SuccProbN = OrigProb.getNumerator();
670  uint32_t SuccProbD = AdjustedSumProb.getNumerator();
671  if (SuccProbN >= SuccProbD)
672  SuccProb = BranchProbability::getOne();
673  else
674  SuccProb = BranchProbability(SuccProbN, SuccProbD);
675 
676  return SuccProb;
677 }
678 
679 /// Check if \p BB has exactly the successors in \p Successors.
680 static bool
683  if (BB.succ_size() != Successors.size())
684  return false;
685  // We don't want to count self-loops
686  if (Successors.count(&BB))
687  return false;
688  for (MachineBasicBlock *Succ : BB.successors())
689  if (!Successors.count(Succ))
690  return false;
691  return true;
692 }
693 
694 /// Check if a block should be tail duplicated to increase fallthrough
695 /// opportunities.
696 /// \p BB Block to check.
697 bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
698  // Blocks with single successors don't create additional fallthrough
699  // opportunities. Don't duplicate them. TODO: When conditional exits are
700  // analyzable, allow them to be duplicated.
701  bool IsSimple = TailDup.isSimpleBB(BB);
702 
703  if (BB->succ_size() == 1)
704  return false;
705  return TailDup.shouldTailDuplicate(IsSimple, *BB);
706 }
707 
708 /// Compare 2 BlockFrequency's with a small penalty for \p A.
709 /// In order to be conservative, we apply a X% penalty to account for
710 /// increased icache pressure and static heuristics. For small frequencies
711 /// we use only the numerators to improve accuracy. For simplicity, we assume the
712 /// penalty is less than 100%
713 /// TODO(iteratee): Use 64-bit fixed point edge frequencies everywhere.
715  uint64_t EntryFreq) {
716  BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
717  BlockFrequency Gain = A - B;
718  return (Gain / ThresholdProb).getFrequency() >= EntryFreq;
719 }
720 
721 /// Check the edge frequencies to see if tail duplication will increase
722 /// fallthroughs. It only makes sense to call this function when
723 /// \p Succ would not be chosen otherwise. Tail duplication of \p Succ is
724 /// always locally profitable if we would have picked \p Succ without
725 /// considering duplication.
726 bool MachineBlockPlacement::isProfitableToTailDup(
727  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
728  BranchProbability QProb,
729  const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
730  // We need to do a probability calculation to make sure this is profitable.
731  // First: does succ have a successor that post-dominates? This affects the
732  // calculation. The 2 relevant cases are:
733  // BB BB
734  // | \Qout | \Qout
735  // P| C |P C
736  // = C' = C'
737  // | /Qin | /Qin
738  // | / | /
739  // Succ Succ
740  // / \ | \ V
741  // U/ =V |U \
742  // / \ = D
743  // D E | /
744  // | /
745  // |/
746  // PDom
747  // '=' : Branch taken for that CFG edge
748  // In the second case, Placing Succ while duplicating it into C prevents the
749  // fallthrough of Succ into either D or PDom, because they now have C as an
750  // unplaced predecessor
751 
752  // Start by figuring out which case we fall into
753  MachineBasicBlock *PDom = nullptr;
755  // Only scan the relevant successors
756  auto AdjustedSuccSumProb =
757  collectViableSuccessors(Succ, Chain, BlockFilter, SuccSuccs);
758  BranchProbability PProb = MBPI->getEdgeProbability(BB, Succ);
759  auto BBFreq = MBFI->getBlockFreq(BB);
760  auto SuccFreq = MBFI->getBlockFreq(Succ);
761  BlockFrequency P = BBFreq * PProb;
762  BlockFrequency Qout = BBFreq * QProb;
763  uint64_t EntryFreq = MBFI->getEntryFreq();
764  // If there are no more successors, it is profitable to copy, as it strictly
765  // increases fallthrough.
766  if (SuccSuccs.size() == 0)
767  return greaterWithBias(P, Qout, EntryFreq);
768 
769  auto BestSuccSucc = BranchProbability::getZero();
770  // Find the PDom or the best Succ if no PDom exists.
771  for (MachineBasicBlock *SuccSucc : SuccSuccs) {
772  auto Prob = MBPI->getEdgeProbability(Succ, SuccSucc);
773  if (Prob > BestSuccSucc)
774  BestSuccSucc = Prob;
775  if (PDom == nullptr)
776  if (MPDT->dominates(SuccSucc, Succ)) {
777  PDom = SuccSucc;
778  break;
779  }
780  }
781  // For the comparisons, we need to know Succ's best incoming edge that isn't
782  // from BB.
783  auto SuccBestPred = BlockFrequency(0);
784  for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
785  if (SuccPred == Succ || SuccPred == BB
786  || BlockToChain[SuccPred] == &Chain
787  || (BlockFilter && !BlockFilter->count(SuccPred)))
788  continue;
789  auto Freq = MBFI->getBlockFreq(SuccPred)
790  * MBPI->getEdgeProbability(SuccPred, Succ);
791  if (Freq > SuccBestPred)
792  SuccBestPred = Freq;
793  }
794  // Qin is Succ's best unplaced incoming edge that isn't BB
795  BlockFrequency Qin = SuccBestPred;
796  // If it doesn't have a post-dominating successor, here is the calculation:
797  // BB BB
798  // | \Qout | \
799  // P| C | =
800  // = C' | C
801  // | /Qin | |
802  // | / | C' (+Succ)
803  // Succ Succ /|
804  // / \ | \/ |
805  // U/ =V | == |
806  // / \ | / \|
807  // D E D E
808  // '=' : Branch taken for that CFG edge
809  // Cost in the first case is: P + V
810  // For this calculation, we always assume P > Qout. If Qout > P
811  // The result of this function will be ignored at the caller.
812  // Let F = SuccFreq - Qin
813  // Cost in the second case is: Qout + min(Qin, F) * U + max(Qin, F) * V
814 
815  if (PDom == nullptr || !Succ->isSuccessor(PDom)) {
816  BranchProbability UProb = BestSuccSucc;
817  BranchProbability VProb = AdjustedSuccSumProb - UProb;
818  BlockFrequency F = SuccFreq - Qin;
819  BlockFrequency V = SuccFreq * VProb;
820  BlockFrequency QinU = std::min(Qin, F) * UProb;
821  BlockFrequency BaseCost = P + V;
822  BlockFrequency DupCost = Qout + QinU + std::max(Qin, F) * VProb;
823  return greaterWithBias(BaseCost, DupCost, EntryFreq);
824  }
825  BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom);
826  BranchProbability VProb = AdjustedSuccSumProb - UProb;
827  BlockFrequency U = SuccFreq * UProb;
828  BlockFrequency V = SuccFreq * VProb;
829  BlockFrequency F = SuccFreq - Qin;
830  // If there is a post-dominating successor, here is the calculation:
831  // BB BB BB BB
832  // | \Qout | \ | \Qout | \
833  // |P C | = |P C | =
834  // = C' |P C = C' |P C
835  // | /Qin | | | /Qin | |
836  // | / | C' (+Succ) | / | C' (+Succ)
837  // Succ Succ /| Succ Succ /|
838  // | \ V | \/ | | \ V | \/ |
839  // |U \ |U /\ =? |U = |U /\ |
840  // = D = = =?| | D | = =|
841  // | / |/ D | / |/ D
842  // | / | / | = | /
843  // |/ | / |/ | =
844  // Dom Dom Dom Dom
845  // '=' : Branch taken for that CFG edge
846  // The cost for taken branches in the first case is P + U
847  // Let F = SuccFreq - Qin
848  // The cost in the second case (assuming independence), given the layout:
849  // BB, Succ, (C+Succ), D, Dom or the layout:
850  // BB, Succ, D, Dom, (C+Succ)
851  // is Qout + max(F, Qin) * U + min(F, Qin)
852  // compare P + U vs Qout + P * U + Qin.
853  //
854  // The 3rd and 4th cases cover when Dom would be chosen to follow Succ.
855  //
856  // For the 3rd case, the cost is P + 2 * V
857  // For the 4th case, the cost is Qout + min(Qin, F) * U + max(Qin, F) * V + V
858  // We choose 4 over 3 when (P + V) > Qout + min(Qin, F) * U + max(Qin, F) * V
859  if (UProb > AdjustedSuccSumProb / 2 &&
860  !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom], UProb, UProb,
861  Chain, BlockFilter))
862  // Cases 3 & 4
863  return greaterWithBias(
864  (P + V), (Qout + std::max(Qin, F) * VProb + std::min(Qin, F) * UProb),
865  EntryFreq);
866  // Cases 1 & 2
867  return greaterWithBias((P + U),
868  (Qout + std::min(Qin, F) * AdjustedSuccSumProb +
869  std::max(Qin, F) * UProb),
870  EntryFreq);
871 }
872 
873 /// Check for a trellis layout. \p BB is the upper part of a trellis if its
874 /// successors form the lower part of a trellis. A successor set S forms the
875 /// lower part of a trellis if all of the predecessors of S are either in S or
876 /// have all of S as successors. We ignore trellises where BB doesn't have 2
877 /// successors because for fewer than 2, it's trivial, and for 3 or greater they
878 /// are very uncommon and complex to compute optimally. Allowing edges within S
879 /// is not strictly a trellis, but the same algorithm works, so we allow it.
880 bool MachineBlockPlacement::isTrellis(
881  const MachineBasicBlock *BB,
882  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
883  const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
884  // Technically BB could form a trellis with branching factor higher than 2.
885  // But that's extremely uncommon.
886  if (BB->succ_size() != 2 || ViableSuccs.size() != 2)
887  return false;
888 
890  BB->succ_end());
891  // To avoid reviewing the same predecessors twice.
893 
894  for (MachineBasicBlock *Succ : ViableSuccs) {
895  int PredCount = 0;
896  for (auto SuccPred : Succ->predecessors()) {
897  // Allow triangle successors, but don't count them.
898  if (Successors.count(SuccPred)) {
899  // Make sure that it is actually a triangle.
900  for (MachineBasicBlock *CheckSucc : SuccPred->successors())
901  if (!Successors.count(CheckSucc))
902  return false;
903  continue;
904  }
905  const BlockChain *PredChain = BlockToChain[SuccPred];
906  if (SuccPred == BB || (BlockFilter && !BlockFilter->count(SuccPred)) ||
907  PredChain == &Chain || PredChain == BlockToChain[Succ])
908  continue;
909  ++PredCount;
910  // Perform the successor check only once.
911  if (!SeenPreds.insert(SuccPred).second)
912  continue;
913  if (!hasSameSuccessors(*SuccPred, Successors))
914  return false;
915  }
916  // If one of the successors has only BB as a predecessor, it is not a
917  // trellis.
918  if (PredCount < 1)
919  return false;
920  }
921  return true;
922 }
923 
924 /// Pick the highest total weight pair of edges that can both be laid out.
925 /// The edges in \p Edges[0] are assumed to have a different destination than
926 /// the edges in \p Edges[1]. Simple counting shows that the best pair is either
927 /// the individual highest weight edges to the 2 different destinations, or in
928 /// case of a conflict, one of them should be replaced with a 2nd best edge.
929 std::pair<MachineBlockPlacement::WeightedEdge,
930  MachineBlockPlacement::WeightedEdge>
931 MachineBlockPlacement::getBestNonConflictingEdges(
932  const MachineBasicBlock *BB,
934  Edges) {
935  // Sort the edges, and then for each successor, find the best incoming
936  // predecessor. If the best incoming predecessors aren't the same,
937  // then that is clearly the best layout. If there is a conflict, one of the
938  // successors will have to fallthrough from the second best predecessor. We
939  // compare which combination is better overall.
940 
941  // Sort for highest frequency.
942  auto Cmp = [](WeightedEdge A, WeightedEdge B) { return A.Weight > B.Weight; };
943 
944  std::stable_sort(Edges[0].begin(), Edges[0].end(), Cmp);
945  std::stable_sort(Edges[1].begin(), Edges[1].end(), Cmp);
946  auto BestA = Edges[0].begin();
947  auto BestB = Edges[1].begin();
948  // Arrange for the correct answer to be in BestA and BestB
949  // If the 2 best edges don't conflict, the answer is already there.
950  if (BestA->Src == BestB->Src) {
951  // Compare the total fallthrough of (Best + Second Best) for both pairs
952  auto SecondBestA = std::next(BestA);
953  auto SecondBestB = std::next(BestB);
954  BlockFrequency BestAScore = BestA->Weight + SecondBestB->Weight;
955  BlockFrequency BestBScore = BestB->Weight + SecondBestA->Weight;
956  if (BestAScore < BestBScore)
957  BestA = SecondBestA;
958  else
959  BestB = SecondBestB;
960  }
961  // Arrange for the BB edge to be in BestA if it exists.
962  if (BestB->Src == BB)
963  std::swap(BestA, BestB);
964  return std::make_pair(*BestA, *BestB);
965 }
966 
967 /// Get the best successor from \p BB based on \p BB being part of a trellis.
968 /// We only handle trellises with 2 successors, so the algorithm is
969 /// straightforward: Find the best pair of edges that don't conflict. We find
970 /// the best incoming edge for each successor in the trellis. If those conflict,
971 /// we consider which of them should be replaced with the second best.
972 /// Upon return the two best edges will be in \p BestEdges. If one of the edges
973 /// comes from \p BB, it will be in \p BestEdges[0]
974 MachineBlockPlacement::BlockAndTailDupResult
975 MachineBlockPlacement::getBestTrellisSuccessor(
976  const MachineBasicBlock *BB,
977  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
978  BranchProbability AdjustedSumProb, const BlockChain &Chain,
979  const BlockFilterSet *BlockFilter) {
980 
981  BlockAndTailDupResult Result = {nullptr, false};
983  BB->succ_end());
984 
985  // We assume size 2 because it's common. For general n, we would have to do
986  // the Hungarian algorithm, but it's not worth the complexity because more
987  // than 2 successors is fairly uncommon, and a trellis even more so.
988  if (Successors.size() != 2 || ViableSuccs.size() != 2)
989  return Result;
990 
991  // Collect the edge frequencies of all edges that form the trellis.
993  int SuccIndex = 0;
994  for (auto Succ : ViableSuccs) {
995  for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
996  // Skip any placed predecessors that are not BB
997  if (SuccPred != BB)
998  if ((BlockFilter && !BlockFilter->count(SuccPred)) ||
999  BlockToChain[SuccPred] == &Chain ||
1000  BlockToChain[SuccPred] == BlockToChain[Succ])
1001  continue;
1002  BlockFrequency EdgeFreq = MBFI->getBlockFreq(SuccPred) *
1003  MBPI->getEdgeProbability(SuccPred, Succ);
1004  Edges[SuccIndex].push_back({EdgeFreq, SuccPred, Succ});
1005  }
1006  ++SuccIndex;
1007  }
1008 
1009  // Pick the best combination of 2 edges from all the edges in the trellis.
1010  WeightedEdge BestA, BestB;
1011  std::tie(BestA, BestB) = getBestNonConflictingEdges(BB, Edges);
1012 
1013  if (BestA.Src != BB) {
1014  // If we have a trellis, and BB doesn't have the best fallthrough edges,
1015  // we shouldn't choose any successor. We've already looked and there's a
1016  // better fallthrough edge for all the successors.
1017  LLVM_DEBUG(dbgs() << "Trellis, but not one of the chosen edges.\n");
1018  return Result;
1019  }
1020 
1021  // Did we pick the triangle edge? If tail-duplication is profitable, do
1022  // that instead. Otherwise merge the triangle edge now while we know it is
1023  // optimal.
1024  if (BestA.Dest == BestB.Src) {
1025  // The edges are BB->Succ1->Succ2, and we're looking to see if BB->Succ2
1026  // would be better.
1027  MachineBasicBlock *Succ1 = BestA.Dest;
1028  MachineBasicBlock *Succ2 = BestB.Dest;
1029  // Check to see if tail-duplication would be profitable.
1030  if (allowTailDupPlacement() && shouldTailDuplicate(Succ2) &&
1031  canTailDuplicateUnplacedPreds(BB, Succ2, Chain, BlockFilter) &&
1032  isProfitableToTailDup(BB, Succ2, MBPI->getEdgeProbability(BB, Succ1),
1033  Chain, BlockFilter)) {
1035  MBPI->getEdgeProbability(BB, Succ2), AdjustedSumProb);
1036  dbgs() << " Selected: " << getBlockName(Succ2)
1037  << ", probability: " << Succ2Prob
1038  << " (Tail Duplicate)\n");
1039  Result.BB = Succ2;
1040  Result.ShouldTailDup = true;
1041  return Result;
1042  }
1043  }
1044  // We have already computed the optimal edge for the other side of the
1045  // trellis.
1046  ComputedEdges[BestB.Src] = { BestB.Dest, false };
1047 
1048  auto TrellisSucc = BestA.Dest;
1050  MBPI->getEdgeProbability(BB, TrellisSucc), AdjustedSumProb);
1051  dbgs() << " Selected: " << getBlockName(TrellisSucc)
1052  << ", probability: " << SuccProb << " (Trellis)\n");
1053  Result.BB = TrellisSucc;
1054  return Result;
1055 }
1056 
1057 /// When the option allowTailDupPlacement() is on, this method checks if the
1058 /// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated
1059 /// into all of its unplaced, unfiltered predecessors, that are not BB.
1060 bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
1061  const MachineBasicBlock *BB, MachineBasicBlock *Succ,
1062  const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
1063  if (!shouldTailDuplicate(Succ))
1064  return false;
1065 
1066  // For CFG checking.
1068  BB->succ_end());
1069  for (MachineBasicBlock *Pred : Succ->predecessors()) {
1070  // Make sure all unplaced and unfiltered predecessors can be
1071  // tail-duplicated into.
1072  // Skip any blocks that are already placed or not in this loop.
1073  if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred))
1074  || BlockToChain[Pred] == &Chain)
1075  continue;
1076  if (!TailDup.canTailDuplicate(Succ, Pred)) {
1077  if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors))
1078  // This will result in a trellis after tail duplication, so we don't
1079  // need to copy Succ into this predecessor. In the presence
1080  // of a trellis tail duplication can continue to be profitable.
1081  // For example:
1082  // A A
1083  // |\ |\
1084  // | \ | \
1085  // | C | C+BB
1086  // | / | |
1087  // |/ | |
1088  // BB => BB |
1089  // |\ |\/|
1090  // | \ |/\|
1091  // | D | D
1092  // | / | /
1093  // |/ |/
1094  // Succ Succ
1095  //
1096  // After BB was duplicated into C, the layout looks like the one on the
1097  // right. BB and C now have the same successors. When considering
1098  // whether Succ can be duplicated into all its unplaced predecessors, we
1099  // ignore C.
1100  // We can do this because C already has a profitable fallthrough, namely
1101  // D. TODO(iteratee): ignore sufficiently cold predecessors for
1102  // duplication and for this test.
1103  //
1104  // This allows trellises to be laid out in 2 separate chains
1105  // (A,B,Succ,...) and later (C,D,...) This is a reasonable heuristic
1106  // because it allows the creation of 2 fallthrough paths with links
1107  // between them, and we correctly identify the best layout for these
1108  // CFGs. We want to extend trellises that the user created in addition
1109  // to trellises created by tail-duplication, so we just look for the
1110  // CFG.
1111  continue;
1112  return false;
1113  }
1114  }
1115  return true;
1116 }
1117 
1118 /// Find chains of triangles where we believe it would be profitable to
1119 /// tail-duplicate them all, but a local analysis would not find them.
1120 /// There are 3 ways this can be profitable:
1121 /// 1) The post-dominators marked 50% are actually taken 55% (This shrinks with
1122 /// longer chains)
1123 /// 2) The chains are statically correlated. Branch probabilities have a very
1124 /// U-shaped distribution.
1125 /// [http://nrs.harvard.edu/urn-3:HUL.InstRepos:24015805]
1126 /// If the branches in a chain are likely to be from the same side of the
1127 /// distribution as their predecessor, but are independent at runtime, this
1128 /// transformation is profitable. (Because the cost of being wrong is a small
1129 /// fixed cost, unlike the standard triangle layout where the cost of being
1130 /// wrong scales with the # of triangles.)
1131 /// 3) The chains are dynamically correlated. If the probability that a previous
1132 /// branch was taken positively influences whether the next branch will be
1133 /// taken
1134 /// We believe that 2 and 3 are common enough to justify the small margin in 1.
1135 void MachineBlockPlacement::precomputeTriangleChains() {
1136  struct TriangleChain {
1137  std::vector<MachineBasicBlock *> Edges;
1138 
1139  TriangleChain(MachineBasicBlock *src, MachineBasicBlock *dst)
1140  : Edges({src, dst}) {}
1141 
1142  void append(MachineBasicBlock *dst) {
1143  assert(getKey()->isSuccessor(dst) &&
1144  "Attempting to append a block that is not a successor.");
1145  Edges.push_back(dst);
1146  }
1147 
1148  unsigned count() const { return Edges.size() - 1; }
1149 
1150  MachineBasicBlock *getKey() const {
1151  return Edges.back();
1152  }
1153  };
1154 
1155  if (TriangleChainCount == 0)
1156  return;
1157 
1158  LLVM_DEBUG(dbgs() << "Pre-computing triangle chains.\n");
1159  // Map from last block to the chain that contains it. This allows us to extend
1160  // chains as we find new triangles.
1162  for (MachineBasicBlock &BB : *F) {
1163  // If BB doesn't have 2 successors, it doesn't start a triangle.
1164  if (BB.succ_size() != 2)
1165  continue;
1166  MachineBasicBlock *PDom = nullptr;
1167  for (MachineBasicBlock *Succ : BB.successors()) {
1168  if (!MPDT->dominates(Succ, &BB))
1169  continue;
1170  PDom = Succ;
1171  break;
1172  }
1173  // If BB doesn't have a post-dominating successor, it doesn't form a
1174  // triangle.
1175  if (PDom == nullptr)
1176  continue;
1177  // If PDom has a hint that it is low probability, skip this triangle.
1178  if (MBPI->getEdgeProbability(&BB, PDom) < BranchProbability(50, 100))
1179  continue;
1180  // If PDom isn't eligible for duplication, this isn't the kind of triangle
1181  // we're looking for.
1182  if (!shouldTailDuplicate(PDom))
1183  continue;
1184  bool CanTailDuplicate = true;
1185  // If PDom can't tail-duplicate into it's non-BB predecessors, then this
1186  // isn't the kind of triangle we're looking for.
1187  for (MachineBasicBlock* Pred : PDom->predecessors()) {
1188  if (Pred == &BB)
1189  continue;
1190  if (!TailDup.canTailDuplicate(PDom, Pred)) {
1191  CanTailDuplicate = false;
1192  break;
1193  }
1194  }
1195  // If we can't tail-duplicate PDom to its predecessors, then skip this
1196  // triangle.
1197  if (!CanTailDuplicate)
1198  continue;
1199 
1200  // Now we have an interesting triangle. Insert it if it's not part of an
1201  // existing chain.
1202  // Note: This cannot be replaced with a call insert() or emplace() because
1203  // the find key is BB, but the insert/emplace key is PDom.
1204  auto Found = TriangleChainMap.find(&BB);
1205  // If it is, remove the chain from the map, grow it, and put it back in the
1206  // map with the end as the new key.
1207  if (Found != TriangleChainMap.end()) {
1208  TriangleChain Chain = std::move(Found->second);
1209  TriangleChainMap.erase(Found);
1210  Chain.append(PDom);
1211  TriangleChainMap.insert(std::make_pair(Chain.getKey(), std::move(Chain)));
1212  } else {
1213  auto InsertResult = TriangleChainMap.try_emplace(PDom, &BB, PDom);
1214  assert(InsertResult.second && "Block seen twice.");
1215  (void)InsertResult;
1216  }
1217  }
1218 
1219  // Iterating over a DenseMap is safe here, because the only thing in the body
1220  // of the loop is inserting into another DenseMap (ComputedEdges).
1221  // ComputedEdges is never iterated, so this doesn't lead to non-determinism.
1222  for (auto &ChainPair : TriangleChainMap) {
1223  TriangleChain &Chain = ChainPair.second;
1224  // Benchmarking has shown that due to branch correlation duplicating 2 or
1225  // more triangles is profitable, despite the calculations assuming
1226  // independence.
1227  if (Chain.count() < TriangleChainCount)
1228  continue;
1229  MachineBasicBlock *dst = Chain.Edges.back();
1230  Chain.Edges.pop_back();
1231  for (MachineBasicBlock *src : reverse(Chain.Edges)) {
1232  LLVM_DEBUG(dbgs() << "Marking edge: " << getBlockName(src) << "->"
1233  << getBlockName(dst)
1234  << " as pre-computed based on triangles.\n");
1235 
1236  auto InsertResult = ComputedEdges.insert({src, {dst, true}});
1237  assert(InsertResult.second && "Block seen twice.");
1238  (void)InsertResult;
1239 
1240  dst = src;
1241  }
1242  }
1243 }
1244 
1245 // When profile is not present, return the StaticLikelyProb.
1246 // When profile is available, we need to handle the triangle-shape CFG.
1248  const MachineBasicBlock *BB) {
1249  if (!BB->getParent()->getFunction().hasProfileData())
1250  return BranchProbability(StaticLikelyProb, 100);
1251  if (BB->succ_size() == 2) {
1252  const MachineBasicBlock *Succ1 = *BB->succ_begin();
1253  const MachineBasicBlock *Succ2 = *(BB->succ_begin() + 1);
1254  if (Succ1->isSuccessor(Succ2) || Succ2->isSuccessor(Succ1)) {
1255  /* See case 1 below for the cost analysis. For BB->Succ to
1256  * be taken with smaller cost, the following needs to hold:
1257  * Prob(BB->Succ) > 2 * Prob(BB->Pred)
1258  * So the threshold T in the calculation below
1259  * (1-T) * Prob(BB->Succ) > T * Prob(BB->Pred)
1260  * So T / (1 - T) = 2, Yielding T = 2/3
1261  * Also adding user specified branch bias, we have
1262  * T = (2/3)*(ProfileLikelyProb/50)
1263  * = (2*ProfileLikelyProb)/150)
1264  */
1265  return BranchProbability(2 * ProfileLikelyProb, 150);
1266  }
1267  }
1268  return BranchProbability(ProfileLikelyProb, 100);
1269 }
1270 
1271 /// Checks to see if the layout candidate block \p Succ has a better layout
1272 /// predecessor than \c BB. If yes, returns true.
1273 /// \p SuccProb: The probability adjusted for only remaining blocks.
1274 /// Only used for logging
1275 /// \p RealSuccProb: The un-adjusted probability.
1276 /// \p Chain: The chain that BB belongs to and Succ is being considered for.
1277 /// \p BlockFilter: if non-null, the set of blocks that make up the loop being
1278 /// considered
1279 bool MachineBlockPlacement::hasBetterLayoutPredecessor(
1280  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
1281  const BlockChain &SuccChain, BranchProbability SuccProb,
1282  BranchProbability RealSuccProb, const BlockChain &Chain,
1283  const BlockFilterSet *BlockFilter) {
1284 
1285  // There isn't a better layout when there are no unscheduled predecessors.
1286  if (SuccChain.UnscheduledPredecessors == 0)
1287  return false;
1288 
1289  // There are two basic scenarios here:
1290  // -------------------------------------
1291  // Case 1: triangular shape CFG (if-then):
1292  // BB
1293  // | \
1294  // | \
1295  // | Pred
1296  // | /
1297  // Succ
1298  // In this case, we are evaluating whether to select edge -> Succ, e.g.
1299  // set Succ as the layout successor of BB. Picking Succ as BB's
1300  // successor breaks the CFG constraints (FIXME: define these constraints).
1301  // With this layout, Pred BB
1302  // is forced to be outlined, so the overall cost will be cost of the
1303  // branch taken from BB to Pred, plus the cost of back taken branch
1304  // from Pred to Succ, as well as the additional cost associated
1305  // with the needed unconditional jump instruction from Pred To Succ.
1306 
1307  // The cost of the topological order layout is the taken branch cost
1308  // from BB to Succ, so to make BB->Succ a viable candidate, the following
1309  // must hold:
1310  // 2 * freq(BB->Pred) * taken_branch_cost + unconditional_jump_cost
1311  // < freq(BB->Succ) * taken_branch_cost.
1312  // Ignoring unconditional jump cost, we get
1313  // freq(BB->Succ) > 2 * freq(BB->Pred), i.e.,
1314  // prob(BB->Succ) > 2 * prob(BB->Pred)
1315  //
1316  // When real profile data is available, we can precisely compute the
1317  // probability threshold that is needed for edge BB->Succ to be considered.
1318  // Without profile data, the heuristic requires the branch bias to be
1319  // a lot larger to make sure the signal is very strong (e.g. 80% default).
1320  // -----------------------------------------------------------------
1321  // Case 2: diamond like CFG (if-then-else):
1322  // S
1323  // / \
1324  // | \
1325  // BB Pred
1326  // \ /
1327  // Succ
1328  // ..
1329  //
1330  // The current block is BB and edge BB->Succ is now being evaluated.
1331  // Note that edge S->BB was previously already selected because
1332  // prob(S->BB) > prob(S->Pred).
1333  // At this point, 2 blocks can be placed after BB: Pred or Succ. If we
1334  // choose Pred, we will have a topological ordering as shown on the left
1335  // in the picture below. If we choose Succ, we have the solution as shown
1336  // on the right:
1337  //
1338  // topo-order:
1339  //
1340  // S----- ---S
1341  // | | | |
1342  // ---BB | | BB
1343  // | | | |
1344  // | Pred-- | Succ--
1345  // | | | |
1346  // ---Succ ---Pred--
1347  //
1348  // cost = freq(S->Pred) + freq(BB->Succ) cost = 2 * freq (S->Pred)
1349  // = freq(S->Pred) + freq(S->BB)
1350  //
1351  // If we have profile data (i.e, branch probabilities can be trusted), the
1352  // cost (number of taken branches) with layout S->BB->Succ->Pred is 2 *
1353  // freq(S->Pred) while the cost of topo order is freq(S->Pred) + freq(S->BB).
1354  // We know Prob(S->BB) > Prob(S->Pred), so freq(S->BB) > freq(S->Pred), which
1355  // means the cost of topological order is greater.
1356  // When profile data is not available, however, we need to be more
1357  // conservative. If the branch prediction is wrong, breaking the topo-order
1358  // will actually yield a layout with large cost. For this reason, we need
1359  // strong biased branch at block S with Prob(S->BB) in order to select
1360  // BB->Succ. This is equivalent to looking the CFG backward with backward
1361  // edge: Prob(Succ->BB) needs to >= HotProb in order to be selected (without
1362  // profile data).
1363  // --------------------------------------------------------------------------
1364  // Case 3: forked diamond
1365  // S
1366  // / \
1367  // / \
1368  // BB Pred
1369  // | \ / |
1370  // | \ / |
1371  // | X |
1372  // | / \ |
1373  // | / \ |
1374  // S1 S2
1375  //
1376  // The current block is BB and edge BB->S1 is now being evaluated.
1377  // As above S->BB was already selected because
1378  // prob(S->BB) > prob(S->Pred). Assume that prob(BB->S1) >= prob(BB->S2).
1379  //
1380  // topo-order:
1381  //
1382  // S-------| ---S
1383  // | | | |
1384  // ---BB | | BB
1385  // | | | |
1386  // | Pred----| | S1----
1387  // | | | |
1388  // --(S1 or S2) ---Pred--
1389  // |
1390  // S2
1391  //
1392  // topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2)
1393  // + min(freq(Pred->S1), freq(Pred->S2))
1394  // Non-topo-order cost:
1395  // non-topo-cost = 2 * freq(S->Pred) + freq(BB->S2).
1396  // To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2))
1397  // is 0. Then the non topo layout is better when
1398  // freq(S->Pred) < freq(BB->S1).
1399  // This is exactly what is checked below.
1400  // Note there are other shapes that apply (Pred may not be a single block,
1401  // but they all fit this general pattern.)
1403 
1404  // Make sure that a hot successor doesn't have a globally more
1405  // important predecessor.
1406  BlockFrequency CandidateEdgeFreq = MBFI->getBlockFreq(BB) * RealSuccProb;
1407  bool BadCFGConflict = false;
1408 
1409  for (MachineBasicBlock *Pred : Succ->predecessors()) {
1410  if (Pred == Succ || BlockToChain[Pred] == &SuccChain ||
1411  (BlockFilter && !BlockFilter->count(Pred)) ||
1412  BlockToChain[Pred] == &Chain ||
1413  // This check is redundant except for look ahead. This function is
1414  // called for lookahead by isProfitableToTailDup when BB hasn't been
1415  // placed yet.
1416  (Pred == BB))
1417  continue;
1418  // Do backward checking.
1419  // For all cases above, we need a backward checking to filter out edges that
1420  // are not 'strongly' biased.
1421  // BB Pred
1422  // \ /
1423  // Succ
1424  // We select edge BB->Succ if
1425  // freq(BB->Succ) > freq(Succ) * HotProb
1426  // i.e. freq(BB->Succ) > freq(BB->Succ) * HotProb + freq(Pred->Succ) *
1427  // HotProb
1428  // i.e. freq((BB->Succ) * (1 - HotProb) > freq(Pred->Succ) * HotProb
1429  // Case 1 is covered too, because the first equation reduces to:
1430  // prob(BB->Succ) > HotProb. (freq(Succ) = freq(BB) for a triangle)
1431  BlockFrequency PredEdgeFreq =
1432  MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Succ);
1433  if (PredEdgeFreq * HotProb >= CandidateEdgeFreq * HotProb.getCompl()) {
1434  BadCFGConflict = true;
1435  break;
1436  }
1437  }
1438 
1439  if (BadCFGConflict) {
1440  LLVM_DEBUG(dbgs() << " Not a candidate: " << getBlockName(Succ) << " -> "
1441  << SuccProb << " (prob) (non-cold CFG conflict)\n");
1442  return true;
1443  }
1444 
1445  return false;
1446 }
1447 
1448 /// Select the best successor for a block.
1449 ///
1450 /// This looks across all successors of a particular block and attempts to
1451 /// select the "best" one to be the layout successor. It only considers direct
1452 /// successors which also pass the block filter. It will attempt to avoid
1453 /// breaking CFG structure, but cave and break such structures in the case of
1454 /// very hot successor edges.
1455 ///
1456 /// \returns The best successor block found, or null if none are viable, along
1457 /// with a boolean indicating if tail duplication is necessary.
1458 MachineBlockPlacement::BlockAndTailDupResult
1459 MachineBlockPlacement::selectBestSuccessor(
1460  const MachineBasicBlock *BB, const BlockChain &Chain,
1461  const BlockFilterSet *BlockFilter) {
1462  const BranchProbability HotProb(StaticLikelyProb, 100);
1463 
1464  BlockAndTailDupResult BestSucc = { nullptr, false };
1465  auto BestProb = BranchProbability::getZero();
1466 
1468  auto AdjustedSumProb =
1469  collectViableSuccessors(BB, Chain, BlockFilter, Successors);
1470 
1471  LLVM_DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB)
1472  << "\n");
1473 
1474  // if we already precomputed the best successor for BB, return that if still
1475  // applicable.
1476  auto FoundEdge = ComputedEdges.find(BB);
1477  if (FoundEdge != ComputedEdges.end()) {
1478  MachineBasicBlock *Succ = FoundEdge->second.BB;
1479  ComputedEdges.erase(FoundEdge);
1480  BlockChain *SuccChain = BlockToChain[Succ];
1481  if (BB->isSuccessor(Succ) && (!BlockFilter || BlockFilter->count(Succ)) &&
1482  SuccChain != &Chain && Succ == *SuccChain->begin())
1483  return FoundEdge->second;
1484  }
1485 
1486  // if BB is part of a trellis, Use the trellis to determine the optimal
1487  // fallthrough edges
1488  if (isTrellis(BB, Successors, Chain, BlockFilter))
1489  return getBestTrellisSuccessor(BB, Successors, AdjustedSumProb, Chain,
1490  BlockFilter);
1491 
1492  // For blocks with CFG violations, we may be able to lay them out anyway with
1493  // tail-duplication. We keep this vector so we can perform the probability
1494  // calculations the minimum number of times.
1496  DupCandidates;
1497  for (MachineBasicBlock *Succ : Successors) {
1498  auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);
1499  BranchProbability SuccProb =
1500  getAdjustedProbability(RealSuccProb, AdjustedSumProb);
1501 
1502  BlockChain &SuccChain = *BlockToChain[Succ];
1503  // Skip the edge \c BB->Succ if block \c Succ has a better layout
1504  // predecessor that yields lower global cost.
1505  if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb,
1506  Chain, BlockFilter)) {
1507  // If tail duplication would make Succ profitable, place it.
1508  if (allowTailDupPlacement() && shouldTailDuplicate(Succ))
1509  DupCandidates.push_back(std::make_tuple(SuccProb, Succ));
1510  continue;
1511  }
1512 
1513  LLVM_DEBUG(
1514  dbgs() << " Candidate: " << getBlockName(Succ)
1515  << ", probability: " << SuccProb
1516  << (SuccChain.UnscheduledPredecessors != 0 ? " (CFG break)" : "")
1517  << "\n");
1518 
1519  if (BestSucc.BB && BestProb >= SuccProb) {
1520  LLVM_DEBUG(dbgs() << " Not the best candidate, continuing\n");
1521  continue;
1522  }
1523 
1524  LLVM_DEBUG(dbgs() << " Setting it as best candidate\n");
1525  BestSucc.BB = Succ;
1526  BestProb = SuccProb;
1527  }
1528  // Handle the tail duplication candidates in order of decreasing probability.
1529  // Stop at the first one that is profitable. Also stop if they are less
1530  // profitable than BestSucc. Position is important because we preserve it and
1531  // prefer first best match. Here we aren't comparing in order, so we capture
1532  // the position instead.
1533  if (DupCandidates.size() != 0) {
1534  auto cmp =
1535  [](const std::tuple<BranchProbability, MachineBasicBlock *> &a,
1536  const std::tuple<BranchProbability, MachineBasicBlock *> &b) {
1537  return std::get<0>(a) > std::get<0>(b);
1538  };
1539  std::stable_sort(DupCandidates.begin(), DupCandidates.end(), cmp);
1540  }
1541  for(auto &Tup : DupCandidates) {
1542  BranchProbability DupProb;
1543  MachineBasicBlock *Succ;
1544  std::tie(DupProb, Succ) = Tup;
1545  if (DupProb < BestProb)
1546  break;
1547  if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter)
1548  && (isProfitableToTailDup(BB, Succ, BestProb, Chain, BlockFilter))) {
1549  LLVM_DEBUG(dbgs() << " Candidate: " << getBlockName(Succ)
1550  << ", probability: " << DupProb
1551  << " (Tail Duplicate)\n");
1552  BestSucc.BB = Succ;
1553  BestSucc.ShouldTailDup = true;
1554  break;
1555  }
1556  }
1557 
1558  if (BestSucc.BB)
1559  LLVM_DEBUG(dbgs() << " Selected: " << getBlockName(BestSucc.BB) << "\n");
1560 
1561  return BestSucc;
1562 }
1563 
1564 /// Select the best block from a worklist.
1565 ///
1566 /// This looks through the provided worklist as a list of candidate basic
1567 /// blocks and select the most profitable one to place. The definition of
1568 /// profitable only really makes sense in the context of a loop. This returns
1569 /// the most frequently visited block in the worklist, which in the case of
1570 /// a loop, is the one most desirable to be physically close to the rest of the
1571 /// loop body in order to improve i-cache behavior.
1572 ///
1573 /// \returns The best block found, or null if none are viable.
1574 MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
1575  const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) {
1576  // Once we need to walk the worklist looking for a candidate, cleanup the
1577  // worklist of already placed entries.
1578  // FIXME: If this shows up on profiles, it could be folded (at the cost of
1579  // some code complexity) into the loop below.
1580  WorkList.erase(llvm::remove_if(WorkList,
1581  [&](MachineBasicBlock *BB) {
1582  return BlockToChain.lookup(BB) == &Chain;
1583  }),
1584  WorkList.end());
1585 
1586  if (WorkList.empty())
1587  return nullptr;
1588 
1589  bool IsEHPad = WorkList[0]->isEHPad();
1590 
1591  MachineBasicBlock *BestBlock = nullptr;
1592  BlockFrequency BestFreq;
1593  for (MachineBasicBlock *MBB : WorkList) {
1594  assert(MBB->isEHPad() == IsEHPad &&
1595  "EHPad mismatch between block and work list.");
1596 
1597  BlockChain &SuccChain = *BlockToChain[MBB];
1598  if (&SuccChain == &Chain)
1599  continue;
1600 
1601  assert(SuccChain.UnscheduledPredecessors == 0 &&
1602  "Found CFG-violating block");
1603 
1604  BlockFrequency CandidateFreq = MBFI->getBlockFreq(MBB);
1605  LLVM_DEBUG(dbgs() << " " << getBlockName(MBB) << " -> ";
1606  MBFI->printBlockFreq(dbgs(), CandidateFreq) << " (freq)\n");
1607 
1608  // For ehpad, we layout the least probable first as to avoid jumping back
1609  // from least probable landingpads to more probable ones.
1610  //
1611  // FIXME: Using probability is probably (!) not the best way to achieve
1612  // this. We should probably have a more principled approach to layout
1613  // cleanup code.
1614  //
1615  // The goal is to get:
1616  //
1617  // +--------------------------+
1618  // | V
1619  // InnerLp -> InnerCleanup OuterLp -> OuterCleanup -> Resume
1620  //
1621  // Rather than:
1622  //
1623  // +-------------------------------------+
1624  // V |
1625  // OuterLp -> OuterCleanup -> Resume InnerLp -> InnerCleanup
1626  if (BestBlock && (IsEHPad ^ (BestFreq >= CandidateFreq)))
1627  continue;
1628 
1629  BestBlock = MBB;
1630  BestFreq = CandidateFreq;
1631  }
1632 
1633  return BestBlock;
1634 }
1635 
1636 /// Retrieve the first unplaced basic block.
1637 ///
1638 /// This routine is called when we are unable to use the CFG to walk through
1639 /// all of the basic blocks and form a chain due to unnatural loops in the CFG.
1640 /// We walk through the function's blocks in order, starting from the
1641 /// LastUnplacedBlockIt. We update this iterator on each call to avoid
1642 /// re-scanning the entire sequence on repeated calls to this routine.
1643 MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
1644  const BlockChain &PlacedChain,
1645  MachineFunction::iterator &PrevUnplacedBlockIt,
1646  const BlockFilterSet *BlockFilter) {
1647  for (MachineFunction::iterator I = PrevUnplacedBlockIt, E = F->end(); I != E;
1648  ++I) {
1649  if (BlockFilter && !BlockFilter->count(&*I))
1650  continue;
1651  if (BlockToChain[&*I] != &PlacedChain) {
1652  PrevUnplacedBlockIt = I;
1653  // Now select the head of the chain to which the unplaced block belongs
1654  // as the block to place. This will force the entire chain to be placed,
1655  // and satisfies the requirements of merging chains.
1656  return *BlockToChain[&*I]->begin();
1657  }
1658  }
1659  return nullptr;
1660 }
1661 
1662 void MachineBlockPlacement::fillWorkLists(
1663  const MachineBasicBlock *MBB,
1664  SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
1665  const BlockFilterSet *BlockFilter = nullptr) {
1666  BlockChain &Chain = *BlockToChain[MBB];
1667  if (!UpdatedPreds.insert(&Chain).second)
1668  return;
1669 
1670  assert(
1671  Chain.UnscheduledPredecessors == 0 &&
1672  "Attempting to place block with unscheduled predecessors in worklist.");
1673  for (MachineBasicBlock *ChainBB : Chain) {
1674  assert(BlockToChain[ChainBB] == &Chain &&
1675  "Block in chain doesn't match BlockToChain map.");
1676  for (MachineBasicBlock *Pred : ChainBB->predecessors()) {
1677  if (BlockFilter && !BlockFilter->count(Pred))
1678  continue;
1679  if (BlockToChain[Pred] == &Chain)
1680  continue;
1681  ++Chain.UnscheduledPredecessors;
1682  }
1683  }
1684 
1685  if (Chain.UnscheduledPredecessors != 0)
1686  return;
1687 
1688  MachineBasicBlock *BB = *Chain.begin();
1689  if (BB->isEHPad())
1690  EHPadWorkList.push_back(BB);
1691  else
1692  BlockWorkList.push_back(BB);
1693 }
1694 
1695 void MachineBlockPlacement::buildChain(
1696  const MachineBasicBlock *HeadBB, BlockChain &Chain,
1697  BlockFilterSet *BlockFilter) {
1698  assert(HeadBB && "BB must not be null.\n");
1699  assert(BlockToChain[HeadBB] == &Chain && "BlockToChainMap mis-match.\n");
1700  MachineFunction::iterator PrevUnplacedBlockIt = F->begin();
1701 
1702  const MachineBasicBlock *LoopHeaderBB = HeadBB;
1703  markChainSuccessors(Chain, LoopHeaderBB, BlockFilter);
1704  MachineBasicBlock *BB = *std::prev(Chain.end());
1705  while (true) {
1706  assert(BB && "null block found at end of chain in loop.");
1707  assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match in loop.");
1708  assert(*std::prev(Chain.end()) == BB && "BB Not found at end of chain.");
1709 
1710 
1711  // Look for the best viable successor if there is one to place immediately
1712  // after this block.
1713  auto Result = selectBestSuccessor(BB, Chain, BlockFilter);
1714  MachineBasicBlock* BestSucc = Result.BB;
1715  bool ShouldTailDup = Result.ShouldTailDup;
1716  if (allowTailDupPlacement())
1717  ShouldTailDup |= (BestSucc && shouldTailDuplicate(BestSucc));
1718 
1719  // If an immediate successor isn't available, look for the best viable
1720  // block among those we've identified as not violating the loop's CFG at
1721  // this point. This won't be a fallthrough, but it will increase locality.
1722  if (!BestSucc)
1723  BestSucc = selectBestCandidateBlock(Chain, BlockWorkList);
1724  if (!BestSucc)
1725  BestSucc = selectBestCandidateBlock(Chain, EHPadWorkList);
1726 
1727  if (!BestSucc) {
1728  BestSucc = getFirstUnplacedBlock(Chain, PrevUnplacedBlockIt, BlockFilter);
1729  if (!BestSucc)
1730  break;
1731 
1732  LLVM_DEBUG(dbgs() << "Unnatural loop CFG detected, forcibly merging the "
1733  "layout successor until the CFG reduces\n");
1734  }
1735 
1736  // Placement may have changed tail duplication opportunities.
1737  // Check for that now.
1738  if (allowTailDupPlacement() && BestSucc && ShouldTailDup) {
1739  // If the chosen successor was duplicated into all its predecessors,
1740  // don't bother laying it out, just go round the loop again with BB as
1741  // the chain end.
1742  if (repeatedlyTailDuplicateBlock(BestSucc, BB, LoopHeaderBB, Chain,
1743  BlockFilter, PrevUnplacedBlockIt))
1744  continue;
1745  }
1746 
1747  // Place this block, updating the datastructures to reflect its placement.
1748  BlockChain &SuccChain = *BlockToChain[BestSucc];
1749  // Zero out UnscheduledPredecessors for the successor we're about to merge in case
1750  // we selected a successor that didn't fit naturally into the CFG.
1751  SuccChain.UnscheduledPredecessors = 0;
1752  LLVM_DEBUG(dbgs() << "Merging from " << getBlockName(BB) << " to "
1753  << getBlockName(BestSucc) << "\n");
1754  markChainSuccessors(SuccChain, LoopHeaderBB, BlockFilter);
1755  Chain.merge(BestSucc, &SuccChain);
1756  BB = *std::prev(Chain.end());
1757  }
1758 
1759  LLVM_DEBUG(dbgs() << "Finished forming chain for header block "
1760  << getBlockName(*Chain.begin()) << "\n");
1761 }
1762 
1763 // If bottom of block BB has only one successor OldTop, in most cases it is
1764 // profitable to move it before OldTop, except the following case:
1765 //
1766 // -->OldTop<-
1767 // | . |
1768 // | . |
1769 // | . |
1770 // ---Pred |
1771 // | |
1772 // BB-----
1773 //
1774 // If BB is moved before OldTop, Pred needs a taken branch to BB, and it can't
1775 // layout the other successor below it, so it can't reduce taken branch.
1776 // In this case we keep its original layout.
1777 bool
1778 MachineBlockPlacement::canMoveBottomBlockToTop(
1779  const MachineBasicBlock *BottomBlock,
1780  const MachineBasicBlock *OldTop) {
1781  if (BottomBlock->pred_size() != 1)
1782  return true;
1783  MachineBasicBlock *Pred = *BottomBlock->pred_begin();
1784  if (Pred->succ_size() != 2)
1785  return true;
1786 
1787  MachineBasicBlock *OtherBB = *Pred->succ_begin();
1788  if (OtherBB == BottomBlock)
1789  OtherBB = *Pred->succ_rbegin();
1790  if (OtherBB == OldTop)
1791  return false;
1792 
1793  return true;
1794 }
1795 
1796 /// Find the best loop top block for layout.
1797 ///
1798 /// Look for a block which is strictly better than the loop header for laying
1799 /// out at the top of the loop. This looks for one and only one pattern:
1800 /// a latch block with no conditional exit. This block will cause a conditional
1801 /// jump around it or will be the bottom of the loop if we lay it out in place,
1802 /// but if it it doesn't end up at the bottom of the loop for any reason,
1803 /// rotation alone won't fix it. Because such a block will always result in an
1804 /// unconditional jump (for the backedge) rotating it in front of the loop
1805 /// header is always profitable.
1807 MachineBlockPlacement::findBestLoopTop(const MachineLoop &L,
1808  const BlockFilterSet &LoopBlockSet) {
1809  // Placing the latch block before the header may introduce an extra branch
1810  // that skips this block the first time the loop is executed, which we want
1811  // to avoid when optimising for size.
1812  // FIXME: in theory there is a case that does not introduce a new branch,
1813  // i.e. when the layout predecessor does not fallthrough to the loop header.
1814  // In practice this never happens though: there always seems to be a preheader
1815  // that can fallthrough and that is also placed before the header.
1816  if (F->getFunction().optForSize())
1817  return L.getHeader();
1818 
1819  // Check that the header hasn't been fused with a preheader block due to
1820  // crazy branches. If it has, we need to start with the header at the top to
1821  // prevent pulling the preheader into the loop body.
1822  BlockChain &HeaderChain = *BlockToChain[L.getHeader()];
1823  if (!LoopBlockSet.count(*HeaderChain.begin()))
1824  return L.getHeader();
1825 
1826  LLVM_DEBUG(dbgs() << "Finding best loop top for: "
1827  << getBlockName(L.getHeader()) << "\n");
1828 
1829  BlockFrequency BestPredFreq;
1830  MachineBasicBlock *BestPred = nullptr;
1831  for (MachineBasicBlock *Pred : L.getHeader()->predecessors()) {
1832  if (!LoopBlockSet.count(Pred))
1833  continue;
1834  LLVM_DEBUG(dbgs() << " header pred: " << getBlockName(Pred) << ", has "
1835  << Pred->succ_size() << " successors, ";
1836  MBFI->printBlockFreq(dbgs(), Pred) << " freq\n");
1837  if (Pred->succ_size() > 1)
1838  continue;
1839 
1840  if (!canMoveBottomBlockToTop(Pred, L.getHeader()))
1841  continue;
1842 
1843  BlockFrequency PredFreq = MBFI->getBlockFreq(Pred);
1844  if (!BestPred || PredFreq > BestPredFreq ||
1845  (!(PredFreq < BestPredFreq) &&
1846  Pred->isLayoutSuccessor(L.getHeader()))) {
1847  BestPred = Pred;
1848  BestPredFreq = PredFreq;
1849  }
1850  }
1851 
1852  // If no direct predecessor is fine, just use the loop header.
1853  if (!BestPred) {
1854  LLVM_DEBUG(dbgs() << " final top unchanged\n");
1855  return L.getHeader();
1856  }
1857 
1858  // Walk backwards through any straight line of predecessors.
1859  while (BestPred->pred_size() == 1 &&
1860  (*BestPred->pred_begin())->succ_size() == 1 &&
1861  *BestPred->pred_begin() != L.getHeader())
1862  BestPred = *BestPred->pred_begin();
1863 
1864  LLVM_DEBUG(dbgs() << " final top: " << getBlockName(BestPred) << "\n");
1865  return BestPred;
1866 }
1867 
1868 /// Find the best loop exiting block for layout.
1869 ///
1870 /// This routine implements the logic to analyze the loop looking for the best
1871 /// block to layout at the top of the loop. Typically this is done to maximize
1872 /// fallthrough opportunities.
1874 MachineBlockPlacement::findBestLoopExit(const MachineLoop &L,
1875  const BlockFilterSet &LoopBlockSet) {
1876  // We don't want to layout the loop linearly in all cases. If the loop header
1877  // is just a normal basic block in the loop, we want to look for what block
1878  // within the loop is the best one to layout at the top. However, if the loop
1879  // header has be pre-merged into a chain due to predecessors not having
1880  // analyzable branches, *and* the predecessor it is merged with is *not* part
1881  // of the loop, rotating the header into the middle of the loop will create
1882  // a non-contiguous range of blocks which is Very Bad. So start with the
1883  // header and only rotate if safe.
1884  BlockChain &HeaderChain = *BlockToChain[L.getHeader()];
1885  if (!LoopBlockSet.count(*HeaderChain.begin()))
1886  return nullptr;
1887 
1888  BlockFrequency BestExitEdgeFreq;
1889  unsigned BestExitLoopDepth = 0;
1890  MachineBasicBlock *ExitingBB = nullptr;
1891  // If there are exits to outer loops, loop rotation can severely limit
1892  // fallthrough opportunities unless it selects such an exit. Keep a set of
1893  // blocks where rotating to exit with that block will reach an outer loop.
1894  SmallPtrSet<MachineBasicBlock *, 4> BlocksExitingToOuterLoop;
1895 
1896  LLVM_DEBUG(dbgs() << "Finding best loop exit for: "
1897  << getBlockName(L.getHeader()) << "\n");
1898  for (MachineBasicBlock *MBB : L.getBlocks()) {
1899  BlockChain &Chain = *BlockToChain[MBB];
1900  // Ensure that this block is at the end of a chain; otherwise it could be
1901  // mid-way through an inner loop or a successor of an unanalyzable branch.
1902  if (MBB != *std::prev(Chain.end()))
1903  continue;
1904 
1905  // Now walk the successors. We need to establish whether this has a viable
1906  // exiting successor and whether it has a viable non-exiting successor.
1907  // We store the old exiting state and restore it if a viable looping
1908  // successor isn't found.
1909  MachineBasicBlock *OldExitingBB = ExitingBB;
1910  BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq;
1911  bool HasLoopingSucc = false;
1912  for (MachineBasicBlock *Succ : MBB->successors()) {
1913  if (Succ->isEHPad())
1914  continue;
1915  if (Succ == MBB)
1916  continue;
1917  BlockChain &SuccChain = *BlockToChain[Succ];
1918  // Don't split chains, either this chain or the successor's chain.
1919  if (&Chain == &SuccChain) {
1920  LLVM_DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
1921  << getBlockName(Succ) << " (chain conflict)\n");
1922  continue;
1923  }
1924 
1925  auto SuccProb = MBPI->getEdgeProbability(MBB, Succ);
1926  if (LoopBlockSet.count(Succ)) {
1927  LLVM_DEBUG(dbgs() << " looping: " << getBlockName(MBB) << " -> "
1928  << getBlockName(Succ) << " (" << SuccProb << ")\n");
1929  HasLoopingSucc = true;
1930  continue;
1931  }
1932 
1933  unsigned SuccLoopDepth = 0;
1934  if (MachineLoop *ExitLoop = MLI->getLoopFor(Succ)) {
1935  SuccLoopDepth = ExitLoop->getLoopDepth();
1936  if (ExitLoop->contains(&L))
1937  BlocksExitingToOuterLoop.insert(MBB);
1938  }
1939 
1940  BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(MBB) * SuccProb;
1941  LLVM_DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
1942  << getBlockName(Succ) << " [L:" << SuccLoopDepth
1943  << "] (";
1944  MBFI->printBlockFreq(dbgs(), ExitEdgeFreq) << ")\n");
1945  // Note that we bias this toward an existing layout successor to retain
1946  // incoming order in the absence of better information. The exit must have
1947  // a frequency higher than the current exit before we consider breaking
1948  // the layout.
1949  BranchProbability Bias(100 - ExitBlockBias, 100);
1950  if (!ExitingBB || SuccLoopDepth > BestExitLoopDepth ||
1951  ExitEdgeFreq > BestExitEdgeFreq ||
1952  (MBB->isLayoutSuccessor(Succ) &&
1953  !(ExitEdgeFreq < BestExitEdgeFreq * Bias))) {
1954  BestExitEdgeFreq = ExitEdgeFreq;
1955  ExitingBB = MBB;
1956  }
1957  }
1958 
1959  if (!HasLoopingSucc) {
1960  // Restore the old exiting state, no viable looping successor was found.
1961  ExitingBB = OldExitingBB;
1962  BestExitEdgeFreq = OldBestExitEdgeFreq;
1963  }
1964  }
1965  // Without a candidate exiting block or with only a single block in the
1966  // loop, just use the loop header to layout the loop.
1967  if (!ExitingBB) {
1968  LLVM_DEBUG(
1969  dbgs() << " No other candidate exit blocks, using loop header\n");
1970  return nullptr;
1971  }
1972  if (L.getNumBlocks() == 1) {
1973  LLVM_DEBUG(dbgs() << " Loop has 1 block, using loop header as exit\n");
1974  return nullptr;
1975  }
1976 
1977  // Also, if we have exit blocks which lead to outer loops but didn't select
1978  // one of them as the exiting block we are rotating toward, disable loop
1979  // rotation altogether.
1980  if (!BlocksExitingToOuterLoop.empty() &&
1981  !BlocksExitingToOuterLoop.count(ExitingBB))
1982  return nullptr;
1983 
1984  LLVM_DEBUG(dbgs() << " Best exiting block: " << getBlockName(ExitingBB)
1985  << "\n");
1986  return ExitingBB;
1987 }
1988 
1989 /// Check if there is a fallthrough to loop header Top.
1990 ///
1991 /// 1. Look for a Pred that can be layout before Top.
1992 /// 2. Check if Top is the most possible successor of Pred.
1993 bool
1994 MachineBlockPlacement::hasViableTopFallthrough(
1995  const MachineBasicBlock *Top,
1996  const BlockFilterSet &LoopBlockSet) {
1997  for (MachineBasicBlock *Pred : Top->predecessors()) {
1998  BlockChain *PredChain = BlockToChain[Pred];
1999  if (!LoopBlockSet.count(Pred) &&
2000  (!PredChain || Pred == *std::prev(PredChain->end()))) {
2001  // Found a Pred block can be placed before Top.
2002  // Check if Top is the best successor of Pred.
2003  auto TopProb = MBPI->getEdgeProbability(Pred, Top);
2004  bool TopOK = true;
2005  for (MachineBasicBlock *Succ : Pred->successors()) {
2006  auto SuccProb = MBPI->getEdgeProbability(Pred, Succ);
2007  BlockChain *SuccChain = BlockToChain[Succ];
2008  // Check if Succ can be placed after Pred.
2009  // Succ should not be in any chain, or it is the head of some chain.
2010  if ((!SuccChain || Succ == *SuccChain->begin()) && SuccProb > TopProb) {
2011  TopOK = false;
2012  break;
2013  }
2014  }
2015  if (TopOK)
2016  return true;
2017  }
2018  }
2019  return false;
2020 }
2021 
2022 /// Attempt to rotate an exiting block to the bottom of the loop.
2023 ///
2024 /// Once we have built a chain, try to rotate it to line up the hot exit block
2025 /// with fallthrough out of the loop if doing so doesn't introduce unnecessary
2026 /// branches. For example, if the loop has fallthrough into its header and out
2027 /// of its bottom already, don't rotate it.
2028 void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
2029  const MachineBasicBlock *ExitingBB,
2030  const BlockFilterSet &LoopBlockSet) {
2031  if (!ExitingBB)
2032  return;
2033 
2034  MachineBasicBlock *Top = *LoopChain.begin();
2035  MachineBasicBlock *Bottom = *std::prev(LoopChain.end());
2036 
2037  // If ExitingBB is already the last one in a chain then nothing to do.
2038  if (Bottom == ExitingBB)
2039  return;
2040 
2041  bool ViableTopFallthrough = hasViableTopFallthrough(Top, LoopBlockSet);
2042 
2043  // If the header has viable fallthrough, check whether the current loop
2044  // bottom is a viable exiting block. If so, bail out as rotating will
2045  // introduce an unnecessary branch.
2046  if (ViableTopFallthrough) {
2047  for (MachineBasicBlock *Succ : Bottom->successors()) {
2048  BlockChain *SuccChain = BlockToChain[Succ];
2049  if (!LoopBlockSet.count(Succ) &&
2050  (!SuccChain || Succ == *SuccChain->begin()))
2051  return;
2052  }
2053  }
2054 
2055  BlockChain::iterator ExitIt = llvm::find(LoopChain, ExitingBB);
2056  if (ExitIt == LoopChain.end())
2057  return;
2058 
2059  // Rotating a loop exit to the bottom when there is a fallthrough to top
2060  // trades the entry fallthrough for an exit fallthrough.
2061  // If there is no bottom->top edge, but the chosen exit block does have
2062  // a fallthrough, we break that fallthrough for nothing in return.
2063 
2064  // Let's consider an example. We have a built chain of basic blocks
2065  // B1, B2, ..., Bn, where Bk is a ExitingBB - chosen exit block.
2066  // By doing a rotation we get
2067  // Bk+1, ..., Bn, B1, ..., Bk
2068  // Break of fallthrough to B1 is compensated by a fallthrough from Bk.
2069  // If we had a fallthrough Bk -> Bk+1 it is broken now.
2070  // It might be compensated by fallthrough Bn -> B1.
2071  // So we have a condition to avoid creation of extra branch by loop rotation.
2072  // All below must be true to avoid loop rotation:
2073  // If there is a fallthrough to top (B1)
2074  // There was fallthrough from chosen exit block (Bk) to next one (Bk+1)
2075  // There is no fallthrough from bottom (Bn) to top (B1).
2076  // Please note that there is no exit fallthrough from Bn because we checked it
2077  // above.
2078  if (ViableTopFallthrough) {
2079  assert(std::next(ExitIt) != LoopChain.end() &&
2080  "Exit should not be last BB");
2081  MachineBasicBlock *NextBlockInChain = *std::next(ExitIt);
2082  if (ExitingBB->isSuccessor(NextBlockInChain))
2083  if (!Bottom->isSuccessor(Top))
2084  return;
2085  }
2086 
2087  LLVM_DEBUG(dbgs() << "Rotating loop to put exit " << getBlockName(ExitingBB)
2088  << " at bottom\n");
2089  std::rotate(LoopChain.begin(), std::next(ExitIt), LoopChain.end());
2090 }
2091 
2092 /// Attempt to rotate a loop based on profile data to reduce branch cost.
2093 ///
2094 /// With profile data, we can determine the cost in terms of missed fall through
2095 /// opportunities when rotating a loop chain and select the best rotation.
2096 /// Basically, there are three kinds of cost to consider for each rotation:
2097 /// 1. The possibly missed fall through edge (if it exists) from BB out of
2098 /// the loop to the loop header.
2099 /// 2. The possibly missed fall through edges (if they exist) from the loop
2100 /// exits to BB out of the loop.
2101 /// 3. The missed fall through edge (if it exists) from the last BB to the
2102 /// first BB in the loop chain.
2103 /// Therefore, the cost for a given rotation is the sum of costs listed above.
2104 /// We select the best rotation with the smallest cost.
2105 void MachineBlockPlacement::rotateLoopWithProfile(
2106  BlockChain &LoopChain, const MachineLoop &L,
2107  const BlockFilterSet &LoopBlockSet) {
2108  auto HeaderBB = L.getHeader();
2109  auto HeaderIter = llvm::find(LoopChain, HeaderBB);
2110  auto RotationPos = LoopChain.end();
2111 
2112  BlockFrequency SmallestRotationCost = BlockFrequency::getMaxFrequency();
2113 
2114  // A utility lambda that scales up a block frequency by dividing it by a
2115  // branch probability which is the reciprocal of the scale.
2116  auto ScaleBlockFrequency = [](BlockFrequency Freq,
2117  unsigned Scale) -> BlockFrequency {
2118  if (Scale == 0)
2119  return 0;
2120  // Use operator / between BlockFrequency and BranchProbability to implement
2121  // saturating multiplication.
2122  return Freq / BranchProbability(1, Scale);
2123  };
2124 
2125  // Compute the cost of the missed fall-through edge to the loop header if the
2126  // chain head is not the loop header. As we only consider natural loops with
2127  // single header, this computation can be done only once.
2128  BlockFrequency HeaderFallThroughCost(0);
2129  for (auto *Pred : HeaderBB->predecessors()) {
2130  BlockChain *PredChain = BlockToChain[Pred];
2131  if (!LoopBlockSet.count(Pred) &&
2132  (!PredChain || Pred == *std::prev(PredChain->end()))) {
2133  auto EdgeFreq =
2134  MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, HeaderBB);
2135  auto FallThruCost = ScaleBlockFrequency(EdgeFreq, MisfetchCost);
2136  // If the predecessor has only an unconditional jump to the header, we
2137  // need to consider the cost of this jump.
2138  if (Pred->succ_size() == 1)
2139  FallThruCost += ScaleBlockFrequency(EdgeFreq, JumpInstCost);
2140  HeaderFallThroughCost = std::max(HeaderFallThroughCost, FallThruCost);
2141  }
2142  }
2143 
2144  // Here we collect all exit blocks in the loop, and for each exit we find out
2145  // its hottest exit edge. For each loop rotation, we define the loop exit cost
2146  // as the sum of frequencies of exit edges we collect here, excluding the exit
2147  // edge from the tail of the loop chain.
2149  for (auto BB : LoopChain) {
2150  auto LargestExitEdgeProb = BranchProbability::getZero();
2151  for (auto *Succ : BB->successors()) {
2152  BlockChain *SuccChain = BlockToChain[Succ];
2153  if (!LoopBlockSet.count(Succ) &&
2154  (!SuccChain || Succ == *SuccChain->begin())) {
2155  auto SuccProb = MBPI->getEdgeProbability(BB, Succ);
2156  LargestExitEdgeProb = std::max(LargestExitEdgeProb, SuccProb);
2157  }
2158  }
2159  if (LargestExitEdgeProb > BranchProbability::getZero()) {
2160  auto ExitFreq = MBFI->getBlockFreq(BB) * LargestExitEdgeProb;
2161  ExitsWithFreq.emplace_back(BB, ExitFreq);
2162  }
2163  }
2164 
2165  // In this loop we iterate every block in the loop chain and calculate the
2166  // cost assuming the block is the head of the loop chain. When the loop ends,
2167  // we should have found the best candidate as the loop chain's head.
2168  for (auto Iter = LoopChain.begin(), TailIter = std::prev(LoopChain.end()),
2169  EndIter = LoopChain.end();
2170  Iter != EndIter; Iter++, TailIter++) {
2171  // TailIter is used to track the tail of the loop chain if the block we are
2172  // checking (pointed by Iter) is the head of the chain.
2173  if (TailIter == LoopChain.end())
2174  TailIter = LoopChain.begin();
2175 
2176  auto TailBB = *TailIter;
2177 
2178  // Calculate the cost by putting this BB to the top.
2179  BlockFrequency Cost = 0;
2180 
2181  // If the current BB is the loop header, we need to take into account the
2182  // cost of the missed fall through edge from outside of the loop to the
2183  // header.
2184  if (Iter != HeaderIter)
2185  Cost += HeaderFallThroughCost;
2186 
2187  // Collect the loop exit cost by summing up frequencies of all exit edges
2188  // except the one from the chain tail.
2189  for (auto &ExitWithFreq : ExitsWithFreq)
2190  if (TailBB != ExitWithFreq.first)
2191  Cost += ExitWithFreq.second;
2192 
2193  // The cost of breaking the once fall-through edge from the tail to the top
2194  // of the loop chain. Here we need to consider three cases:
2195  // 1. If the tail node has only one successor, then we will get an
2196  // additional jmp instruction. So the cost here is (MisfetchCost +
2197  // JumpInstCost) * tail node frequency.
2198  // 2. If the tail node has two successors, then we may still get an
2199  // additional jmp instruction if the layout successor after the loop
2200  // chain is not its CFG successor. Note that the more frequently executed
2201  // jmp instruction will be put ahead of the other one. Assume the
2202  // frequency of those two branches are x and y, where x is the frequency
2203  // of the edge to the chain head, then the cost will be
2204  // (x * MisfetechCost + min(x, y) * JumpInstCost) * tail node frequency.
2205  // 3. If the tail node has more than two successors (this rarely happens),
2206  // we won't consider any additional cost.
2207  if (TailBB->isSuccessor(*Iter)) {
2208  auto TailBBFreq = MBFI->getBlockFreq(TailBB);
2209  if (TailBB->succ_size() == 1)
2210  Cost += ScaleBlockFrequency(TailBBFreq.getFrequency(),
2212  else if (TailBB->succ_size() == 2) {
2213  auto TailToHeadProb = MBPI->getEdgeProbability(TailBB, *Iter);
2214  auto TailToHeadFreq = TailBBFreq * TailToHeadProb;
2215  auto ColderEdgeFreq = TailToHeadProb > BranchProbability(1, 2)
2216  ? TailBBFreq * TailToHeadProb.getCompl()
2217  : TailToHeadFreq;
2218  Cost += ScaleBlockFrequency(TailToHeadFreq, MisfetchCost) +
2219  ScaleBlockFrequency(ColderEdgeFreq, JumpInstCost);
2220  }
2221  }
2222 
2223  LLVM_DEBUG(dbgs() << "The cost of loop rotation by making "
2224  << getBlockName(*Iter)
2225  << " to the top: " << Cost.getFrequency() << "\n");
2226 
2227  if (Cost < SmallestRotationCost) {
2228  SmallestRotationCost = Cost;
2229  RotationPos = Iter;
2230  }
2231  }
2232 
2233  if (RotationPos != LoopChain.end()) {
2234  LLVM_DEBUG(dbgs() << "Rotate loop by making " << getBlockName(*RotationPos)
2235  << " to the top\n");
2236  std::rotate(LoopChain.begin(), RotationPos, LoopChain.end());
2237  }
2238 }
2239 
2240 /// Collect blocks in the given loop that are to be placed.
2241 ///
2242 /// When profile data is available, exclude cold blocks from the returned set;
2243 /// otherwise, collect all blocks in the loop.
2245 MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) {
2246  BlockFilterSet LoopBlockSet;
2247 
2248  // Filter cold blocks off from LoopBlockSet when profile data is available.
2249  // Collect the sum of frequencies of incoming edges to the loop header from
2250  // outside. If we treat the loop as a super block, this is the frequency of
2251  // the loop. Then for each block in the loop, we calculate the ratio between
2252  // its frequency and the frequency of the loop block. When it is too small,
2253  // don't add it to the loop chain. If there are outer loops, then this block
2254  // will be merged into the first outer loop chain for which this block is not
2255  // cold anymore. This needs precise profile data and we only do this when
2256  // profile data is available.
2258  BlockFrequency LoopFreq(0);
2259  for (auto LoopPred : L.getHeader()->predecessors())
2260  if (!L.contains(LoopPred))
2261  LoopFreq += MBFI->getBlockFreq(LoopPred) *
2262  MBPI->getEdgeProbability(LoopPred, L.getHeader());
2263 
2264  for (MachineBasicBlock *LoopBB : L.getBlocks()) {
2265  auto Freq = MBFI->getBlockFreq(LoopBB).getFrequency();
2266  if (Freq == 0 || LoopFreq.getFrequency() / Freq > LoopToColdBlockRatio)
2267  continue;
2268  LoopBlockSet.insert(LoopBB);
2269  }
2270  } else
2271  LoopBlockSet.insert(L.block_begin(), L.block_end());
2272 
2273  return LoopBlockSet;
2274 }
2275 
2276 /// Forms basic block chains from the natural loop structures.
2277 ///
2278 /// These chains are designed to preserve the existing *structure* of the code
2279 /// as much as possible. We can then stitch the chains together in a way which
2280 /// both preserves the topological structure and minimizes taken conditional
2281 /// branches.
2282 void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {
2283  // First recurse through any nested loops, building chains for those inner
2284  // loops.
2285  for (const MachineLoop *InnerLoop : L)
2286  buildLoopChains(*InnerLoop);
2287 
2288  assert(BlockWorkList.empty() &&
2289  "BlockWorkList not empty when starting to build loop chains.");
2290  assert(EHPadWorkList.empty() &&
2291  "EHPadWorkList not empty when starting to build loop chains.");
2292  BlockFilterSet LoopBlockSet = collectLoopBlockSet(L);
2293 
2294  // Check if we have profile data for this function. If yes, we will rotate
2295  // this loop by modeling costs more precisely which requires the profile data
2296  // for better layout.
2297  bool RotateLoopWithProfile =
2300 
2301  // First check to see if there is an obviously preferable top block for the
2302  // loop. This will default to the header, but may end up as one of the
2303  // predecessors to the header if there is one which will result in strictly
2304  // fewer branches in the loop body.
2305  // When we use profile data to rotate the loop, this is unnecessary.
2306  MachineBasicBlock *LoopTop =
2307  RotateLoopWithProfile ? L.getHeader() : findBestLoopTop(L, LoopBlockSet);
2308 
2309  // If we selected just the header for the loop top, look for a potentially
2310  // profitable exit block in the event that rotating the loop can eliminate
2311  // branches by placing an exit edge at the bottom.
2312  //
2313  // Loops are processed innermost to uttermost, make sure we clear
2314  // PreferredLoopExit before processing a new loop.
2315  PreferredLoopExit = nullptr;
2316  if (!RotateLoopWithProfile && LoopTop == L.getHeader())
2317  PreferredLoopExit = findBestLoopExit(L, LoopBlockSet);
2318 
2319  BlockChain &LoopChain = *BlockToChain[LoopTop];
2320 
2321  // FIXME: This is a really lame way of walking the chains in the loop: we
2322  // walk the blocks, and use a set to prevent visiting a particular chain
2323  // twice.
2324  SmallPtrSet<BlockChain *, 4> UpdatedPreds;
2325  assert(LoopChain.UnscheduledPredecessors == 0 &&
2326  "LoopChain should not have unscheduled predecessors.");
2327  UpdatedPreds.insert(&LoopChain);
2328 
2329  for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2330  fillWorkLists(LoopBB, UpdatedPreds, &LoopBlockSet);
2331 
2332  buildChain(LoopTop, LoopChain, &LoopBlockSet);
2333 
2334  if (RotateLoopWithProfile)
2335  rotateLoopWithProfile(LoopChain, L, LoopBlockSet);
2336  else
2337  rotateLoop(LoopChain, PreferredLoopExit, LoopBlockSet);
2338 
2339  LLVM_DEBUG({
2340  // Crash at the end so we get all of the debugging output first.
2341  bool BadLoop = false;
2342  if (LoopChain.UnscheduledPredecessors) {
2343  BadLoop = true;
2344  dbgs() << "Loop chain contains a block without its preds placed!\n"
2345  << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2346  << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n";
2347  }
2348  for (MachineBasicBlock *ChainBB : LoopChain) {
2349  dbgs() << " ... " << getBlockName(ChainBB) << "\n";
2350  if (!LoopBlockSet.remove(ChainBB)) {
2351  // We don't mark the loop as bad here because there are real situations
2352  // where this can occur. For example, with an unanalyzable fallthrough
2353  // from a loop block to a non-loop block or vice versa.
2354  dbgs() << "Loop chain contains a block not contained by the loop!\n"
2355  << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2356  << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2357  << " Bad block: " << getBlockName(ChainBB) << "\n";
2358  }
2359  }
2360 
2361  if (!LoopBlockSet.empty()) {
2362  BadLoop = true;
2363  for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2364  dbgs() << "Loop contains blocks never placed into a chain!\n"
2365  << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2366  << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2367  << " Bad block: " << getBlockName(LoopBB) << "\n";
2368  }
2369  assert(!BadLoop && "Detected problems with the placement of this loop.");
2370  });
2371 
2372  BlockWorkList.clear();
2373  EHPadWorkList.clear();
2374 }
2375 
2376 void MachineBlockPlacement::buildCFGChains() {
2377  // Ensure that every BB in the function has an associated chain to simplify
2378  // the assumptions of the remaining algorithm.
2379  SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch.
2380  for (MachineFunction::iterator FI = F->begin(), FE = F->end(); FI != FE;
2381  ++FI) {
2382  MachineBasicBlock *BB = &*FI;
2383  BlockChain *Chain =
2384  new (ChainAllocator.Allocate()) BlockChain(BlockToChain, BB);
2385  // Also, merge any blocks which we cannot reason about and must preserve
2386  // the exact fallthrough behavior for.
2387  while (true) {
2388  Cond.clear();
2389  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
2390  if (!TII->analyzeBranch(*BB, TBB, FBB, Cond) || !FI->canFallThrough())
2391  break;
2392 
2393  MachineFunction::iterator NextFI = std::next(FI);
2394  MachineBasicBlock *NextBB = &*NextFI;
2395  // Ensure that the layout successor is a viable block, as we know that
2396  // fallthrough is a possibility.
2397  assert(NextFI != FE && "Can't fallthrough past the last block.");
2398  LLVM_DEBUG(dbgs() << "Pre-merging due to unanalyzable fallthrough: "
2399  << getBlockName(BB) << " -> " << getBlockName(NextBB)
2400  << "\n");
2401  Chain->merge(NextBB, nullptr);
2402 #ifndef NDEBUG
2403  BlocksWithUnanalyzableExits.insert(&*BB);
2404 #endif
2405  FI = NextFI;
2406  BB = NextBB;
2407  }
2408  }
2409 
2410  // Build any loop-based chains.
2411  PreferredLoopExit = nullptr;
2412  for (MachineLoop *L : *MLI)
2413  buildLoopChains(*L);
2414 
2415  assert(BlockWorkList.empty() &&
2416  "BlockWorkList should be empty before building final chain.");
2417  assert(EHPadWorkList.empty() &&
2418  "EHPadWorkList should be empty before building final chain.");
2419 
2420  SmallPtrSet<BlockChain *, 4> UpdatedPreds;
2421  for (MachineBasicBlock &MBB : *F)
2422  fillWorkLists(&MBB, UpdatedPreds);
2423 
2424  BlockChain &FunctionChain = *BlockToChain[&F->front()];
2425  buildChain(&F->front(), FunctionChain);
2426 
2427 #ifndef NDEBUG
2428  using FunctionBlockSetType = SmallPtrSet<MachineBasicBlock *, 16>;
2429 #endif
2430  LLVM_DEBUG({
2431  // Crash at the end so we get all of the debugging output first.
2432  bool BadFunc = false;
2433  FunctionBlockSetType FunctionBlockSet;
2434  for (MachineBasicBlock &MBB : *F)
2435  FunctionBlockSet.insert(&MBB);
2436 
2437  for (MachineBasicBlock *ChainBB : FunctionChain)
2438  if (!FunctionBlockSet.erase(ChainBB)) {
2439  BadFunc = true;
2440  dbgs() << "Function chain contains a block not in the function!\n"
2441  << " Bad block: " << getBlockName(ChainBB) << "\n";
2442  }
2443 
2444  if (!FunctionBlockSet.empty()) {
2445  BadFunc = true;
2446  for (MachineBasicBlock *RemainingBB : FunctionBlockSet)
2447  dbgs() << "Function contains blocks never placed into a chain!\n"
2448  << " Bad block: " << getBlockName(RemainingBB) << "\n";
2449  }
2450  assert(!BadFunc && "Detected problems with the block placement.");
2451  });
2452 
2453  // Splice the blocks into place.
2454  MachineFunction::iterator InsertPos = F->begin();
2455  LLVM_DEBUG(dbgs() << "[MBP] Function: " << F->getName() << "\n");
2456  for (MachineBasicBlock *ChainBB : FunctionChain) {
2457  LLVM_DEBUG(dbgs() << (ChainBB == *FunctionChain.begin() ? "Placing chain "
2458  : " ... ")
2459  << getBlockName(ChainBB) << "\n");
2460  if (InsertPos != MachineFunction::iterator(ChainBB))
2461  F->splice(InsertPos, ChainBB);
2462  else
2463  ++InsertPos;
2464 
2465  // Update the terminator of the previous block.
2466  if (ChainBB == *FunctionChain.begin())
2467  continue;
2468  MachineBasicBlock *PrevBB = &*std::prev(MachineFunction::iterator(ChainBB));
2469 
2470  // FIXME: It would be awesome of updateTerminator would just return rather
2471  // than assert when the branch cannot be analyzed in order to remove this
2472  // boiler plate.
2473  Cond.clear();
2474  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
2475 
2476 #ifndef NDEBUG
2477  if (!BlocksWithUnanalyzableExits.count(PrevBB)) {
2478  // Given the exact block placement we chose, we may actually not _need_ to
2479  // be able to edit PrevBB's terminator sequence, but not being _able_ to
2480  // do that at this point is a bug.
2481  assert((!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond) ||
2482  !PrevBB->canFallThrough()) &&
2483  "Unexpected block with un-analyzable fallthrough!");
2484  Cond.clear();
2485  TBB = FBB = nullptr;
2486  }
2487 #endif
2488 
2489  // The "PrevBB" is not yet updated to reflect current code layout, so,
2490  // o. it may fall-through to a block without explicit "goto" instruction
2491  // before layout, and no longer fall-through it after layout; or
2492  // o. just opposite.
2493  //
2494  // analyzeBranch() may return erroneous value for FBB when these two
2495  // situations take place. For the first scenario FBB is mistakenly set NULL;
2496  // for the 2nd scenario, the FBB, which is expected to be NULL, is
2497  // mistakenly pointing to "*BI".
2498  // Thus, if the future change needs to use FBB before the layout is set, it
2499  // has to correct FBB first by using the code similar to the following:
2500  //
2501  // if (!Cond.empty() && (!FBB || FBB == ChainBB)) {
2502  // PrevBB->updateTerminator();
2503  // Cond.clear();
2504  // TBB = FBB = nullptr;
2505  // if (TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) {
2506  // // FIXME: This should never take place.
2507  // TBB = FBB = nullptr;
2508  // }
2509  // }
2510  if (!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond))
2511  PrevBB->updateTerminator();
2512  }
2513 
2514  // Fixup the last block.
2515  Cond.clear();
2516  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
2517  if (!TII->analyzeBranch(F->back(), TBB, FBB, Cond))
2518  F->back().updateTerminator();
2519 
2520  BlockWorkList.clear();
2521  EHPadWorkList.clear();
2522 }
2523 
2524 void MachineBlockPlacement::optimizeBranches() {
2525  BlockChain &FunctionChain = *BlockToChain[&F->front()];
2526  SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch.
2527 
2528  // Now that all the basic blocks in the chain have the proper layout,
2529  // make a final call to AnalyzeBranch with AllowModify set.
2530  // Indeed, the target may be able to optimize the branches in a way we
2531  // cannot because all branches may not be analyzable.
2532  // E.g., the target may be able to remove an unconditional branch to
2533  // a fallthrough when it occurs after predicated terminators.
2534  for (MachineBasicBlock *ChainBB : FunctionChain) {
2535  Cond.clear();
2536  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
2537  if (!TII->analyzeBranch(*ChainBB, TBB, FBB, Cond, /*AllowModify*/ true)) {
2538  // If PrevBB has a two-way branch, try to re-order the branches
2539  // such that we branch to the successor with higher probability first.
2540  if (TBB && !Cond.empty() && FBB &&
2541  MBPI->getEdgeProbability(ChainBB, FBB) >
2542  MBPI->getEdgeProbability(ChainBB, TBB) &&
2543  !TII->reverseBranchCondition(Cond)) {
2544  LLVM_DEBUG(dbgs() << "Reverse order of the two branches: "
2545  << getBlockName(ChainBB) << "\n");
2546  LLVM_DEBUG(dbgs() << " Edge probability: "
2547  << MBPI->getEdgeProbability(ChainBB, FBB) << " vs "
2548  << MBPI->getEdgeProbability(ChainBB, TBB) << "\n");
2549  DebugLoc dl; // FIXME: this is nowhere
2550  TII->removeBranch(*ChainBB);
2551  TII->insertBranch(*ChainBB, FBB, TBB, Cond, dl);
2552  ChainBB->updateTerminator();
2553  }
2554  }
2555  }
2556 }
2557 
2558 void MachineBlockPlacement::alignBlocks() {
2559  // Walk through the backedges of the function now that we have fully laid out
2560  // the basic blocks and align the destination of each backedge. We don't rely
2561  // exclusively on the loop info here so that we can align backedges in
2562  // unnatural CFGs and backedges that were introduced purely because of the
2563  // loop rotations done during this layout pass.
2564  if (F->getFunction().optForMinSize() ||
2565  (F->getFunction().optForSize() && !TLI->alignLoopsWithOptSize()))
2566  return;
2567  BlockChain &FunctionChain = *BlockToChain[&F->front()];
2568  if (FunctionChain.begin() == FunctionChain.end())
2569  return; // Empty chain.
2570 
2571  const BranchProbability ColdProb(1, 5); // 20%
2572  BlockFrequency EntryFreq = MBFI->getBlockFreq(&F->front());
2573  BlockFrequency WeightedEntryFreq = EntryFreq * ColdProb;
2574  for (MachineBasicBlock *ChainBB : FunctionChain) {
2575  if (ChainBB == *FunctionChain.begin())
2576  continue;
2577 
2578  // Don't align non-looping basic blocks. These are unlikely to execute
2579  // enough times to matter in practice. Note that we'll still handle
2580  // unnatural CFGs inside of a natural outer loop (the common case) and
2581  // rotated loops.
2582  MachineLoop *L = MLI->getLoopFor(ChainBB);
2583  if (!L)
2584  continue;
2585 
2586  unsigned Align = TLI->getPrefLoopAlignment(L);
2587  if (!Align)
2588  continue; // Don't care about loop alignment.
2589 
2590  // If the block is cold relative to the function entry don't waste space
2591  // aligning it.
2592  BlockFrequency Freq = MBFI->getBlockFreq(ChainBB);
2593  if (Freq < WeightedEntryFreq)
2594  continue;
2595 
2596  // If the block is cold relative to its loop header, don't align it
2597  // regardless of what edges into the block exist.
2598  MachineBasicBlock *LoopHeader = L->getHeader();
2599  BlockFrequency LoopHeaderFreq = MBFI->getBlockFreq(LoopHeader);
2600  if (Freq < (LoopHeaderFreq * ColdProb))
2601  continue;
2602 
2603  // Check for the existence of a non-layout predecessor which would benefit
2604  // from aligning this block.
2605  MachineBasicBlock *LayoutPred =
2606  &*std::prev(MachineFunction::iterator(ChainBB));
2607 
2608  // Force alignment if all the predecessors are jumps. We already checked
2609  // that the block isn't cold above.
2610  if (!LayoutPred->isSuccessor(ChainBB)) {
2611  ChainBB->setAlignment(Align);
2612  continue;
2613  }
2614 
2615  // Align this block if the layout predecessor's edge into this block is
2616  // cold relative to the block. When this is true, other predecessors make up
2617  // all of the hot entries into the block and thus alignment is likely to be
2618  // important.
2619  BranchProbability LayoutProb =
2620  MBPI->getEdgeProbability(LayoutPred, ChainBB);
2621  BlockFrequency LayoutEdgeFreq = MBFI->getBlockFreq(LayoutPred) * LayoutProb;
2622  if (LayoutEdgeFreq <= (Freq * ColdProb))
2623  ChainBB->setAlignment(Align);
2624  }
2625 }
2626 
2627 /// Tail duplicate \p BB into (some) predecessors if profitable, repeating if
2628 /// it was duplicated into its chain predecessor and removed.
2629 /// \p BB - Basic block that may be duplicated.
2630 ///
2631 /// \p LPred - Chosen layout predecessor of \p BB.
2632 /// Updated to be the chain end if LPred is removed.
2633 /// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
2634 /// \p BlockFilter - Set of blocks that belong to the loop being laid out.
2635 /// Used to identify which blocks to update predecessor
2636 /// counts.
2637 /// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
2638 /// chosen in the given order due to unnatural CFG
2639 /// only needed if \p BB is removed and
2640 /// \p PrevUnplacedBlockIt pointed to \p BB.
2641 /// @return true if \p BB was removed.
2642 bool MachineBlockPlacement::repeatedlyTailDuplicateBlock(
2643  MachineBasicBlock *BB, MachineBasicBlock *&LPred,
2644  const MachineBasicBlock *LoopHeaderBB,
2645  BlockChain &Chain, BlockFilterSet *BlockFilter,
2646  MachineFunction::iterator &PrevUnplacedBlockIt) {
2647  bool Removed, DuplicatedToLPred;
2648  bool DuplicatedToOriginalLPred;
2649  Removed = maybeTailDuplicateBlock(BB, LPred, Chain, BlockFilter,
2650  PrevUnplacedBlockIt,
2651  DuplicatedToLPred);
2652  if (!Removed)
2653  return false;
2654  DuplicatedToOriginalLPred = DuplicatedToLPred;
2655  // Iteratively try to duplicate again. It can happen that a block that is
2656  // duplicated into is still small enough to be duplicated again.
2657  // No need to call markBlockSuccessors in this case, as the blocks being
2658  // duplicated from here on are already scheduled.
2659  // Note that DuplicatedToLPred always implies Removed.
2660  while (DuplicatedToLPred) {
2661  assert(Removed && "Block must have been removed to be duplicated into its "
2662  "layout predecessor.");
2663  MachineBasicBlock *DupBB, *DupPred;
2664  // The removal callback causes Chain.end() to be updated when a block is
2665  // removed. On the first pass through the loop, the chain end should be the
2666  // same as it was on function entry. On subsequent passes, because we are
2667  // duplicating the block at the end of the chain, if it is removed the
2668  // chain will have shrunk by one block.
2669  BlockChain::iterator ChainEnd = Chain.end();
2670  DupBB = *(--ChainEnd);
2671  // Now try to duplicate again.
2672  if (ChainEnd == Chain.begin())
2673  break;
2674  DupPred = *std::prev(ChainEnd);
2675  Removed = maybeTailDuplicateBlock(DupBB, DupPred, Chain, BlockFilter,
2676  PrevUnplacedBlockIt,
2677  DuplicatedToLPred);
2678  }
2679  // If BB was duplicated into LPred, it is now scheduled. But because it was
2680  // removed, markChainSuccessors won't be called for its chain. Instead we
2681  // call markBlockSuccessors for LPred to achieve the same effect. This must go
2682  // at the end because repeating the tail duplication can increase the number
2683  // of unscheduled predecessors.
2684  LPred = *std::prev(Chain.end());
2685  if (DuplicatedToOriginalLPred)
2686  markBlockSuccessors(Chain, LPred, LoopHeaderBB, BlockFilter);
2687  return true;
2688 }
2689 
2690 /// Tail duplicate \p BB into (some) predecessors if profitable.
2691 /// \p BB - Basic block that may be duplicated
2692 /// \p LPred - Chosen layout predecessor of \p BB
2693 /// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
2694 /// \p BlockFilter - Set of blocks that belong to the loop being laid out.
2695 /// Used to identify which blocks to update predecessor
2696 /// counts.
2697 /// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
2698 /// chosen in the given order due to unnatural CFG
2699 /// only needed if \p BB is removed and
2700 /// \p PrevUnplacedBlockIt pointed to \p BB.
2701 /// \p DuplicatedToLPred - True if the block was duplicated into LPred. Will
2702 /// only be true if the block was removed.
2703 /// \return - True if the block was duplicated into all preds and removed.
2704 bool MachineBlockPlacement::maybeTailDuplicateBlock(
2706  BlockChain &Chain, BlockFilterSet *BlockFilter,
2707  MachineFunction::iterator &PrevUnplacedBlockIt,
2708  bool &DuplicatedToLPred) {
2709  DuplicatedToLPred = false;
2710  if (!shouldTailDuplicate(BB))
2711  return false;
2712 
2713  LLVM_DEBUG(dbgs() << "Redoing tail duplication for Succ#" << BB->getNumber()
2714  << "\n");
2715 
2716  // This has to be a callback because none of it can be done after
2717  // BB is deleted.
2718  bool Removed = false;
2719  auto RemovalCallback =
2720  [&](MachineBasicBlock *RemBB) {
2721  // Signal to outer function
2722  Removed = true;
2723 
2724  // Conservative default.
2725  bool InWorkList = true;
2726  // Remove from the Chain and Chain Map
2727  if (BlockToChain.count(RemBB)) {
2728  BlockChain *Chain = BlockToChain[RemBB];
2729  InWorkList = Chain->UnscheduledPredecessors == 0;
2730  Chain->remove(RemBB);
2731  BlockToChain.erase(RemBB);
2732  }
2733 
2734  // Handle the unplaced block iterator
2735  if (&(*PrevUnplacedBlockIt) == RemBB) {
2736  PrevUnplacedBlockIt++;
2737  }
2738 
2739  // Handle the Work Lists
2740  if (InWorkList) {
2741  SmallVectorImpl<MachineBasicBlock *> &RemoveList = BlockWorkList;
2742  if (RemBB->isEHPad())
2743  RemoveList = EHPadWorkList;
2744  RemoveList.erase(
2745  llvm::remove_if(RemoveList,
2746  [RemBB](MachineBasicBlock *BB) {
2747  return BB == RemBB;
2748  }),
2749  RemoveList.end());
2750  }
2751 
2752  // Handle the filter set
2753  if (BlockFilter) {
2754  BlockFilter->remove(RemBB);
2755  }
2756 
2757  // Remove the block from loop info.
2758  MLI->removeBlock(RemBB);
2759  if (RemBB == PreferredLoopExit)
2760  PreferredLoopExit = nullptr;
2761 
2762  LLVM_DEBUG(dbgs() << "TailDuplicator deleted block: "
2763  << getBlockName(RemBB) << "\n");
2764  };
2765  auto RemovalCallbackRef =
2766  function_ref<void(MachineBasicBlock*)>(RemovalCallback);
2767 
2768  SmallVector<MachineBasicBlock *, 8> DuplicatedPreds;
2769  bool IsSimple = TailDup.isSimpleBB(BB);
2770  TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred,
2771  &DuplicatedPreds, &RemovalCallbackRef);
2772 
2773  // Update UnscheduledPredecessors to reflect tail-duplication.
2774  DuplicatedToLPred = false;
2775  for (MachineBasicBlock *Pred : DuplicatedPreds) {
2776  // We're only looking for unscheduled predecessors that match the filter.
2777  BlockChain* PredChain = BlockToChain[Pred];
2778  if (Pred == LPred)
2779  DuplicatedToLPred = true;
2780  if (Pred == LPred || (BlockFilter && !BlockFilter->count(Pred))
2781  || PredChain == &Chain)
2782  continue;
2783  for (MachineBasicBlock *NewSucc : Pred->successors()) {
2784  if (BlockFilter && !BlockFilter->count(NewSucc))
2785  continue;
2786  BlockChain *NewChain = BlockToChain[NewSucc];
2787  if (NewChain != &Chain && NewChain != PredChain)
2788  NewChain->UnscheduledPredecessors++;
2789  }
2790  }
2791  return Removed;
2792 }
2793 
2794 bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
2795  if (skipFunction(MF.getFunction()))
2796  return false;
2797 
2798  // Check for single-block functions and skip them.
2799  if (std::next(MF.begin()) == MF.end())
2800  return false;
2801 
2802  F = &MF;
2803  MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
2804  MBFI = llvm::make_unique<BranchFolder::MBFIWrapper>(
2805  getAnalysis<MachineBlockFrequencyInfo>());
2806  MLI = &getAnalysis<MachineLoopInfo>();
2807  TII = MF.getSubtarget().getInstrInfo();
2808  TLI = MF.getSubtarget().getTargetLowering();
2809  MPDT = nullptr;
2810 
2811  // Initialize PreferredLoopExit to nullptr here since it may never be set if
2812  // there are no MachineLoops.
2813  PreferredLoopExit = nullptr;
2814 
2815  assert(BlockToChain.empty() &&
2816  "BlockToChain map should be empty before starting placement.");
2817  assert(ComputedEdges.empty() &&
2818  "Computed Edge map should be empty before starting placement.");
2819 
2820  unsigned TailDupSize = TailDupPlacementThreshold;
2821  // If only the aggressive threshold is explicitly set, use it.
2822  if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 &&
2823  TailDupPlacementThreshold.getNumOccurrences() == 0)
2825 
2826  TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
2827  // For aggressive optimization, we can adjust some thresholds to be less
2828  // conservative.
2829  if (PassConfig->getOptLevel() >= CodeGenOpt::Aggressive) {
2830  // At O3 we should be more willing to copy blocks for tail duplication. This
2831  // increases size pressure, so we only do it at O3
2832  // Do this unless only the regular threshold is explicitly set.
2833  if (TailDupPlacementThreshold.getNumOccurrences() == 0 ||
2834  TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0)
2836  }
2837 
2838  if (allowTailDupPlacement()) {
2839  MPDT = &getAnalysis<MachinePostDominatorTree>();
2840  if (MF.getFunction().optForSize())
2841  TailDupSize = 1;
2842  bool PreRegAlloc = false;
2843  TailDup.initMF(MF, PreRegAlloc, MBPI, /* LayoutMode */ true, TailDupSize);
2844  precomputeTriangleChains();
2845  }
2846 
2847  buildCFGChains();
2848 
2849  // Changing the layout can create new tail merging opportunities.
2850  // TailMerge can create jump into if branches that make CFG irreducible for
2851  // HW that requires structured CFG.
2852  bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
2853  PassConfig->getEnableTailMerge() &&
2855  // No tail merging opportunities if the block number is less than four.
2856  if (MF.size() > 3 && EnableTailMerge) {
2857  unsigned TailMergeSize = TailDupSize + 1;
2858  BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI,
2859  *MBPI, TailMergeSize);
2860 
2861  if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(),
2862  getAnalysisIfAvailable<MachineModuleInfo>(), MLI,
2863  /*AfterBlockPlacement=*/true)) {
2864  // Redo the layout if tail merging creates/removes/moves blocks.
2865  BlockToChain.clear();
2866  ComputedEdges.clear();
2867  // Must redo the post-dominator tree if blocks were changed.
2868  if (MPDT)
2869  MPDT->runOnMachineFunction(MF);
2870  ChainAllocator.DestroyAll();
2871  buildCFGChains();
2872  }
2873  }
2874 
2875  optimizeBranches();
2876  alignBlocks();
2877 
2878  BlockToChain.clear();
2879  ComputedEdges.clear();
2880  ChainAllocator.DestroyAll();
2881 
2882  if (AlignAllBlock)
2883  // Align all of the blocks in the function to a specific alignment.
2884  for (MachineBasicBlock &MBB : MF)
2886  else if (AlignAllNonFallThruBlocks) {
2887  // Align all of the blocks that have no fall-through predecessors to a
2888  // specific alignment.
2889  for (auto MBI = std::next(MF.begin()), MBE = MF.end(); MBI != MBE; ++MBI) {
2890  auto LayoutPred = std::prev(MBI);
2891  if (!LayoutPred->isSuccessor(&*MBI))
2892  MBI->setAlignment(AlignAllNonFallThruBlocks);
2893  }
2894  }
2895  if (ViewBlockLayoutWithBFI != GVDT_None &&
2896  (ViewBlockFreqFuncName.empty() ||
2897  F->getFunction().getName().equals(ViewBlockFreqFuncName))) {
2898  MBFI->view("MBP." + MF.getName(), false);
2899  }
2900 
2901 
2902  // We always return true as we have no way to track whether the final order
2903  // differs from the original order.
2904  return true;
2905 }
2906 
2907 namespace {
2908 
2909 /// A pass to compute block placement statistics.
2910 ///
2911 /// A separate pass to compute interesting statistics for evaluating block
2912 /// placement. This is separate from the actual placement pass so that they can
2913 /// be computed in the absence of any placement transformations or when using
2914 /// alternative placement strategies.
2915 class MachineBlockPlacementStats : public MachineFunctionPass {
2916  /// A handle to the branch probability pass.
2917  const MachineBranchProbabilityInfo *MBPI;
2918 
2919  /// A handle to the function-wide block frequency pass.
2920  const MachineBlockFrequencyInfo *MBFI;
2921 
2922 public:
2923  static char ID; // Pass identification, replacement for typeid
2924 
2925  MachineBlockPlacementStats() : MachineFunctionPass(ID) {
2927  }
2928 
2929  bool runOnMachineFunction(MachineFunction &F) override;
2930 
2931  void getAnalysisUsage(AnalysisUsage &AU) const override {
2934  AU.setPreservesAll();
2936  }
2937 };
2938 
2939 } // end anonymous namespace
2940 
2942 
2944 
2945 INITIALIZE_PASS_BEGIN(MachineBlockPlacementStats, "block-placement-stats",
2946  "Basic Block Placement Stats", false, false)
2949 INITIALIZE_PASS_END(MachineBlockPlacementStats, "block-placement-stats",
2950  "Basic Block Placement Stats", false, false)
2951 
2952 bool MachineBlockPlacementStats::runOnMachineFunction(MachineFunction &F) {
2953  // Check for single-block functions and skip them.
2954  if (std::next(F.begin()) == F.end())
2955  return false;
2956 
2957  MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
2958  MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
2959 
2960  for (MachineBasicBlock &MBB : F) {
2961  BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
2962  Statistic &NumBranches =
2963  (MBB.succ_size() > 1) ? NumCondBranches : NumUncondBranches;
2964  Statistic &BranchTakenFreq =
2965  (MBB.succ_size() > 1) ? CondBranchTakenFreq : UncondBranchTakenFreq;
2966  for (MachineBasicBlock *Succ : MBB.successors()) {
2967  // Skip if this successor is a fallthrough.
2968  if (MBB.isLayoutSuccessor(Succ))
2969  continue;
2970 
2971  BlockFrequency EdgeFreq =
2972  BlockFreq * MBPI->getEdgeProbability(&MBB, Succ);
2973  ++NumBranches;
2974  BranchTakenFreq += EdgeFreq.getFrequency();
2975  }
2976  }
2977 
2978  return false;
2979 }
bool shouldTailDuplicate(bool IsSimple, MachineBasicBlock &TailBB)
Determine if it is profitable to duplicate this block.
static cl::opt< unsigned > TailDupPlacementThreshold("tail-dup-placement-threshold", cl::desc("Instruction cutoff for tail duplication during layout. " "Tail merging during layout is forced to have a threshold " "that won't conflict."), cl::init(2), cl::Hidden)
const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:233
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:645
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2)
BranchProbability getCompl() const
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:224
typename SuperClass::const_iterator const_iterator
Definition: SmallVector.h:320
This class represents lattice values for constants.
Definition: AllocatorList.h:23
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds...
Definition: Compiler.h:464
static cl::opt< unsigned > MisfetchCost("misfetch-cost", cl::desc("Cost that models the probabilistic risk of an instruction " "misfetch due to a jump comparing to falling through, whose cost " "is zero."), cl::init(1), cl::Hidden)
static BranchProbability getAdjustedProbability(BranchProbability OrigProb, BranchProbability AdjustedSumProb)
The helper function returns the branch probability that is adjusted or normalized over the new total ...
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
unsigned size() const
bool requiresStructuredCFG() const
virtual const TargetLowering * getTargetLowering() const
void initializeMachineBlockPlacementStatsPass(PassRegistry &)
virtual unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const
Insert branch code into the end of the specified MachineBasicBlock.
An efficient, type-erasing, non-owning reference to a callable.
Definition: STLExtras.h:116
uint64_t getFrequency() const
Returns the frequency as a fixpoint number scaled by the entry frequency.
MachineBlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate machine basic b...
STATISTIC(NumFunctions, "Total number of functions")
A debug info location.
Definition: DebugLoc.h:33
F(f)
static BranchProbability getLayoutSuccessorProbThreshold(const MachineBasicBlock *BB)
static BranchProbability getOne()
virtual unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const
Remove the branching code at the end of the specific MBB.
char & MachineBlockPlacementStatsID
MachineBlockPlacementStats - This pass collects statistics about the basic block placement using bran...
void setAlignment(unsigned Align)
Set alignment of the basic block.
CodeGenOpt::Level getOptLevel() const
This file defines the MallocAllocator and BumpPtrAllocator interfaces.
iterator_range< succ_iterator > successors()
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:343
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:221
AnalysisUsage & addRequired()
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:50
void append(SmallVectorImpl< char > &path, const Twine &a, const Twine &b="", const Twine &c="", const Twine &d="")
Append to path.
Definition: Path.cpp:454
static cl::opt< bool > TailDupPlacement("tail-dup-placement", cl::desc("Perform tail duplication during placement. " "Creates more fallthrough opportunites in " "outline branches."), cl::init(true), cl::Hidden)
bool OptimizeFunction(MachineFunction &MF, const TargetInstrInfo *tii, const TargetRegisterInfo *tri, MachineModuleInfo *mmi, MachineLoopInfo *mli=nullptr, bool AfterPlacement=false)
Perhaps branch folding, tail merging and other CFG optimizations on the given function.
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
const HexagonInstrInfo * TII
Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: APFloat.h:41
void removeBlock(MachineBasicBlock *BB)
This method completely removes BB from all data structures, including all of the Loop objects it is n...
block placement Basic Block Placement Stats
Target-Independent Code Generator Pass Configuration Options.
BlockT * getHeader() const
Definition: LoopInfo.h:99
auto reverse(ContainerTy &&C, typename std::enable_if< has_rbegin< ContainerTy >::value >::type *=nullptr) -> decltype(make_range(C.rbegin(), C.rend()))
Definition: STLExtras.h:266
bool canFallThrough()
Return true if the block can implicitly transfer control to the block after it by falling off the end...
void DestroyAll()
Call the destructor of each allocated object and deallocate all but the current slab and reset the cu...
Definition: Allocator.h:462
void initMF(MachineFunction &MF, bool PreRegAlloc, const MachineBranchProbabilityInfo *MBPI, bool LayoutMode, unsigned TailDupSize=0)
Prepare to run on a specific machine function.
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they&#39;re not in a MachineFuncti...
virtual const TargetInstrInfo * getInstrInfo() const
cl::opt< std::string > ViewBlockFreqFuncName
auto count(R &&Range, const E &Element) -> typename std::iterator_traits< decltype(adl_begin(Range))>::difference_type
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1251
TargetInstrInfo - Interface to description of machine instruction set.
static bool isSimpleBB(MachineBasicBlock *TailBB)
True if this BB has only one unconditional jump.
cl::opt< unsigned > ProfileLikelyProb
bool getEnableTailMerge() const
virtual bool alignLoopsWithOptSize() const
Should loops be aligned even when the function is marked OptSize (but not MinSize).
#define P(N)
bool tailDuplicateAndUpdate(bool IsSimple, MachineBasicBlock *MBB, MachineBasicBlock *ForcedLayoutPred, SmallVectorImpl< MachineBasicBlock *> *DuplicatedPreds=nullptr, function_ref< void(MachineBasicBlock *)> *RemovalCallback=nullptr)
Tail duplicate a single basic block into its predecessors, and then clean up.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:423
cl::opt< GVDAGType > ViewBlockLayoutWithBFI
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
StringRef getName() const
Return the name of the corresponding LLVM basic block, or an empty string.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:290
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
succ_reverse_iterator succ_rbegin()
static cl::opt< bool > BranchFoldPlacement("branch-fold-placement", cl::desc("Perform branch folding during placement. " "Reduces code size."), cl::init(true), cl::Hidden)
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
LLVM_NODISCARD bool empty() const
Definition: SmallPtrSet.h:91
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:370
virtual unsigned getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
bool canTailDuplicate(MachineBasicBlock *TailBB, MachineBasicBlock *PredBB)
Returns true if TailBB can successfully be duplicated into PredBB.
static cl::opt< unsigned > TriangleChainCount("triangle-chain-count", cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the " "triangle tail duplication heuristic to kick in. 0 to disable."), cl::init(2), cl::Hidden)
bool dominates(const MachineDomTreeNode *A, const MachineDomTreeNode *B) const
Represent the analysis usage information of a pass.
bool optForSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:597
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:381
static bool greaterWithBias(BlockFrequency A, BlockFrequency B, uint64_t EntryFreq)
Compare 2 BlockFrequency&#39;s with a small penalty for A.
iterator_range< pred_iterator > predecessors()
T * Allocate(size_t num=1)
Allocate space for an array of objects without constructing them.
Definition: Allocator.h:490
auto remove_if(R &&Range, UnaryPredicate P) -> decltype(adl_begin(Range))
Provide wrappers to std::remove_if which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1225
virtual bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const
Analyze the branching code at the end of MBB, returning true if it cannot be understood (e...
iterator erase(const_iterator CI)
Definition: SmallVector.h:438
const MachineBasicBlock & front() const
size_t size() const
Definition: SmallVector.h:52
void initializeMachineBlockPlacementPass(PassRegistry &)
static cl::opt< bool > ForcePreciseRotationCost("force-precise-rotation-cost", cl::desc("Force the use of precise cost " "loop rotation strategy."), cl::init(false), cl::Hidden)
auto find(R &&Range, const T &Val) -> decltype(adl_begin(Range))
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1206
static cl::opt< unsigned > TailMergeSize("tail-merge-size", cl::desc("Min number of instructions to consider tail merging"), cl::init(3), cl::Hidden)
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
Branch Probability Basic Block Placement
This base class for TargetLowering contains the SelectionDAG-independent parts that can be used from ...
size_type size() const
Definition: SmallPtrSet.h:92
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
Definition: LoopInfo.h:109
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:297
Iterator for intrusive lists based on ilist_node.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements...
Definition: SmallPtrSet.h:417
static cl::opt< bool > ForceLoopColdBlock("force-loop-cold-block", cl::desc("Force outlining cold blocks from loops."), cl::init(false), cl::Hidden)
PostDominatorTree Class - Concrete subclass of DominatorTree that is used to compute the post-dominat...
loop rotate
This is a &#39;vector&#39; (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:841
void updateTerminator()
Update the terminator instructions in block to account for changes to the layout. ...
static cl::opt< unsigned > JumpInstCost("jump-inst-cost", cl::desc("Cost of jump instructions."), cl::init(1), cl::Hidden)
static cl::opt< unsigned > TailDupPlacementAggressiveThreshold("tail-dup-placement-aggressive-threshold", cl::desc("Instruction cutoff for aggressive tail duplication during " "layout. Used at -O3. Tail merging during layout is forced to " "have a threshold that won't conflict."), cl::init(4), cl::Hidden)
INITIALIZE_PASS_BEGIN(MachineBlockPlacement, DEBUG_TYPE, "Branch Probability Basic Block Placement", false, false) INITIALIZE_PASS_END(MachineBlockPlacement
A BumpPtrAllocator that allows only elements of a specific type to be allocated.
Definition: Allocator.h:441
unsigned pred_size() const
bool isLayoutSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB will be emitted immediately after this block, such that if this bloc...
const Function & getFunction() const
Return the LLVM function that this machine code represents.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:940
static cl::opt< unsigned > TailDupPlacementPenalty("tail-dup-placement-penalty", cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. " "Copying can increase fallthrough, but it also increases icache " "pressure. This parameter controls the penalty to account for that. " "Percent as integer."), cl::init(2), cl::Hidden)
void setPreservesAll()
Set by analyses that do not transform their input at all.
typename SuperClass::iterator iterator
Definition: SmallVector.h:319
unsigned succ_size() const
Branch Probability Basic Block static false std::string getBlockName(const MachineBasicBlock *BB)
Helper to print the name of a MBB.
cl::opt< unsigned > StaticLikelyProb
LLVM_NODISCARD bool equals(StringRef RHS) const
equals - Check for string equality, this is more efficient than compare() when the relative ordering ...
Definition: StringRef.h:160
BranchProbability getEdgeProbability(const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const
unsigned getNumBlocks() const
Get the number of blocks in this loop in constant time.
Definition: LoopInfo.h:162
unsigned succ_size(const Instruction *I)
Definition: CFG.h:256
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
static cl::opt< unsigned > AlignAllBlock("align-all-blocks", cl::desc("Force the alignment of all " "blocks in the function."), cl::init(0), cl::Hidden)
static uint64_t getMaxFrequency()
Returns the maximum possible frequency, the saturation value.
bool isEHPad() const
Returns true if the block is a landing pad.
LLVM_NODISCARD bool empty() const
Definition: SmallVector.h:55
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:148
StringRef getName() const
Return a constant reference to the value&#39;s name.
Definition: Value.cpp:214
static bool hasSameSuccessors(MachineBasicBlock &BB, SmallPtrSetImpl< const MachineBasicBlock *> &Successors)
Check if BB has exactly the successors in Successors.
#define I(x, y, z)
Definition: MD5.cpp:58
bool optForMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:594
char & MachineBlockPlacementID
MachineBlockPlacement - This pass places basic blocks based on branch probabilities.
block_iterator block_end() const
Definition: LoopInfo.h:154
bool isSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB is a successor of this block.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:171
static cl::opt< unsigned > LoopToColdBlockRatio("loop-to-cold-block-ratio", cl::desc("Outline loop blocks from loop chain if (frequency of loop) / " "(frequency of block) is greater than this ratio"), cl::init(5), cl::Hidden)
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:211
virtual bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const
Reverses the branch condition of the specified condition list, returning false on success and true if...
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:482
#define DEBUG_TYPE
MachineLoop * getLoopFor(const MachineBasicBlock *BB) const
Return the innermost loop that BB lives in.
static BranchProbability getZero()
Utility class to perform tail duplication.
static cl::opt< bool > PreciseRotationCost("precise-rotation-cost", cl::desc("Model the cost of loop rotation more " "precisely by using profile data."), cl::init(false), cl::Hidden)
#define LLVM_DEBUG(X)
Definition: Debug.h:122
for(unsigned i=Desc.getNumOperands(), e=OldMI.getNumOperands();i !=e;++i)
block_iterator block_begin() const
Definition: LoopInfo.h:153
static cl::opt< unsigned > AlignAllNonFallThruBlocks("align-all-nofallthru-blocks", cl::desc("Force the alignment of all " "blocks that have no fall-through predecessors (i.e. don't add " "nops that are executed)."), cl::init(0), cl::Hidden)
bool hasProfileData() const
Return true if the function is annotated with profile data.
Definition: Function.h:307
uint32_t getNumerator() const
static cl::opt< unsigned > ExitBlockBias("block-placement-exit-block-bias", cl::desc("Block frequency percentage a loop exit block needs " "over the original exit to be considered the new exit."), cl::init(0), cl::Hidden)
This file describes how to lower LLVM code to machine code.