LLVM  16.0.0git
MachineBlockPlacement.cpp
Go to the documentation of this file.
1 //===- MachineBlockPlacement.cpp - Basic Block Code Layout optimization ---===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements basic block placement transformations using the CFG
10 // structure and branch probability estimates.
11 //
12 // The pass strives to preserve the structure of the CFG (that is, retain
13 // a topological ordering of basic blocks) in the absence of a *strong* signal
14 // to the contrary from probabilities. However, within the CFG structure, it
15 // attempts to choose an ordering which favors placing more likely sequences of
16 // blocks adjacent to each other.
17 //
18 // The algorithm works from the inner-most loop within a function outward, and
19 // at each stage walks through the basic blocks, trying to coalesce them into
20 // sequential chains where allowed by the CFG (or demanded by heavy
21 // probabilities). Finally, it walks the blocks in topological order, and the
22 // first time it reaches a chain of basic blocks, it schedules them in the
23 // function in-order.
24 //
25 //===----------------------------------------------------------------------===//
26 
27 #include "BranchFolding.h"
28 #include "llvm/ADT/ArrayRef.h"
29 #include "llvm/ADT/DenseMap.h"
30 #include "llvm/ADT/STLExtras.h"
31 #include "llvm/ADT/SetVector.h"
32 #include "llvm/ADT/SmallPtrSet.h"
33 #include "llvm/ADT/SmallVector.h"
34 #include "llvm/ADT/Statistic.h"
51 #include "llvm/IR/DebugLoc.h"
52 #include "llvm/IR/Function.h"
53 #include "llvm/IR/PrintPasses.h"
54 #include "llvm/InitializePasses.h"
55 #include "llvm/Pass.h"
56 #include "llvm/Support/Allocator.h"
59 #include "llvm/Support/CodeGen.h"
61 #include "llvm/Support/Compiler.h"
62 #include "llvm/Support/Debug.h"
66 #include <algorithm>
67 #include <cassert>
68 #include <cstdint>
69 #include <iterator>
70 #include <memory>
71 #include <string>
72 #include <tuple>
73 #include <utility>
74 #include <vector>
75 
76 using namespace llvm;
77 
78 #define DEBUG_TYPE "block-placement"
79 
80 STATISTIC(NumCondBranches, "Number of conditional branches");
81 STATISTIC(NumUncondBranches, "Number of unconditional branches");
82 STATISTIC(CondBranchTakenFreq,
83  "Potential frequency of taking conditional branches");
84 STATISTIC(UncondBranchTakenFreq,
85  "Potential frequency of taking unconditional branches");
86 
88  "align-all-blocks",
89  cl::desc("Force the alignment of all blocks in the function in log2 format "
90  "(e.g 4 means align on 16B boundaries)."),
91  cl::init(0), cl::Hidden);
92 
94  "align-all-nofallthru-blocks",
95  cl::desc("Force the alignment of all blocks that have no fall-through "
96  "predecessors (i.e. don't add nops that are executed). In log2 "
97  "format (e.g 4 means align on 16B boundaries)."),
98  cl::init(0), cl::Hidden);
99 
101  "max-bytes-for-alignment",
102  cl::desc("Forces the maximum bytes allowed to be emitted when padding for "
103  "alignment"),
104  cl::init(0), cl::Hidden);
105 
106 // FIXME: Find a good default for this flag and remove the flag.
108  "block-placement-exit-block-bias",
109  cl::desc("Block frequency percentage a loop exit block needs "
110  "over the original exit to be considered the new exit."),
111  cl::init(0), cl::Hidden);
112 
113 // Definition:
114 // - Outlining: placement of a basic block outside the chain or hot path.
115 
117  "loop-to-cold-block-ratio",
118  cl::desc("Outline loop blocks from loop chain if (frequency of loop) / "
119  "(frequency of block) is greater than this ratio"),
120  cl::init(5), cl::Hidden);
121 
123  "force-loop-cold-block",
124  cl::desc("Force outlining cold blocks from loops."),
125  cl::init(false), cl::Hidden);
126 
127 static cl::opt<bool>
128  PreciseRotationCost("precise-rotation-cost",
129  cl::desc("Model the cost of loop rotation more "
130  "precisely by using profile data."),
131  cl::init(false), cl::Hidden);
132 
133 static cl::opt<bool>
134  ForcePreciseRotationCost("force-precise-rotation-cost",
135  cl::desc("Force the use of precise cost "
136  "loop rotation strategy."),
137  cl::init(false), cl::Hidden);
138 
140  "misfetch-cost",
141  cl::desc("Cost that models the probabilistic risk of an instruction "
142  "misfetch due to a jump comparing to falling through, whose cost "
143  "is zero."),
144  cl::init(1), cl::Hidden);
145 
146 static cl::opt<unsigned> JumpInstCost("jump-inst-cost",
147  cl::desc("Cost of jump instructions."),
148  cl::init(1), cl::Hidden);
149 static cl::opt<bool>
150 TailDupPlacement("tail-dup-placement",
151  cl::desc("Perform tail duplication during placement. "
152  "Creates more fallthrough opportunites in "
153  "outline branches."),
154  cl::init(true), cl::Hidden);
155 
156 static cl::opt<bool>
157 BranchFoldPlacement("branch-fold-placement",
158  cl::desc("Perform branch folding during placement. "
159  "Reduces code size."),
160  cl::init(true), cl::Hidden);
161 
162 // Heuristic for tail duplication.
164  "tail-dup-placement-threshold",
165  cl::desc("Instruction cutoff for tail duplication during layout. "
166  "Tail merging during layout is forced to have a threshold "
167  "that won't conflict."), cl::init(2),
168  cl::Hidden);
169 
170 // Heuristic for aggressive tail duplication.
172  "tail-dup-placement-aggressive-threshold",
173  cl::desc("Instruction cutoff for aggressive tail duplication during "
174  "layout. Used at -O3. Tail merging during layout is forced to "
175  "have a threshold that won't conflict."), cl::init(4),
176  cl::Hidden);
177 
178 // Heuristic for tail duplication.
180  "tail-dup-placement-penalty",
181  cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. "
182  "Copying can increase fallthrough, but it also increases icache "
183  "pressure. This parameter controls the penalty to account for that. "
184  "Percent as integer."),
185  cl::init(2),
186  cl::Hidden);
187 
188 // Heuristic for tail duplication if profile count is used in cost model.
190  "tail-dup-profile-percent-threshold",
191  cl::desc("If profile count information is used in tail duplication cost "
192  "model, the gained fall through number from tail duplication "
193  "should be at least this percent of hot count."),
194  cl::init(50), cl::Hidden);
195 
196 // Heuristic for triangle chains.
198  "triangle-chain-count",
199  cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the "
200  "triangle tail duplication heuristic to kick in. 0 to disable."),
201  cl::init(2),
202  cl::Hidden);
203 
204 // Use case: When block layout is visualized after MBP pass, the basic blocks
205 // are labeled in layout order; meanwhile blocks could be numbered in a
206 // different order. It's hard to map between the graph and pass output.
207 // With this option on, the basic blocks are renumbered in function layout
208 // order. For debugging only.
210  "renumber-blocks-before-view",
211  cl::desc(
212  "If true, basic blocks are re-numbered before MBP layout is printed "
213  "into a dot graph. Only used when a function is being printed."),
214  cl::init(false), cl::Hidden);
215 
218 
219 namespace llvm {
222 
223 // Internal option used to control BFI display only after MBP pass.
224 // Defined in CodeGen/MachineBlockFrequencyInfo.cpp:
225 // -view-block-layout-with-bfi=
227 
228 // Command line option to specify the name of the function for CFG dump
229 // Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name=
231 } // namespace llvm
232 
233 namespace {
234 
235 class BlockChain;
236 
237 /// Type for our function-wide basic block -> block chain mapping.
238 using BlockToChainMapType = DenseMap<const MachineBasicBlock *, BlockChain *>;
239 
240 /// A chain of blocks which will be laid out contiguously.
241 ///
242 /// This is the datastructure representing a chain of consecutive blocks that
243 /// are profitable to layout together in order to maximize fallthrough
244 /// probabilities and code locality. We also can use a block chain to represent
245 /// a sequence of basic blocks which have some external (correctness)
246 /// requirement for sequential layout.
247 ///
248 /// Chains can be built around a single basic block and can be merged to grow
249 /// them. They participate in a block-to-chain mapping, which is updated
250 /// automatically as chains are merged together.
251 class BlockChain {
252  /// The sequence of blocks belonging to this chain.
253  ///
254  /// This is the sequence of blocks for a particular chain. These will be laid
255  /// out in-order within the function.
257 
258  /// A handle to the function-wide basic block to block chain mapping.
259  ///
260  /// This is retained in each block chain to simplify the computation of child
261  /// block chains for SCC-formation and iteration. We store the edges to child
262  /// basic blocks, and map them back to their associated chains using this
263  /// structure.
264  BlockToChainMapType &BlockToChain;
265 
266 public:
267  /// Construct a new BlockChain.
268  ///
269  /// This builds a new block chain representing a single basic block in the
270  /// function. It also registers itself as the chain that block participates
271  /// in with the BlockToChain mapping.
272  BlockChain(BlockToChainMapType &BlockToChain, MachineBasicBlock *BB)
273  : Blocks(1, BB), BlockToChain(BlockToChain) {
274  assert(BB && "Cannot create a chain with a null basic block");
275  BlockToChain[BB] = this;
276  }
277 
278  /// Iterator over blocks within the chain.
281 
282  /// Beginning of blocks within the chain.
283  iterator begin() { return Blocks.begin(); }
284  const_iterator begin() const { return Blocks.begin(); }
285 
286  /// End of blocks within the chain.
287  iterator end() { return Blocks.end(); }
288  const_iterator end() const { return Blocks.end(); }
289 
290  bool remove(MachineBasicBlock* BB) {
291  for(iterator i = begin(); i != end(); ++i) {
292  if (*i == BB) {
293  Blocks.erase(i);
294  return true;
295  }
296  }
297  return false;
298  }
299 
300  /// Merge a block chain into this one.
301  ///
302  /// This routine merges a block chain into this one. It takes care of forming
303  /// a contiguous sequence of basic blocks, updating the edge list, and
304  /// updating the block -> chain mapping. It does not free or tear down the
305  /// old chain, but the old chain's block list is no longer valid.
306  void merge(MachineBasicBlock *BB, BlockChain *Chain) {
307  assert(BB && "Can't merge a null block.");
308  assert(!Blocks.empty() && "Can't merge into an empty chain.");
309 
310  // Fast path in case we don't have a chain already.
311  if (!Chain) {
312  assert(!BlockToChain[BB] &&
313  "Passed chain is null, but BB has entry in BlockToChain.");
314  Blocks.push_back(BB);
315  BlockToChain[BB] = this;
316  return;
317  }
318 
319  assert(BB == *Chain->begin() && "Passed BB is not head of Chain.");
320  assert(Chain->begin() != Chain->end());
321 
322  // Update the incoming blocks to point to this chain, and add them to the
323  // chain structure.
324  for (MachineBasicBlock *ChainBB : *Chain) {
325  Blocks.push_back(ChainBB);
326  assert(BlockToChain[ChainBB] == Chain && "Incoming blocks not in chain.");
327  BlockToChain[ChainBB] = this;
328  }
329  }
330 
331 #ifndef NDEBUG
332  /// Dump the blocks in this chain.
333  LLVM_DUMP_METHOD void dump() {
334  for (MachineBasicBlock *MBB : *this)
335  MBB->dump();
336  }
337 #endif // NDEBUG
338 
339  /// Count of predecessors of any block within the chain which have not
340  /// yet been scheduled. In general, we will delay scheduling this chain
341  /// until those predecessors are scheduled (or we find a sufficiently good
342  /// reason to override this heuristic.) Note that when forming loop chains,
343  /// blocks outside the loop are ignored and treated as if they were already
344  /// scheduled.
345  ///
346  /// Note: This field is reinitialized multiple times - once for each loop,
347  /// and then once for the function as a whole.
348  unsigned UnscheduledPredecessors = 0;
349 };
350 
351 class MachineBlockPlacement : public MachineFunctionPass {
352  /// A type for a block filter set.
353  using BlockFilterSet = SmallSetVector<const MachineBasicBlock *, 16>;
354 
355  /// Pair struct containing basic block and taildup profitability
356  struct BlockAndTailDupResult {
358  bool ShouldTailDup;
359  };
360 
361  /// Triple struct containing edge weight and the edge.
362  struct WeightedEdge {
363  BlockFrequency Weight;
364  MachineBasicBlock *Src;
365  MachineBasicBlock *Dest;
366  };
367 
368  /// work lists of blocks that are ready to be laid out
371 
372  /// Edges that have already been computed as optimal.
374 
375  /// Machine Function
377 
378  /// A handle to the branch probability pass.
379  const MachineBranchProbabilityInfo *MBPI;
380 
381  /// A handle to the function-wide block frequency pass.
382  std::unique_ptr<MBFIWrapper> MBFI;
383 
384  /// A handle to the loop info.
385  MachineLoopInfo *MLI;
386 
387  /// Preferred loop exit.
388  /// Member variable for convenience. It may be removed by duplication deep
389  /// in the call stack.
390  MachineBasicBlock *PreferredLoopExit;
391 
392  /// A handle to the target's instruction info.
393  const TargetInstrInfo *TII;
394 
395  /// A handle to the target's lowering info.
396  const TargetLoweringBase *TLI;
397 
398  /// A handle to the post dominator tree.
400 
401  ProfileSummaryInfo *PSI;
402 
403  /// Duplicator used to duplicate tails during placement.
404  ///
405  /// Placement decisions can open up new tail duplication opportunities, but
406  /// since tail duplication affects placement decisions of later blocks, it
407  /// must be done inline.
408  TailDuplicator TailDup;
409 
410  /// Partial tail duplication threshold.
411  BlockFrequency DupThreshold;
412 
413  /// True: use block profile count to compute tail duplication cost.
414  /// False: use block frequency to compute tail duplication cost.
415  bool UseProfileCount;
416 
417  /// Allocator and owner of BlockChain structures.
418  ///
419  /// We build BlockChains lazily while processing the loop structure of
420  /// a function. To reduce malloc traffic, we allocate them using this
421  /// slab-like allocator, and destroy them after the pass completes. An
422  /// important guarantee is that this allocator produces stable pointers to
423  /// the chains.
425 
426  /// Function wide BasicBlock to BlockChain mapping.
427  ///
428  /// This mapping allows efficiently moving from any given basic block to the
429  /// BlockChain it participates in, if any. We use it to, among other things,
430  /// allow implicitly defining edges between chains as the existing edges
431  /// between basic blocks.
433 
434 #ifndef NDEBUG
435  /// The set of basic blocks that have terminators that cannot be fully
436  /// analyzed. These basic blocks cannot be re-ordered safely by
437  /// MachineBlockPlacement, and we must preserve physical layout of these
438  /// blocks and their successors through the pass.
439  SmallPtrSet<MachineBasicBlock *, 4> BlocksWithUnanalyzableExits;
440 #endif
441 
442  /// Get block profile count or frequency according to UseProfileCount.
443  /// The return value is used to model tail duplication cost.
444  BlockFrequency getBlockCountOrFrequency(const MachineBasicBlock *BB) {
445  if (UseProfileCount) {
446  auto Count = MBFI->getBlockProfileCount(BB);
447  if (Count)
448  return *Count;
449  else
450  return 0;
451  } else
452  return MBFI->getBlockFreq(BB);
453  }
454 
455  /// Scale the DupThreshold according to basic block size.
456  BlockFrequency scaleThreshold(MachineBasicBlock *BB);
457  void initDupThreshold();
458 
459  /// Decrease the UnscheduledPredecessors count for all blocks in chain, and
460  /// if the count goes to 0, add them to the appropriate work list.
461  void markChainSuccessors(
462  const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
463  const BlockFilterSet *BlockFilter = nullptr);
464 
465  /// Decrease the UnscheduledPredecessors count for a single block, and
466  /// if the count goes to 0, add them to the appropriate work list.
467  void markBlockSuccessors(
468  const BlockChain &Chain, const MachineBasicBlock *BB,
469  const MachineBasicBlock *LoopHeaderBB,
470  const BlockFilterSet *BlockFilter = nullptr);
471 
473  collectViableSuccessors(
474  const MachineBasicBlock *BB, const BlockChain &Chain,
475  const BlockFilterSet *BlockFilter,
477  bool isBestSuccessor(MachineBasicBlock *BB, MachineBasicBlock *Pred,
478  BlockFilterSet *BlockFilter);
479  void findDuplicateCandidates(SmallVectorImpl<MachineBasicBlock *> &Candidates,
481  BlockFilterSet *BlockFilter);
482  bool repeatedlyTailDuplicateBlock(
484  const MachineBasicBlock *LoopHeaderBB,
485  BlockChain &Chain, BlockFilterSet *BlockFilter,
486  MachineFunction::iterator &PrevUnplacedBlockIt);
487  bool maybeTailDuplicateBlock(
489  BlockChain &Chain, BlockFilterSet *BlockFilter,
490  MachineFunction::iterator &PrevUnplacedBlockIt,
491  bool &DuplicatedToLPred);
492  bool hasBetterLayoutPredecessor(
493  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
494  const BlockChain &SuccChain, BranchProbability SuccProb,
495  BranchProbability RealSuccProb, const BlockChain &Chain,
496  const BlockFilterSet *BlockFilter);
497  BlockAndTailDupResult selectBestSuccessor(
498  const MachineBasicBlock *BB, const BlockChain &Chain,
499  const BlockFilterSet *BlockFilter);
500  MachineBasicBlock *selectBestCandidateBlock(
501  const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList);
502  MachineBasicBlock *getFirstUnplacedBlock(
503  const BlockChain &PlacedChain,
504  MachineFunction::iterator &PrevUnplacedBlockIt,
505  const BlockFilterSet *BlockFilter);
506 
507  /// Add a basic block to the work list if it is appropriate.
508  ///
509  /// If the optional parameter BlockFilter is provided, only MBB
510  /// present in the set will be added to the worklist. If nullptr
511  /// is provided, no filtering occurs.
512  void fillWorkLists(const MachineBasicBlock *MBB,
513  SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
514  const BlockFilterSet *BlockFilter);
515 
516  void buildChain(const MachineBasicBlock *BB, BlockChain &Chain,
517  BlockFilterSet *BlockFilter = nullptr);
518  bool canMoveBottomBlockToTop(const MachineBasicBlock *BottomBlock,
519  const MachineBasicBlock *OldTop);
520  bool hasViableTopFallthrough(const MachineBasicBlock *Top,
521  const BlockFilterSet &LoopBlockSet);
522  BlockFrequency TopFallThroughFreq(const MachineBasicBlock *Top,
523  const BlockFilterSet &LoopBlockSet);
524  BlockFrequency FallThroughGains(const MachineBasicBlock *NewTop,
525  const MachineBasicBlock *OldTop,
526  const MachineBasicBlock *ExitBB,
527  const BlockFilterSet &LoopBlockSet);
528  MachineBasicBlock *findBestLoopTopHelper(MachineBasicBlock *OldTop,
529  const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
530  MachineBasicBlock *findBestLoopTop(
531  const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
532  MachineBasicBlock *findBestLoopExit(
533  const MachineLoop &L, const BlockFilterSet &LoopBlockSet,
534  BlockFrequency &ExitFreq);
535  BlockFilterSet collectLoopBlockSet(const MachineLoop &L);
536  void buildLoopChains(const MachineLoop &L);
537  void rotateLoop(
538  BlockChain &LoopChain, const MachineBasicBlock *ExitingBB,
539  BlockFrequency ExitFreq, const BlockFilterSet &LoopBlockSet);
540  void rotateLoopWithProfile(
541  BlockChain &LoopChain, const MachineLoop &L,
542  const BlockFilterSet &LoopBlockSet);
543  void buildCFGChains();
544  void optimizeBranches();
545  void alignBlocks();
546  /// Returns true if a block should be tail-duplicated to increase fallthrough
547  /// opportunities.
548  bool shouldTailDuplicate(MachineBasicBlock *BB);
549  /// Check the edge frequencies to see if tail duplication will increase
550  /// fallthroughs.
551  bool isProfitableToTailDup(
552  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
553  BranchProbability QProb,
554  const BlockChain &Chain, const BlockFilterSet *BlockFilter);
555 
556  /// Check for a trellis layout.
557  bool isTrellis(const MachineBasicBlock *BB,
558  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
559  const BlockChain &Chain, const BlockFilterSet *BlockFilter);
560 
561  /// Get the best successor given a trellis layout.
562  BlockAndTailDupResult getBestTrellisSuccessor(
563  const MachineBasicBlock *BB,
564  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
565  BranchProbability AdjustedSumProb, const BlockChain &Chain,
566  const BlockFilterSet *BlockFilter);
567 
568  /// Get the best pair of non-conflicting edges.
569  static std::pair<WeightedEdge, WeightedEdge> getBestNonConflictingEdges(
570  const MachineBasicBlock *BB,
572 
573  /// Returns true if a block can tail duplicate into all unplaced
574  /// predecessors. Filters based on loop.
575  bool canTailDuplicateUnplacedPreds(
576  const MachineBasicBlock *BB, MachineBasicBlock *Succ,
577  const BlockChain &Chain, const BlockFilterSet *BlockFilter);
578 
579  /// Find chains of triangles to tail-duplicate where a global analysis works,
580  /// but a local analysis would not find them.
581  void precomputeTriangleChains();
582 
583  /// Apply a post-processing step optimizing block placement.
584  void applyExtTsp();
585 
586  /// Modify the existing block placement in the function and adjust all jumps.
587  void assignBlockOrder(const std::vector<const MachineBasicBlock *> &NewOrder);
588 
589  /// Create a single CFG chain from the current block order.
590  void createCFGChainExtTsp();
591 
592 public:
593  static char ID; // Pass identification, replacement for typeid
594 
595  MachineBlockPlacement() : MachineFunctionPass(ID) {
597  }
598 
599  bool runOnMachineFunction(MachineFunction &F) override;
600 
601  bool allowTailDupPlacement() const {
602  assert(F);
603  return TailDupPlacement && !F->getTarget().requiresStructuredCFG();
604  }
605 
606  void getAnalysisUsage(AnalysisUsage &AU) const override {
609  if (TailDupPlacement)
615  }
616 };
617 
618 } // end anonymous namespace
619 
621 
623 
624 INITIALIZE_PASS_BEGIN(MachineBlockPlacement, DEBUG_TYPE,
625  "Branch Probability Basic Block Placement", false, false)
631 INITIALIZE_PASS_END(MachineBlockPlacement, DEBUG_TYPE,
632  "Branch Probability Basic Block Placement", false, false)
633 
634 #ifndef NDEBUG
635 /// Helper to print the name of a MBB.
636 ///
637 /// Only used by debug logging.
638 static std::string getBlockName(const MachineBasicBlock *BB) {
639  std::string Result;
640  raw_string_ostream OS(Result);
641  OS << printMBBReference(*BB);
642  OS << " ('" << BB->getName() << "')";
643  OS.flush();
644  return Result;
645 }
646 #endif
647 
648 /// Mark a chain's successors as having one fewer preds.
649 ///
650 /// When a chain is being merged into the "placed" chain, this routine will
651 /// quickly walk the successors of each block in the chain and mark them as
652 /// having one fewer active predecessor. It also adds any successors of this
653 /// chain which reach the zero-predecessor state to the appropriate worklist.
654 void MachineBlockPlacement::markChainSuccessors(
655  const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
656  const BlockFilterSet *BlockFilter) {
657  // Walk all the blocks in this chain, marking their successors as having
658  // a predecessor placed.
659  for (MachineBasicBlock *MBB : Chain) {
660  markBlockSuccessors(Chain, MBB, LoopHeaderBB, BlockFilter);
661  }
662 }
663 
664 /// Mark a single block's successors as having one fewer preds.
665 ///
666 /// Under normal circumstances, this is only called by markChainSuccessors,
667 /// but if a block that was to be placed is completely tail-duplicated away,
668 /// and was duplicated into the chain end, we need to redo markBlockSuccessors
669 /// for just that block.
670 void MachineBlockPlacement::markBlockSuccessors(
671  const BlockChain &Chain, const MachineBasicBlock *MBB,
672  const MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter) {
673  // Add any successors for which this is the only un-placed in-loop
674  // predecessor to the worklist as a viable candidate for CFG-neutral
675  // placement. No subsequent placement of this block will violate the CFG
676  // shape, so we get to use heuristics to choose a favorable placement.
677  for (MachineBasicBlock *Succ : MBB->successors()) {
678  if (BlockFilter && !BlockFilter->count(Succ))
679  continue;
680  BlockChain &SuccChain = *BlockToChain[Succ];
681  // Disregard edges within a fixed chain, or edges to the loop header.
682  if (&Chain == &SuccChain || Succ == LoopHeaderBB)
683  continue;
684 
685  // This is a cross-chain edge that is within the loop, so decrement the
686  // loop predecessor count of the destination chain.
687  if (SuccChain.UnscheduledPredecessors == 0 ||
688  --SuccChain.UnscheduledPredecessors > 0)
689  continue;
690 
691  auto *NewBB = *SuccChain.begin();
692  if (NewBB->isEHPad())
693  EHPadWorkList.push_back(NewBB);
694  else
695  BlockWorkList.push_back(NewBB);
696  }
697 }
698 
699 /// This helper function collects the set of successors of block
700 /// \p BB that are allowed to be its layout successors, and return
701 /// the total branch probability of edges from \p BB to those
702 /// blocks.
703 BranchProbability MachineBlockPlacement::collectViableSuccessors(
704  const MachineBasicBlock *BB, const BlockChain &Chain,
705  const BlockFilterSet *BlockFilter,
707  // Adjust edge probabilities by excluding edges pointing to blocks that is
708  // either not in BlockFilter or is already in the current chain. Consider the
709  // following CFG:
710  //
711  // --->A
712  // | / \
713  // | B C
714  // | \ / \
715  // ----D E
716  //
717  // Assume A->C is very hot (>90%), and C->D has a 50% probability, then after
718  // A->C is chosen as a fall-through, D won't be selected as a successor of C
719  // due to CFG constraint (the probability of C->D is not greater than
720  // HotProb to break topo-order). If we exclude E that is not in BlockFilter
721  // when calculating the probability of C->D, D will be selected and we
722  // will get A C D B as the layout of this loop.
723  auto AdjustedSumProb = BranchProbability::getOne();
724  for (MachineBasicBlock *Succ : BB->successors()) {
725  bool SkipSucc = false;
726  if (Succ->isEHPad() || (BlockFilter && !BlockFilter->count(Succ))) {
727  SkipSucc = true;
728  } else {
729  BlockChain *SuccChain = BlockToChain[Succ];
730  if (SuccChain == &Chain) {
731  SkipSucc = true;
732  } else if (Succ != *SuccChain->begin()) {
733  LLVM_DEBUG(dbgs() << " " << getBlockName(Succ)
734  << " -> Mid chain!\n");
735  continue;
736  }
737  }
738  if (SkipSucc)
739  AdjustedSumProb -= MBPI->getEdgeProbability(BB, Succ);
740  else
741  Successors.push_back(Succ);
742  }
743 
744  return AdjustedSumProb;
745 }
746 
747 /// The helper function returns the branch probability that is adjusted
748 /// or normalized over the new total \p AdjustedSumProb.
749 static BranchProbability
751  BranchProbability AdjustedSumProb) {
752  BranchProbability SuccProb;
753  uint32_t SuccProbN = OrigProb.getNumerator();
754  uint32_t SuccProbD = AdjustedSumProb.getNumerator();
755  if (SuccProbN >= SuccProbD)
756  SuccProb = BranchProbability::getOne();
757  else
758  SuccProb = BranchProbability(SuccProbN, SuccProbD);
759 
760  return SuccProb;
761 }
762 
763 /// Check if \p BB has exactly the successors in \p Successors.
764 static bool
767  if (BB.succ_size() != Successors.size())
768  return false;
769  // We don't want to count self-loops
770  if (Successors.count(&BB))
771  return false;
772  for (MachineBasicBlock *Succ : BB.successors())
773  if (!Successors.count(Succ))
774  return false;
775  return true;
776 }
777 
778 /// Check if a block should be tail duplicated to increase fallthrough
779 /// opportunities.
780 /// \p BB Block to check.
781 bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
782  // Blocks with single successors don't create additional fallthrough
783  // opportunities. Don't duplicate them. TODO: When conditional exits are
784  // analyzable, allow them to be duplicated.
785  bool IsSimple = TailDup.isSimpleBB(BB);
786 
787  if (BB->succ_size() == 1)
788  return false;
789  return TailDup.shouldTailDuplicate(IsSimple, *BB);
790 }
791 
792 /// Compare 2 BlockFrequency's with a small penalty for \p A.
793 /// In order to be conservative, we apply a X% penalty to account for
794 /// increased icache pressure and static heuristics. For small frequencies
795 /// we use only the numerators to improve accuracy. For simplicity, we assume the
796 /// penalty is less than 100%
797 /// TODO(iteratee): Use 64-bit fixed point edge frequencies everywhere.
799  uint64_t EntryFreq) {
800  BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
801  BlockFrequency Gain = A - B;
802  return (Gain / ThresholdProb).getFrequency() >= EntryFreq;
803 }
804 
805 /// Check the edge frequencies to see if tail duplication will increase
806 /// fallthroughs. It only makes sense to call this function when
807 /// \p Succ would not be chosen otherwise. Tail duplication of \p Succ is
808 /// always locally profitable if we would have picked \p Succ without
809 /// considering duplication.
810 bool MachineBlockPlacement::isProfitableToTailDup(
811  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
812  BranchProbability QProb,
813  const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
814  // We need to do a probability calculation to make sure this is profitable.
815  // First: does succ have a successor that post-dominates? This affects the
816  // calculation. The 2 relevant cases are:
817  // BB BB
818  // | \Qout | \Qout
819  // P| C |P C
820  // = C' = C'
821  // | /Qin | /Qin
822  // | / | /
823  // Succ Succ
824  // / \ | \ V
825  // U/ =V |U \
826  // / \ = D
827  // D E | /
828  // | /
829  // |/
830  // PDom
831  // '=' : Branch taken for that CFG edge
832  // In the second case, Placing Succ while duplicating it into C prevents the
833  // fallthrough of Succ into either D or PDom, because they now have C as an
834  // unplaced predecessor
835 
836  // Start by figuring out which case we fall into
837  MachineBasicBlock *PDom = nullptr;
839  // Only scan the relevant successors
840  auto AdjustedSuccSumProb =
841  collectViableSuccessors(Succ, Chain, BlockFilter, SuccSuccs);
842  BranchProbability PProb = MBPI->getEdgeProbability(BB, Succ);
843  auto BBFreq = MBFI->getBlockFreq(BB);
844  auto SuccFreq = MBFI->getBlockFreq(Succ);
845  BlockFrequency P = BBFreq * PProb;
846  BlockFrequency Qout = BBFreq * QProb;
847  uint64_t EntryFreq = MBFI->getEntryFreq();
848  // If there are no more successors, it is profitable to copy, as it strictly
849  // increases fallthrough.
850  if (SuccSuccs.size() == 0)
851  return greaterWithBias(P, Qout, EntryFreq);
852 
853  auto BestSuccSucc = BranchProbability::getZero();
854  // Find the PDom or the best Succ if no PDom exists.
855  for (MachineBasicBlock *SuccSucc : SuccSuccs) {
856  auto Prob = MBPI->getEdgeProbability(Succ, SuccSucc);
857  if (Prob > BestSuccSucc)
858  BestSuccSucc = Prob;
859  if (PDom == nullptr)
860  if (MPDT->dominates(SuccSucc, Succ)) {
861  PDom = SuccSucc;
862  break;
863  }
864  }
865  // For the comparisons, we need to know Succ's best incoming edge that isn't
866  // from BB.
867  auto SuccBestPred = BlockFrequency(0);
868  for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
869  if (SuccPred == Succ || SuccPred == BB
870  || BlockToChain[SuccPred] == &Chain
871  || (BlockFilter && !BlockFilter->count(SuccPred)))
872  continue;
873  auto Freq = MBFI->getBlockFreq(SuccPred)
874  * MBPI->getEdgeProbability(SuccPred, Succ);
875  if (Freq > SuccBestPred)
876  SuccBestPred = Freq;
877  }
878  // Qin is Succ's best unplaced incoming edge that isn't BB
879  BlockFrequency Qin = SuccBestPred;
880  // If it doesn't have a post-dominating successor, here is the calculation:
881  // BB BB
882  // | \Qout | \
883  // P| C | =
884  // = C' | C
885  // | /Qin | |
886  // | / | C' (+Succ)
887  // Succ Succ /|
888  // / \ | \/ |
889  // U/ =V | == |
890  // / \ | / \|
891  // D E D E
892  // '=' : Branch taken for that CFG edge
893  // Cost in the first case is: P + V
894  // For this calculation, we always assume P > Qout. If Qout > P
895  // The result of this function will be ignored at the caller.
896  // Let F = SuccFreq - Qin
897  // Cost in the second case is: Qout + min(Qin, F) * U + max(Qin, F) * V
898 
899  if (PDom == nullptr || !Succ->isSuccessor(PDom)) {
900  BranchProbability UProb = BestSuccSucc;
901  BranchProbability VProb = AdjustedSuccSumProb - UProb;
902  BlockFrequency F = SuccFreq - Qin;
903  BlockFrequency V = SuccFreq * VProb;
904  BlockFrequency QinU = std::min(Qin, F) * UProb;
905  BlockFrequency BaseCost = P + V;
906  BlockFrequency DupCost = Qout + QinU + std::max(Qin, F) * VProb;
907  return greaterWithBias(BaseCost, DupCost, EntryFreq);
908  }
909  BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom);
910  BranchProbability VProb = AdjustedSuccSumProb - UProb;
911  BlockFrequency U = SuccFreq * UProb;
912  BlockFrequency V = SuccFreq * VProb;
913  BlockFrequency F = SuccFreq - Qin;
914  // If there is a post-dominating successor, here is the calculation:
915  // BB BB BB BB
916  // | \Qout | \ | \Qout | \
917  // |P C | = |P C | =
918  // = C' |P C = C' |P C
919  // | /Qin | | | /Qin | |
920  // | / | C' (+Succ) | / | C' (+Succ)
921  // Succ Succ /| Succ Succ /|
922  // | \ V | \/ | | \ V | \/ |
923  // |U \ |U /\ =? |U = |U /\ |
924  // = D = = =?| | D | = =|
925  // | / |/ D | / |/ D
926  // | / | / | = | /
927  // |/ | / |/ | =
928  // Dom Dom Dom Dom
929  // '=' : Branch taken for that CFG edge
930  // The cost for taken branches in the first case is P + U
931  // Let F = SuccFreq - Qin
932  // The cost in the second case (assuming independence), given the layout:
933  // BB, Succ, (C+Succ), D, Dom or the layout:
934  // BB, Succ, D, Dom, (C+Succ)
935  // is Qout + max(F, Qin) * U + min(F, Qin)
936  // compare P + U vs Qout + P * U + Qin.
937  //
938  // The 3rd and 4th cases cover when Dom would be chosen to follow Succ.
939  //
940  // For the 3rd case, the cost is P + 2 * V
941  // For the 4th case, the cost is Qout + min(Qin, F) * U + max(Qin, F) * V + V
942  // We choose 4 over 3 when (P + V) > Qout + min(Qin, F) * U + max(Qin, F) * V
943  if (UProb > AdjustedSuccSumProb / 2 &&
944  !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom], UProb, UProb,
945  Chain, BlockFilter))
946  // Cases 3 & 4
947  return greaterWithBias(
948  (P + V), (Qout + std::max(Qin, F) * VProb + std::min(Qin, F) * UProb),
949  EntryFreq);
950  // Cases 1 & 2
951  return greaterWithBias((P + U),
952  (Qout + std::min(Qin, F) * AdjustedSuccSumProb +
953  std::max(Qin, F) * UProb),
954  EntryFreq);
955 }
956 
957 /// Check for a trellis layout. \p BB is the upper part of a trellis if its
958 /// successors form the lower part of a trellis. A successor set S forms the
959 /// lower part of a trellis if all of the predecessors of S are either in S or
960 /// have all of S as successors. We ignore trellises where BB doesn't have 2
961 /// successors because for fewer than 2, it's trivial, and for 3 or greater they
962 /// are very uncommon and complex to compute optimally. Allowing edges within S
963 /// is not strictly a trellis, but the same algorithm works, so we allow it.
964 bool MachineBlockPlacement::isTrellis(
965  const MachineBasicBlock *BB,
966  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
967  const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
968  // Technically BB could form a trellis with branching factor higher than 2.
969  // But that's extremely uncommon.
970  if (BB->succ_size() != 2 || ViableSuccs.size() != 2)
971  return false;
972 
973  SmallPtrSet<const MachineBasicBlock *, 2> Successors(BB->succ_begin(),
974  BB->succ_end());
975  // To avoid reviewing the same predecessors twice.
977 
978  for (MachineBasicBlock *Succ : ViableSuccs) {
979  int PredCount = 0;
980  for (auto *SuccPred : Succ->predecessors()) {
981  // Allow triangle successors, but don't count them.
982  if (Successors.count(SuccPred)) {
983  // Make sure that it is actually a triangle.
984  for (MachineBasicBlock *CheckSucc : SuccPred->successors())
985  if (!Successors.count(CheckSucc))
986  return false;
987  continue;
988  }
989  const BlockChain *PredChain = BlockToChain[SuccPred];
990  if (SuccPred == BB || (BlockFilter && !BlockFilter->count(SuccPred)) ||
991  PredChain == &Chain || PredChain == BlockToChain[Succ])
992  continue;
993  ++PredCount;
994  // Perform the successor check only once.
995  if (!SeenPreds.insert(SuccPred).second)
996  continue;
997  if (!hasSameSuccessors(*SuccPred, Successors))
998  return false;
999  }
1000  // If one of the successors has only BB as a predecessor, it is not a
1001  // trellis.
1002  if (PredCount < 1)
1003  return false;
1004  }
1005  return true;
1006 }
1007 
1008 /// Pick the highest total weight pair of edges that can both be laid out.
1009 /// The edges in \p Edges[0] are assumed to have a different destination than
1010 /// the edges in \p Edges[1]. Simple counting shows that the best pair is either
1011 /// the individual highest weight edges to the 2 different destinations, or in
1012 /// case of a conflict, one of them should be replaced with a 2nd best edge.
1013 std::pair<MachineBlockPlacement::WeightedEdge,
1014  MachineBlockPlacement::WeightedEdge>
1015 MachineBlockPlacement::getBestNonConflictingEdges(
1016  const MachineBasicBlock *BB,
1018  Edges) {
1019  // Sort the edges, and then for each successor, find the best incoming
1020  // predecessor. If the best incoming predecessors aren't the same,
1021  // then that is clearly the best layout. If there is a conflict, one of the
1022  // successors will have to fallthrough from the second best predecessor. We
1023  // compare which combination is better overall.
1024 
1025  // Sort for highest frequency.
1026  auto Cmp = [](WeightedEdge A, WeightedEdge B) { return A.Weight > B.Weight; };
1027 
1028  llvm::stable_sort(Edges[0], Cmp);
1029  llvm::stable_sort(Edges[1], Cmp);
1030  auto BestA = Edges[0].begin();
1031  auto BestB = Edges[1].begin();
1032  // Arrange for the correct answer to be in BestA and BestB
1033  // If the 2 best edges don't conflict, the answer is already there.
1034  if (BestA->Src == BestB->Src) {
1035  // Compare the total fallthrough of (Best + Second Best) for both pairs
1036  auto SecondBestA = std::next(BestA);
1037  auto SecondBestB = std::next(BestB);
1038  BlockFrequency BestAScore = BestA->Weight + SecondBestB->Weight;
1039  BlockFrequency BestBScore = BestB->Weight + SecondBestA->Weight;
1040  if (BestAScore < BestBScore)
1041  BestA = SecondBestA;
1042  else
1043  BestB = SecondBestB;
1044  }
1045  // Arrange for the BB edge to be in BestA if it exists.
1046  if (BestB->Src == BB)
1047  std::swap(BestA, BestB);
1048  return std::make_pair(*BestA, *BestB);
1049 }
1050 
1051 /// Get the best successor from \p BB based on \p BB being part of a trellis.
1052 /// We only handle trellises with 2 successors, so the algorithm is
1053 /// straightforward: Find the best pair of edges that don't conflict. We find
1054 /// the best incoming edge for each successor in the trellis. If those conflict,
1055 /// we consider which of them should be replaced with the second best.
1056 /// Upon return the two best edges will be in \p BestEdges. If one of the edges
1057 /// comes from \p BB, it will be in \p BestEdges[0]
1058 MachineBlockPlacement::BlockAndTailDupResult
1059 MachineBlockPlacement::getBestTrellisSuccessor(
1060  const MachineBasicBlock *BB,
1061  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
1062  BranchProbability AdjustedSumProb, const BlockChain &Chain,
1063  const BlockFilterSet *BlockFilter) {
1064 
1065  BlockAndTailDupResult Result = {nullptr, false};
1066  SmallPtrSet<const MachineBasicBlock *, 4> Successors(BB->succ_begin(),
1067  BB->succ_end());
1068 
1069  // We assume size 2 because it's common. For general n, we would have to do
1070  // the Hungarian algorithm, but it's not worth the complexity because more
1071  // than 2 successors is fairly uncommon, and a trellis even more so.
1072  if (Successors.size() != 2 || ViableSuccs.size() != 2)
1073  return Result;
1074 
1075  // Collect the edge frequencies of all edges that form the trellis.
1077  int SuccIndex = 0;
1078  for (auto *Succ : ViableSuccs) {
1079  for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
1080  // Skip any placed predecessors that are not BB
1081  if (SuccPred != BB)
1082  if ((BlockFilter && !BlockFilter->count(SuccPred)) ||
1083  BlockToChain[SuccPred] == &Chain ||
1084  BlockToChain[SuccPred] == BlockToChain[Succ])
1085  continue;
1086  BlockFrequency EdgeFreq = MBFI->getBlockFreq(SuccPred) *
1087  MBPI->getEdgeProbability(SuccPred, Succ);
1088  Edges[SuccIndex].push_back({EdgeFreq, SuccPred, Succ});
1089  }
1090  ++SuccIndex;
1091  }
1092 
1093  // Pick the best combination of 2 edges from all the edges in the trellis.
1094  WeightedEdge BestA, BestB;
1095  std::tie(BestA, BestB) = getBestNonConflictingEdges(BB, Edges);
1096 
1097  if (BestA.Src != BB) {
1098  // If we have a trellis, and BB doesn't have the best fallthrough edges,
1099  // we shouldn't choose any successor. We've already looked and there's a
1100  // better fallthrough edge for all the successors.
1101  LLVM_DEBUG(dbgs() << "Trellis, but not one of the chosen edges.\n");
1102  return Result;
1103  }
1104 
1105  // Did we pick the triangle edge? If tail-duplication is profitable, do
1106  // that instead. Otherwise merge the triangle edge now while we know it is
1107  // optimal.
1108  if (BestA.Dest == BestB.Src) {
1109  // The edges are BB->Succ1->Succ2, and we're looking to see if BB->Succ2
1110  // would be better.
1111  MachineBasicBlock *Succ1 = BestA.Dest;
1112  MachineBasicBlock *Succ2 = BestB.Dest;
1113  // Check to see if tail-duplication would be profitable.
1114  if (allowTailDupPlacement() && shouldTailDuplicate(Succ2) &&
1115  canTailDuplicateUnplacedPreds(BB, Succ2, Chain, BlockFilter) &&
1116  isProfitableToTailDup(BB, Succ2, MBPI->getEdgeProbability(BB, Succ1),
1117  Chain, BlockFilter)) {
1119  MBPI->getEdgeProbability(BB, Succ2), AdjustedSumProb);
1120  dbgs() << " Selected: " << getBlockName(Succ2)
1121  << ", probability: " << Succ2Prob
1122  << " (Tail Duplicate)\n");
1123  Result.BB = Succ2;
1124  Result.ShouldTailDup = true;
1125  return Result;
1126  }
1127  }
1128  // We have already computed the optimal edge for the other side of the
1129  // trellis.
1130  ComputedEdges[BestB.Src] = { BestB.Dest, false };
1131 
1132  auto TrellisSucc = BestA.Dest;
1134  MBPI->getEdgeProbability(BB, TrellisSucc), AdjustedSumProb);
1135  dbgs() << " Selected: " << getBlockName(TrellisSucc)
1136  << ", probability: " << SuccProb << " (Trellis)\n");
1137  Result.BB = TrellisSucc;
1138  return Result;
1139 }
1140 
1141 /// When the option allowTailDupPlacement() is on, this method checks if the
1142 /// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated
1143 /// into all of its unplaced, unfiltered predecessors, that are not BB.
1144 bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
1145  const MachineBasicBlock *BB, MachineBasicBlock *Succ,
1146  const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
1147  if (!shouldTailDuplicate(Succ))
1148  return false;
1149 
1150  // The result of canTailDuplicate.
1151  bool Duplicate = true;
1152  // Number of possible duplication.
1153  unsigned int NumDup = 0;
1154 
1155  // For CFG checking.
1156  SmallPtrSet<const MachineBasicBlock *, 4> Successors(BB->succ_begin(),
1157  BB->succ_end());
1158  for (MachineBasicBlock *Pred : Succ->predecessors()) {
1159  // Make sure all unplaced and unfiltered predecessors can be
1160  // tail-duplicated into.
1161  // Skip any blocks that are already placed or not in this loop.
1162  if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred))
1163  || BlockToChain[Pred] == &Chain)
1164  continue;
1165  if (!TailDup.canTailDuplicate(Succ, Pred)) {
1166  if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors))
1167  // This will result in a trellis after tail duplication, so we don't
1168  // need to copy Succ into this predecessor. In the presence
1169  // of a trellis tail duplication can continue to be profitable.
1170  // For example:
1171  // A A
1172  // |\ |\
1173  // | \ | \
1174  // | C | C+BB
1175  // | / | |
1176  // |/ | |
1177  // BB => BB |
1178  // |\ |\/|
1179  // | \ |/\|
1180  // | D | D
1181  // | / | /
1182  // |/ |/
1183  // Succ Succ
1184  //
1185  // After BB was duplicated into C, the layout looks like the one on the
1186  // right. BB and C now have the same successors. When considering
1187  // whether Succ can be duplicated into all its unplaced predecessors, we
1188  // ignore C.
1189  // We can do this because C already has a profitable fallthrough, namely
1190  // D. TODO(iteratee): ignore sufficiently cold predecessors for
1191  // duplication and for this test.
1192  //
1193  // This allows trellises to be laid out in 2 separate chains
1194  // (A,B,Succ,...) and later (C,D,...) This is a reasonable heuristic
1195  // because it allows the creation of 2 fallthrough paths with links
1196  // between them, and we correctly identify the best layout for these
1197  // CFGs. We want to extend trellises that the user created in addition
1198  // to trellises created by tail-duplication, so we just look for the
1199  // CFG.
1200  continue;
1201  Duplicate = false;
1202  continue;
1203  }
1204  NumDup++;
1205  }
1206 
1207  // No possible duplication in current filter set.
1208  if (NumDup == 0)
1209  return false;
1210 
1211  // If profile information is available, findDuplicateCandidates can do more
1212  // precise benefit analysis.
1213  if (F->getFunction().hasProfileData())
1214  return true;
1215 
1216  // This is mainly for function exit BB.
1217  // The integrated tail duplication is really designed for increasing
1218  // fallthrough from predecessors from Succ to its successors. We may need
1219  // other machanism to handle different cases.
1220  if (Succ->succ_empty())
1221  return true;
1222 
1223  // Plus the already placed predecessor.
1224  NumDup++;
1225 
1226  // If the duplication candidate has more unplaced predecessors than
1227  // successors, the extra duplication can't bring more fallthrough.
1228  //
1229  // Pred1 Pred2 Pred3
1230  // \ | /
1231  // \ | /
1232  // \ | /
1233  // Dup
1234  // / \
1235  // / \
1236  // Succ1 Succ2
1237  //
1238  // In this example Dup has 2 successors and 3 predecessors, duplication of Dup
1239  // can increase the fallthrough from Pred1 to Succ1 and from Pred2 to Succ2,
1240  // but the duplication into Pred3 can't increase fallthrough.
1241  //
1242  // A small number of extra duplication may not hurt too much. We need a better
1243  // heuristic to handle it.
1244  if ((NumDup > Succ->succ_size()) || !Duplicate)
1245  return false;
1246 
1247  return true;
1248 }
1249 
1250 /// Find chains of triangles where we believe it would be profitable to
1251 /// tail-duplicate them all, but a local analysis would not find them.
1252 /// There are 3 ways this can be profitable:
1253 /// 1) The post-dominators marked 50% are actually taken 55% (This shrinks with
1254 /// longer chains)
1255 /// 2) The chains are statically correlated. Branch probabilities have a very
1256 /// U-shaped distribution.
1257 /// [http://nrs.harvard.edu/urn-3:HUL.InstRepos:24015805]
1258 /// If the branches in a chain are likely to be from the same side of the
1259 /// distribution as their predecessor, but are independent at runtime, this
1260 /// transformation is profitable. (Because the cost of being wrong is a small
1261 /// fixed cost, unlike the standard triangle layout where the cost of being
1262 /// wrong scales with the # of triangles.)
1263 /// 3) The chains are dynamically correlated. If the probability that a previous
1264 /// branch was taken positively influences whether the next branch will be
1265 /// taken
1266 /// We believe that 2 and 3 are common enough to justify the small margin in 1.
1267 void MachineBlockPlacement::precomputeTriangleChains() {
1268  struct TriangleChain {
1269  std::vector<MachineBasicBlock *> Edges;
1270 
1271  TriangleChain(MachineBasicBlock *src, MachineBasicBlock *dst)
1272  : Edges({src, dst}) {}
1273 
1274  void append(MachineBasicBlock *dst) {
1275  assert(getKey()->isSuccessor(dst) &&
1276  "Attempting to append a block that is not a successor.");
1277  Edges.push_back(dst);
1278  }
1279 
1280  unsigned count() const { return Edges.size() - 1; }
1281 
1282  MachineBasicBlock *getKey() const {
1283  return Edges.back();
1284  }
1285  };
1286 
1287  if (TriangleChainCount == 0)
1288  return;
1289 
1290  LLVM_DEBUG(dbgs() << "Pre-computing triangle chains.\n");
1291  // Map from last block to the chain that contains it. This allows us to extend
1292  // chains as we find new triangles.
1294  for (MachineBasicBlock &BB : *F) {
1295  // If BB doesn't have 2 successors, it doesn't start a triangle.
1296  if (BB.succ_size() != 2)
1297  continue;
1298  MachineBasicBlock *PDom = nullptr;
1299  for (MachineBasicBlock *Succ : BB.successors()) {
1300  if (!MPDT->dominates(Succ, &BB))
1301  continue;
1302  PDom = Succ;
1303  break;
1304  }
1305  // If BB doesn't have a post-dominating successor, it doesn't form a
1306  // triangle.
1307  if (PDom == nullptr)
1308  continue;
1309  // If PDom has a hint that it is low probability, skip this triangle.
1310  if (MBPI->getEdgeProbability(&BB, PDom) < BranchProbability(50, 100))
1311  continue;
1312  // If PDom isn't eligible for duplication, this isn't the kind of triangle
1313  // we're looking for.
1314  if (!shouldTailDuplicate(PDom))
1315  continue;
1316  bool CanTailDuplicate = true;
1317  // If PDom can't tail-duplicate into it's non-BB predecessors, then this
1318  // isn't the kind of triangle we're looking for.
1319  for (MachineBasicBlock* Pred : PDom->predecessors()) {
1320  if (Pred == &BB)
1321  continue;
1322  if (!TailDup.canTailDuplicate(PDom, Pred)) {
1323  CanTailDuplicate = false;
1324  break;
1325  }
1326  }
1327  // If we can't tail-duplicate PDom to its predecessors, then skip this
1328  // triangle.
1329  if (!CanTailDuplicate)
1330  continue;
1331 
1332  // Now we have an interesting triangle. Insert it if it's not part of an
1333  // existing chain.
1334  // Note: This cannot be replaced with a call insert() or emplace() because
1335  // the find key is BB, but the insert/emplace key is PDom.
1336  auto Found = TriangleChainMap.find(&BB);
1337  // If it is, remove the chain from the map, grow it, and put it back in the
1338  // map with the end as the new key.
1339  if (Found != TriangleChainMap.end()) {
1340  TriangleChain Chain = std::move(Found->second);
1341  TriangleChainMap.erase(Found);
1342  Chain.append(PDom);
1343  TriangleChainMap.insert(std::make_pair(Chain.getKey(), std::move(Chain)));
1344  } else {
1345  auto InsertResult = TriangleChainMap.try_emplace(PDom, &BB, PDom);
1346  assert(InsertResult.second && "Block seen twice.");
1347  (void)InsertResult;
1348  }
1349  }
1350 
1351  // Iterating over a DenseMap is safe here, because the only thing in the body
1352  // of the loop is inserting into another DenseMap (ComputedEdges).
1353  // ComputedEdges is never iterated, so this doesn't lead to non-determinism.
1354  for (auto &ChainPair : TriangleChainMap) {
1355  TriangleChain &Chain = ChainPair.second;
1356  // Benchmarking has shown that due to branch correlation duplicating 2 or
1357  // more triangles is profitable, despite the calculations assuming
1358  // independence.
1359  if (Chain.count() < TriangleChainCount)
1360  continue;
1361  MachineBasicBlock *dst = Chain.Edges.back();
1362  Chain.Edges.pop_back();
1363  for (MachineBasicBlock *src : reverse(Chain.Edges)) {
1364  LLVM_DEBUG(dbgs() << "Marking edge: " << getBlockName(src) << "->"
1365  << getBlockName(dst)
1366  << " as pre-computed based on triangles.\n");
1367 
1368  auto InsertResult = ComputedEdges.insert({src, {dst, true}});
1369  assert(InsertResult.second && "Block seen twice.");
1370  (void)InsertResult;
1371 
1372  dst = src;
1373  }
1374  }
1375 }
1376 
1377 // When profile is not present, return the StaticLikelyProb.
1378 // When profile is available, we need to handle the triangle-shape CFG.
1380  const MachineBasicBlock *BB) {
1381  if (!BB->getParent()->getFunction().hasProfileData())
1382  return BranchProbability(StaticLikelyProb, 100);
1383  if (BB->succ_size() == 2) {
1384  const MachineBasicBlock *Succ1 = *BB->succ_begin();
1385  const MachineBasicBlock *Succ2 = *(BB->succ_begin() + 1);
1386  if (Succ1->isSuccessor(Succ2) || Succ2->isSuccessor(Succ1)) {
1387  /* See case 1 below for the cost analysis. For BB->Succ to
1388  * be taken with smaller cost, the following needs to hold:
1389  * Prob(BB->Succ) > 2 * Prob(BB->Pred)
1390  * So the threshold T in the calculation below
1391  * (1-T) * Prob(BB->Succ) > T * Prob(BB->Pred)
1392  * So T / (1 - T) = 2, Yielding T = 2/3
1393  * Also adding user specified branch bias, we have
1394  * T = (2/3)*(ProfileLikelyProb/50)
1395  * = (2*ProfileLikelyProb)/150)
1396  */
1397  return BranchProbability(2 * ProfileLikelyProb, 150);
1398  }
1399  }
1400  return BranchProbability(ProfileLikelyProb, 100);
1401 }
1402 
1403 /// Checks to see if the layout candidate block \p Succ has a better layout
1404 /// predecessor than \c BB. If yes, returns true.
1405 /// \p SuccProb: The probability adjusted for only remaining blocks.
1406 /// Only used for logging
1407 /// \p RealSuccProb: The un-adjusted probability.
1408 /// \p Chain: The chain that BB belongs to and Succ is being considered for.
1409 /// \p BlockFilter: if non-null, the set of blocks that make up the loop being
1410 /// considered
1411 bool MachineBlockPlacement::hasBetterLayoutPredecessor(
1412  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
1413  const BlockChain &SuccChain, BranchProbability SuccProb,
1414  BranchProbability RealSuccProb, const BlockChain &Chain,
1415  const BlockFilterSet *BlockFilter) {
1416 
1417  // There isn't a better layout when there are no unscheduled predecessors.
1418  if (SuccChain.UnscheduledPredecessors == 0)
1419  return false;
1420 
1421  // There are two basic scenarios here:
1422  // -------------------------------------
1423  // Case 1: triangular shape CFG (if-then):
1424  // BB
1425  // | \
1426  // | \
1427  // | Pred
1428  // | /
1429  // Succ
1430  // In this case, we are evaluating whether to select edge -> Succ, e.g.
1431  // set Succ as the layout successor of BB. Picking Succ as BB's
1432  // successor breaks the CFG constraints (FIXME: define these constraints).
1433  // With this layout, Pred BB
1434  // is forced to be outlined, so the overall cost will be cost of the
1435  // branch taken from BB to Pred, plus the cost of back taken branch
1436  // from Pred to Succ, as well as the additional cost associated
1437  // with the needed unconditional jump instruction from Pred To Succ.
1438 
1439  // The cost of the topological order layout is the taken branch cost
1440  // from BB to Succ, so to make BB->Succ a viable candidate, the following
1441  // must hold:
1442  // 2 * freq(BB->Pred) * taken_branch_cost + unconditional_jump_cost
1443  // < freq(BB->Succ) * taken_branch_cost.
1444  // Ignoring unconditional jump cost, we get
1445  // freq(BB->Succ) > 2 * freq(BB->Pred), i.e.,
1446  // prob(BB->Succ) > 2 * prob(BB->Pred)
1447  //
1448  // When real profile data is available, we can precisely compute the
1449  // probability threshold that is needed for edge BB->Succ to be considered.
1450  // Without profile data, the heuristic requires the branch bias to be
1451  // a lot larger to make sure the signal is very strong (e.g. 80% default).
1452  // -----------------------------------------------------------------
1453  // Case 2: diamond like CFG (if-then-else):
1454  // S
1455  // / \
1456  // | \
1457  // BB Pred
1458  // \ /
1459  // Succ
1460  // ..
1461  //
1462  // The current block is BB and edge BB->Succ is now being evaluated.
1463  // Note that edge S->BB was previously already selected because
1464  // prob(S->BB) > prob(S->Pred).
1465  // At this point, 2 blocks can be placed after BB: Pred or Succ. If we
1466  // choose Pred, we will have a topological ordering as shown on the left
1467  // in the picture below. If we choose Succ, we have the solution as shown
1468  // on the right:
1469  //
1470  // topo-order:
1471  //
1472  // S----- ---S
1473  // | | | |
1474  // ---BB | | BB
1475  // | | | |
1476  // | Pred-- | Succ--
1477  // | | | |
1478  // ---Succ ---Pred--
1479  //
1480  // cost = freq(S->Pred) + freq(BB->Succ) cost = 2 * freq (S->Pred)
1481  // = freq(S->Pred) + freq(S->BB)
1482  //
1483  // If we have profile data (i.e, branch probabilities can be trusted), the
1484  // cost (number of taken branches) with layout S->BB->Succ->Pred is 2 *
1485  // freq(S->Pred) while the cost of topo order is freq(S->Pred) + freq(S->BB).
1486  // We know Prob(S->BB) > Prob(S->Pred), so freq(S->BB) > freq(S->Pred), which
1487  // means the cost of topological order is greater.
1488  // When profile data is not available, however, we need to be more
1489  // conservative. If the branch prediction is wrong, breaking the topo-order
1490  // will actually yield a layout with large cost. For this reason, we need
1491  // strong biased branch at block S with Prob(S->BB) in order to select
1492  // BB->Succ. This is equivalent to looking the CFG backward with backward
1493  // edge: Prob(Succ->BB) needs to >= HotProb in order to be selected (without
1494  // profile data).
1495  // --------------------------------------------------------------------------
1496  // Case 3: forked diamond
1497  // S
1498  // / \
1499  // / \
1500  // BB Pred
1501  // | \ / |
1502  // | \ / |
1503  // | X |
1504  // | / \ |
1505  // | / \ |
1506  // S1 S2
1507  //
1508  // The current block is BB and edge BB->S1 is now being evaluated.
1509  // As above S->BB was already selected because
1510  // prob(S->BB) > prob(S->Pred). Assume that prob(BB->S1) >= prob(BB->S2).
1511  //
1512  // topo-order:
1513  //
1514  // S-------| ---S
1515  // | | | |
1516  // ---BB | | BB
1517  // | | | |
1518  // | Pred----| | S1----
1519  // | | | |
1520  // --(S1 or S2) ---Pred--
1521  // |
1522  // S2
1523  //
1524  // topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2)
1525  // + min(freq(Pred->S1), freq(Pred->S2))
1526  // Non-topo-order cost:
1527  // non-topo-cost = 2 * freq(S->Pred) + freq(BB->S2).
1528  // To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2))
1529  // is 0. Then the non topo layout is better when
1530  // freq(S->Pred) < freq(BB->S1).
1531  // This is exactly what is checked below.
1532  // Note there are other shapes that apply (Pred may not be a single block,
1533  // but they all fit this general pattern.)
1535 
1536  // Make sure that a hot successor doesn't have a globally more
1537  // important predecessor.
1538  BlockFrequency CandidateEdgeFreq = MBFI->getBlockFreq(BB) * RealSuccProb;
1539  bool BadCFGConflict = false;
1540 
1541  for (MachineBasicBlock *Pred : Succ->predecessors()) {
1542  BlockChain *PredChain = BlockToChain[Pred];
1543  if (Pred == Succ || PredChain == &SuccChain ||
1544  (BlockFilter && !BlockFilter->count(Pred)) ||
1545  PredChain == &Chain || Pred != *std::prev(PredChain->end()) ||
1546  // This check is redundant except for look ahead. This function is
1547  // called for lookahead by isProfitableToTailDup when BB hasn't been
1548  // placed yet.
1549  (Pred == BB))
1550  continue;
1551  // Do backward checking.
1552  // For all cases above, we need a backward checking to filter out edges that
1553  // are not 'strongly' biased.
1554  // BB Pred
1555  // \ /
1556  // Succ
1557  // We select edge BB->Succ if
1558  // freq(BB->Succ) > freq(Succ) * HotProb
1559  // i.e. freq(BB->Succ) > freq(BB->Succ) * HotProb + freq(Pred->Succ) *
1560  // HotProb
1561  // i.e. freq((BB->Succ) * (1 - HotProb) > freq(Pred->Succ) * HotProb
1562  // Case 1 is covered too, because the first equation reduces to:
1563  // prob(BB->Succ) > HotProb. (freq(Succ) = freq(BB) for a triangle)
1564  BlockFrequency PredEdgeFreq =
1565  MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Succ);
1566  if (PredEdgeFreq * HotProb >= CandidateEdgeFreq * HotProb.getCompl()) {
1567  BadCFGConflict = true;
1568  break;
1569  }
1570  }
1571 
1572  if (BadCFGConflict) {
1573  LLVM_DEBUG(dbgs() << " Not a candidate: " << getBlockName(Succ) << " -> "
1574  << SuccProb << " (prob) (non-cold CFG conflict)\n");
1575  return true;
1576  }
1577 
1578  return false;
1579 }
1580 
1581 /// Select the best successor for a block.
1582 ///
1583 /// This looks across all successors of a particular block and attempts to
1584 /// select the "best" one to be the layout successor. It only considers direct
1585 /// successors which also pass the block filter. It will attempt to avoid
1586 /// breaking CFG structure, but cave and break such structures in the case of
1587 /// very hot successor edges.
1588 ///
1589 /// \returns The best successor block found, or null if none are viable, along
1590 /// with a boolean indicating if tail duplication is necessary.
1591 MachineBlockPlacement::BlockAndTailDupResult
1592 MachineBlockPlacement::selectBestSuccessor(
1593  const MachineBasicBlock *BB, const BlockChain &Chain,
1594  const BlockFilterSet *BlockFilter) {
1595  const BranchProbability HotProb(StaticLikelyProb, 100);
1596 
1597  BlockAndTailDupResult BestSucc = { nullptr, false };
1598  auto BestProb = BranchProbability::getZero();
1599 
1601  auto AdjustedSumProb =
1602  collectViableSuccessors(BB, Chain, BlockFilter, Successors);
1603 
1604  LLVM_DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB)
1605  << "\n");
1606 
1607  // if we already precomputed the best successor for BB, return that if still
1608  // applicable.
1609  auto FoundEdge = ComputedEdges.find(BB);
1610  if (FoundEdge != ComputedEdges.end()) {
1611  MachineBasicBlock *Succ = FoundEdge->second.BB;
1612  ComputedEdges.erase(FoundEdge);
1613  BlockChain *SuccChain = BlockToChain[Succ];
1614  if (BB->isSuccessor(Succ) && (!BlockFilter || BlockFilter->count(Succ)) &&
1615  SuccChain != &Chain && Succ == *SuccChain->begin())
1616  return FoundEdge->second;
1617  }
1618 
1619  // if BB is part of a trellis, Use the trellis to determine the optimal
1620  // fallthrough edges
1621  if (isTrellis(BB, Successors, Chain, BlockFilter))
1622  return getBestTrellisSuccessor(BB, Successors, AdjustedSumProb, Chain,
1623  BlockFilter);
1624 
1625  // For blocks with CFG violations, we may be able to lay them out anyway with
1626  // tail-duplication. We keep this vector so we can perform the probability
1627  // calculations the minimum number of times.
1629  DupCandidates;
1630  for (MachineBasicBlock *Succ : Successors) {
1631  auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);
1632  BranchProbability SuccProb =
1633  getAdjustedProbability(RealSuccProb, AdjustedSumProb);
1634 
1635  BlockChain &SuccChain = *BlockToChain[Succ];
1636  // Skip the edge \c BB->Succ if block \c Succ has a better layout
1637  // predecessor that yields lower global cost.
1638  if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb,
1639  Chain, BlockFilter)) {
1640  // If tail duplication would make Succ profitable, place it.
1641  if (allowTailDupPlacement() && shouldTailDuplicate(Succ))
1642  DupCandidates.emplace_back(SuccProb, Succ);
1643  continue;
1644  }
1645 
1646  LLVM_DEBUG(
1647  dbgs() << " Candidate: " << getBlockName(Succ)
1648  << ", probability: " << SuccProb
1649  << (SuccChain.UnscheduledPredecessors != 0 ? " (CFG break)" : "")
1650  << "\n");
1651 
1652  if (BestSucc.BB && BestProb >= SuccProb) {
1653  LLVM_DEBUG(dbgs() << " Not the best candidate, continuing\n");
1654  continue;
1655  }
1656 
1657  LLVM_DEBUG(dbgs() << " Setting it as best candidate\n");
1658  BestSucc.BB = Succ;
1659  BestProb = SuccProb;
1660  }
1661  // Handle the tail duplication candidates in order of decreasing probability.
1662  // Stop at the first one that is profitable. Also stop if they are less
1663  // profitable than BestSucc. Position is important because we preserve it and
1664  // prefer first best match. Here we aren't comparing in order, so we capture
1665  // the position instead.
1666  llvm::stable_sort(DupCandidates,
1667  [](std::tuple<BranchProbability, MachineBasicBlock *> L,
1668  std::tuple<BranchProbability, MachineBasicBlock *> R) {
1669  return std::get<0>(L) > std::get<0>(R);
1670  });
1671  for (auto &Tup : DupCandidates) {
1672  BranchProbability DupProb;
1673  MachineBasicBlock *Succ;
1674  std::tie(DupProb, Succ) = Tup;
1675  if (DupProb < BestProb)
1676  break;
1677  if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter)
1678  && (isProfitableToTailDup(BB, Succ, BestProb, Chain, BlockFilter))) {
1679  LLVM_DEBUG(dbgs() << " Candidate: " << getBlockName(Succ)
1680  << ", probability: " << DupProb
1681  << " (Tail Duplicate)\n");
1682  BestSucc.BB = Succ;
1683  BestSucc.ShouldTailDup = true;
1684  break;
1685  }
1686  }
1687 
1688  if (BestSucc.BB)
1689  LLVM_DEBUG(dbgs() << " Selected: " << getBlockName(BestSucc.BB) << "\n");
1690 
1691  return BestSucc;
1692 }
1693 
1694 /// Select the best block from a worklist.
1695 ///
1696 /// This looks through the provided worklist as a list of candidate basic
1697 /// blocks and select the most profitable one to place. The definition of
1698 /// profitable only really makes sense in the context of a loop. This returns
1699 /// the most frequently visited block in the worklist, which in the case of
1700 /// a loop, is the one most desirable to be physically close to the rest of the
1701 /// loop body in order to improve i-cache behavior.
1702 ///
1703 /// \returns The best block found, or null if none are viable.
1704 MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
1705  const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) {
1706  // Once we need to walk the worklist looking for a candidate, cleanup the
1707  // worklist of already placed entries.
1708  // FIXME: If this shows up on profiles, it could be folded (at the cost of
1709  // some code complexity) into the loop below.
1710  llvm::erase_if(WorkList, [&](MachineBasicBlock *BB) {
1711  return BlockToChain.lookup(BB) == &Chain;
1712  });
1713 
1714  if (WorkList.empty())
1715  return nullptr;
1716 
1717  bool IsEHPad = WorkList[0]->isEHPad();
1718 
1719  MachineBasicBlock *BestBlock = nullptr;
1720  BlockFrequency BestFreq;
1721  for (MachineBasicBlock *MBB : WorkList) {
1722  assert(MBB->isEHPad() == IsEHPad &&
1723  "EHPad mismatch between block and work list.");
1724 
1725  BlockChain &SuccChain = *BlockToChain[MBB];
1726  if (&SuccChain == &Chain)
1727  continue;
1728 
1729  assert(SuccChain.UnscheduledPredecessors == 0 &&
1730  "Found CFG-violating block");
1731 
1732  BlockFrequency CandidateFreq = MBFI->getBlockFreq(MBB);
1733  LLVM_DEBUG(dbgs() << " " << getBlockName(MBB) << " -> ";
1734  MBFI->printBlockFreq(dbgs(), CandidateFreq) << " (freq)\n");
1735 
1736  // For ehpad, we layout the least probable first as to avoid jumping back
1737  // from least probable landingpads to more probable ones.
1738  //
1739  // FIXME: Using probability is probably (!) not the best way to achieve
1740  // this. We should probably have a more principled approach to layout
1741  // cleanup code.
1742  //
1743  // The goal is to get:
1744  //
1745  // +--------------------------+
1746  // | V
1747  // InnerLp -> InnerCleanup OuterLp -> OuterCleanup -> Resume
1748  //
1749  // Rather than:
1750  //
1751  // +-------------------------------------+
1752  // V |
1753  // OuterLp -> OuterCleanup -> Resume InnerLp -> InnerCleanup
1754  if (BestBlock && (IsEHPad ^ (BestFreq >= CandidateFreq)))
1755  continue;
1756 
1757  BestBlock = MBB;
1758  BestFreq = CandidateFreq;
1759  }
1760 
1761  return BestBlock;
1762 }
1763 
1764 /// Retrieve the first unplaced basic block.
1765 ///
1766 /// This routine is called when we are unable to use the CFG to walk through
1767 /// all of the basic blocks and form a chain due to unnatural loops in the CFG.
1768 /// We walk through the function's blocks in order, starting from the
1769 /// LastUnplacedBlockIt. We update this iterator on each call to avoid
1770 /// re-scanning the entire sequence on repeated calls to this routine.
1771 MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
1772  const BlockChain &PlacedChain,
1773  MachineFunction::iterator &PrevUnplacedBlockIt,
1774  const BlockFilterSet *BlockFilter) {
1775  for (MachineFunction::iterator I = PrevUnplacedBlockIt, E = F->end(); I != E;
1776  ++I) {
1777  if (BlockFilter && !BlockFilter->count(&*I))
1778  continue;
1779  if (BlockToChain[&*I] != &PlacedChain) {
1780  PrevUnplacedBlockIt = I;
1781  // Now select the head of the chain to which the unplaced block belongs
1782  // as the block to place. This will force the entire chain to be placed,
1783  // and satisfies the requirements of merging chains.
1784  return *BlockToChain[&*I]->begin();
1785  }
1786  }
1787  return nullptr;
1788 }
1789 
1790 void MachineBlockPlacement::fillWorkLists(
1791  const MachineBasicBlock *MBB,
1792  SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
1793  const BlockFilterSet *BlockFilter = nullptr) {
1794  BlockChain &Chain = *BlockToChain[MBB];
1795  if (!UpdatedPreds.insert(&Chain).second)
1796  return;
1797 
1798  assert(
1799  Chain.UnscheduledPredecessors == 0 &&
1800  "Attempting to place block with unscheduled predecessors in worklist.");
1801  for (MachineBasicBlock *ChainBB : Chain) {
1802  assert(BlockToChain[ChainBB] == &Chain &&
1803  "Block in chain doesn't match BlockToChain map.");
1804  for (MachineBasicBlock *Pred : ChainBB->predecessors()) {
1805  if (BlockFilter && !BlockFilter->count(Pred))
1806  continue;
1807  if (BlockToChain[Pred] == &Chain)
1808  continue;
1809  ++Chain.UnscheduledPredecessors;
1810  }
1811  }
1812 
1813  if (Chain.UnscheduledPredecessors != 0)
1814  return;
1815 
1816  MachineBasicBlock *BB = *Chain.begin();
1817  if (BB->isEHPad())
1818  EHPadWorkList.push_back(BB);
1819  else
1820  BlockWorkList.push_back(BB);
1821 }
1822 
1823 void MachineBlockPlacement::buildChain(
1824  const MachineBasicBlock *HeadBB, BlockChain &Chain,
1825  BlockFilterSet *BlockFilter) {
1826  assert(HeadBB && "BB must not be null.\n");
1827  assert(BlockToChain[HeadBB] == &Chain && "BlockToChainMap mis-match.\n");
1828  MachineFunction::iterator PrevUnplacedBlockIt = F->begin();
1829 
1830  const MachineBasicBlock *LoopHeaderBB = HeadBB;
1831  markChainSuccessors(Chain, LoopHeaderBB, BlockFilter);
1832  MachineBasicBlock *BB = *std::prev(Chain.end());
1833  while (true) {
1834  assert(BB && "null block found at end of chain in loop.");
1835  assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match in loop.");
1836  assert(*std::prev(Chain.end()) == BB && "BB Not found at end of chain.");
1837 
1838 
1839  // Look for the best viable successor if there is one to place immediately
1840  // after this block.
1841  auto Result = selectBestSuccessor(BB, Chain, BlockFilter);
1842  MachineBasicBlock* BestSucc = Result.BB;
1843  bool ShouldTailDup = Result.ShouldTailDup;
1844  if (allowTailDupPlacement())
1845  ShouldTailDup |= (BestSucc && canTailDuplicateUnplacedPreds(BB, BestSucc,
1846  Chain,
1847  BlockFilter));
1848 
1849  // If an immediate successor isn't available, look for the best viable
1850  // block among those we've identified as not violating the loop's CFG at
1851  // this point. This won't be a fallthrough, but it will increase locality.
1852  if (!BestSucc)
1853  BestSucc = selectBestCandidateBlock(Chain, BlockWorkList);
1854  if (!BestSucc)
1855  BestSucc = selectBestCandidateBlock(Chain, EHPadWorkList);
1856 
1857  if (!BestSucc) {
1858  BestSucc = getFirstUnplacedBlock(Chain, PrevUnplacedBlockIt, BlockFilter);
1859  if (!BestSucc)
1860  break;
1861 
1862  LLVM_DEBUG(dbgs() << "Unnatural loop CFG detected, forcibly merging the "
1863  "layout successor until the CFG reduces\n");
1864  }
1865 
1866  // Placement may have changed tail duplication opportunities.
1867  // Check for that now.
1868  if (allowTailDupPlacement() && BestSucc && ShouldTailDup) {
1869  repeatedlyTailDuplicateBlock(BestSucc, BB, LoopHeaderBB, Chain,
1870  BlockFilter, PrevUnplacedBlockIt);
1871  // If the chosen successor was duplicated into BB, don't bother laying
1872  // it out, just go round the loop again with BB as the chain end.
1873  if (!BB->isSuccessor(BestSucc))
1874  continue;
1875  }
1876 
1877  // Place this block, updating the datastructures to reflect its placement.
1878  BlockChain &SuccChain = *BlockToChain[BestSucc];
1879  // Zero out UnscheduledPredecessors for the successor we're about to merge in case
1880  // we selected a successor that didn't fit naturally into the CFG.
1881  SuccChain.UnscheduledPredecessors = 0;
1882  LLVM_DEBUG(dbgs() << "Merging from " << getBlockName(BB) << " to "
1883  << getBlockName(BestSucc) << "\n");
1884  markChainSuccessors(SuccChain, LoopHeaderBB, BlockFilter);
1885  Chain.merge(BestSucc, &SuccChain);
1886  BB = *std::prev(Chain.end());
1887  }
1888 
1889  LLVM_DEBUG(dbgs() << "Finished forming chain for header block "
1890  << getBlockName(*Chain.begin()) << "\n");
1891 }
1892 
1893 // If bottom of block BB has only one successor OldTop, in most cases it is
1894 // profitable to move it before OldTop, except the following case:
1895 //
1896 // -->OldTop<-
1897 // | . |
1898 // | . |
1899 // | . |
1900 // ---Pred |
1901 // | |
1902 // BB-----
1903 //
1904 // If BB is moved before OldTop, Pred needs a taken branch to BB, and it can't
1905 // layout the other successor below it, so it can't reduce taken branch.
1906 // In this case we keep its original layout.
1907 bool
1908 MachineBlockPlacement::canMoveBottomBlockToTop(
1909  const MachineBasicBlock *BottomBlock,
1910  const MachineBasicBlock *OldTop) {
1911  if (BottomBlock->pred_size() != 1)
1912  return true;
1913  MachineBasicBlock *Pred = *BottomBlock->pred_begin();
1914  if (Pred->succ_size() != 2)
1915  return true;
1916 
1917  MachineBasicBlock *OtherBB = *Pred->succ_begin();
1918  if (OtherBB == BottomBlock)
1919  OtherBB = *Pred->succ_rbegin();
1920  if (OtherBB == OldTop)
1921  return false;
1922 
1923  return true;
1924 }
1925 
1926 // Find out the possible fall through frequence to the top of a loop.
1928 MachineBlockPlacement::TopFallThroughFreq(
1929  const MachineBasicBlock *Top,
1930  const BlockFilterSet &LoopBlockSet) {
1931  BlockFrequency MaxFreq = 0;
1932  for (MachineBasicBlock *Pred : Top->predecessors()) {
1933  BlockChain *PredChain = BlockToChain[Pred];
1934  if (!LoopBlockSet.count(Pred) &&
1935  (!PredChain || Pred == *std::prev(PredChain->end()))) {
1936  // Found a Pred block can be placed before Top.
1937  // Check if Top is the best successor of Pred.
1938  auto TopProb = MBPI->getEdgeProbability(Pred, Top);
1939  bool TopOK = true;
1940  for (MachineBasicBlock *Succ : Pred->successors()) {
1941  auto SuccProb = MBPI->getEdgeProbability(Pred, Succ);
1942  BlockChain *SuccChain = BlockToChain[Succ];
1943  // Check if Succ can be placed after Pred.
1944  // Succ should not be in any chain, or it is the head of some chain.
1945  if (!LoopBlockSet.count(Succ) && (SuccProb > TopProb) &&
1946  (!SuccChain || Succ == *SuccChain->begin())) {
1947  TopOK = false;
1948  break;
1949  }
1950  }
1951  if (TopOK) {
1952  BlockFrequency EdgeFreq = MBFI->getBlockFreq(Pred) *
1953  MBPI->getEdgeProbability(Pred, Top);
1954  if (EdgeFreq > MaxFreq)
1955  MaxFreq = EdgeFreq;
1956  }
1957  }
1958  }
1959  return MaxFreq;
1960 }
1961 
1962 // Compute the fall through gains when move NewTop before OldTop.
1963 //
1964 // In following diagram, edges marked as "-" are reduced fallthrough, edges
1965 // marked as "+" are increased fallthrough, this function computes
1966 //
1967 // SUM(increased fallthrough) - SUM(decreased fallthrough)
1968 //
1969 // |
1970 // | -
1971 // V
1972 // --->OldTop
1973 // | .
1974 // | .
1975 // +| . +
1976 // | Pred --->
1977 // | |-
1978 // | V
1979 // --- NewTop <---
1980 // |-
1981 // V
1982 //
1984 MachineBlockPlacement::FallThroughGains(
1985  const MachineBasicBlock *NewTop,
1986  const MachineBasicBlock *OldTop,
1987  const MachineBasicBlock *ExitBB,
1988  const BlockFilterSet &LoopBlockSet) {
1989  BlockFrequency FallThrough2Top = TopFallThroughFreq(OldTop, LoopBlockSet);
1990  BlockFrequency FallThrough2Exit = 0;
1991  if (ExitBB)
1992  FallThrough2Exit = MBFI->getBlockFreq(NewTop) *
1993  MBPI->getEdgeProbability(NewTop, ExitBB);
1994  BlockFrequency BackEdgeFreq = MBFI->getBlockFreq(NewTop) *
1995  MBPI->getEdgeProbability(NewTop, OldTop);
1996 
1997  // Find the best Pred of NewTop.
1998  MachineBasicBlock *BestPred = nullptr;
1999  BlockFrequency FallThroughFromPred = 0;
2000  for (MachineBasicBlock *Pred : NewTop->predecessors()) {
2001  if (!LoopBlockSet.count(Pred))
2002  continue;
2003  BlockChain *PredChain = BlockToChain[Pred];
2004  if (!PredChain || Pred == *std::prev(PredChain->end())) {
2005  BlockFrequency EdgeFreq = MBFI->getBlockFreq(Pred) *
2006  MBPI->getEdgeProbability(Pred, NewTop);
2007  if (EdgeFreq > FallThroughFromPred) {
2008  FallThroughFromPred = EdgeFreq;
2009  BestPred = Pred;
2010  }
2011  }
2012  }
2013 
2014  // If NewTop is not placed after Pred, another successor can be placed
2015  // after Pred.
2016  BlockFrequency NewFreq = 0;
2017  if (BestPred) {
2018  for (MachineBasicBlock *Succ : BestPred->successors()) {
2019  if ((Succ == NewTop) || (Succ == BestPred) || !LoopBlockSet.count(Succ))
2020  continue;
2021  if (ComputedEdges.find(Succ) != ComputedEdges.end())
2022  continue;
2023  BlockChain *SuccChain = BlockToChain[Succ];
2024  if ((SuccChain && (Succ != *SuccChain->begin())) ||
2025  (SuccChain == BlockToChain[BestPred]))
2026  continue;
2027  BlockFrequency EdgeFreq = MBFI->getBlockFreq(BestPred) *
2028  MBPI->getEdgeProbability(BestPred, Succ);
2029  if (EdgeFreq > NewFreq)
2030  NewFreq = EdgeFreq;
2031  }
2032  BlockFrequency OrigEdgeFreq = MBFI->getBlockFreq(BestPred) *
2033  MBPI->getEdgeProbability(BestPred, NewTop);
2034  if (NewFreq > OrigEdgeFreq) {
2035  // If NewTop is not the best successor of Pred, then Pred doesn't
2036  // fallthrough to NewTop. So there is no FallThroughFromPred and
2037  // NewFreq.
2038  NewFreq = 0;
2039  FallThroughFromPred = 0;
2040  }
2041  }
2042 
2043  BlockFrequency Result = 0;
2044  BlockFrequency Gains = BackEdgeFreq + NewFreq;
2045  BlockFrequency Lost = FallThrough2Top + FallThrough2Exit +
2046  FallThroughFromPred;
2047  if (Gains > Lost)
2048  Result = Gains - Lost;
2049  return Result;
2050 }
2051 
2052 /// Helper function of findBestLoopTop. Find the best loop top block
2053 /// from predecessors of old top.
2054 ///
2055 /// Look for a block which is strictly better than the old top for laying
2056 /// out before the old top of the loop. This looks for only two patterns:
2057 ///
2058 /// 1. a block has only one successor, the old loop top
2059 ///
2060 /// Because such a block will always result in an unconditional jump,
2061 /// rotating it in front of the old top is always profitable.
2062 ///
2063 /// 2. a block has two successors, one is old top, another is exit
2064 /// and it has more than one predecessors
2065 ///
2066 /// If it is below one of its predecessors P, only P can fall through to
2067 /// it, all other predecessors need a jump to it, and another conditional
2068 /// jump to loop header. If it is moved before loop header, all its
2069 /// predecessors jump to it, then fall through to loop header. So all its
2070 /// predecessors except P can reduce one taken branch.
2071 /// At the same time, move it before old top increases the taken branch
2072 /// to loop exit block, so the reduced taken branch will be compared with
2073 /// the increased taken branch to the loop exit block.
2075 MachineBlockPlacement::findBestLoopTopHelper(
2076  MachineBasicBlock *OldTop,
2077  const MachineLoop &L,
2078  const BlockFilterSet &LoopBlockSet) {
2079  // Check that the header hasn't been fused with a preheader block due to
2080  // crazy branches. If it has, we need to start with the header at the top to
2081  // prevent pulling the preheader into the loop body.
2082  BlockChain &HeaderChain = *BlockToChain[OldTop];
2083  if (!LoopBlockSet.count(*HeaderChain.begin()))
2084  return OldTop;
2085  if (OldTop != *HeaderChain.begin())
2086  return OldTop;
2087 
2088  LLVM_DEBUG(dbgs() << "Finding best loop top for: " << getBlockName(OldTop)
2089  << "\n");
2090 
2091  BlockFrequency BestGains = 0;
2092  MachineBasicBlock *BestPred = nullptr;
2093  for (MachineBasicBlock *Pred : OldTop->predecessors()) {
2094  if (!LoopBlockSet.count(Pred))
2095  continue;
2096  if (Pred == L.getHeader())
2097  continue;
2098  LLVM_DEBUG(dbgs() << " old top pred: " << getBlockName(Pred) << ", has "
2099  << Pred->succ_size() << " successors, ";
2100  MBFI->printBlockFreq(dbgs(), Pred) << " freq\n");
2101  if (Pred->succ_size() > 2)
2102  continue;
2103 
2104  MachineBasicBlock *OtherBB = nullptr;
2105  if (Pred->succ_size() == 2) {
2106  OtherBB = *Pred->succ_begin();
2107  if (OtherBB == OldTop)
2108  OtherBB = *Pred->succ_rbegin();
2109  }
2110 
2111  if (!canMoveBottomBlockToTop(Pred, OldTop))
2112  continue;
2113 
2114  BlockFrequency Gains = FallThroughGains(Pred, OldTop, OtherBB,
2115  LoopBlockSet);
2116  if ((Gains > 0) && (Gains > BestGains ||
2117  ((Gains == BestGains) && Pred->isLayoutSuccessor(OldTop)))) {
2118  BestPred = Pred;
2119  BestGains = Gains;
2120  }
2121  }
2122 
2123  // If no direct predecessor is fine, just use the loop header.
2124  if (!BestPred) {
2125  LLVM_DEBUG(dbgs() << " final top unchanged\n");
2126  return OldTop;
2127  }
2128 
2129  // Walk backwards through any straight line of predecessors.
2130  while (BestPred->pred_size() == 1 &&
2131  (*BestPred->pred_begin())->succ_size() == 1 &&
2132  *BestPred->pred_begin() != L.getHeader())
2133  BestPred = *BestPred->pred_begin();
2134 
2135  LLVM_DEBUG(dbgs() << " final top: " << getBlockName(BestPred) << "\n");
2136  return BestPred;
2137 }
2138 
2139 /// Find the best loop top block for layout.
2140 ///
2141 /// This function iteratively calls findBestLoopTopHelper, until no new better
2142 /// BB can be found.
2144 MachineBlockPlacement::findBestLoopTop(const MachineLoop &L,
2145  const BlockFilterSet &LoopBlockSet) {
2146  // Placing the latch block before the header may introduce an extra branch
2147  // that skips this block the first time the loop is executed, which we want
2148  // to avoid when optimising for size.
2149  // FIXME: in theory there is a case that does not introduce a new branch,
2150  // i.e. when the layout predecessor does not fallthrough to the loop header.
2151  // In practice this never happens though: there always seems to be a preheader
2152  // that can fallthrough and that is also placed before the header.
2153  bool OptForSize = F->getFunction().hasOptSize() ||
2154  llvm::shouldOptimizeForSize(L.getHeader(), PSI, MBFI.get());
2155  if (OptForSize)
2156  return L.getHeader();
2157 
2158  MachineBasicBlock *OldTop = nullptr;
2159  MachineBasicBlock *NewTop = L.getHeader();
2160  while (NewTop != OldTop) {
2161  OldTop = NewTop;
2162  NewTop = findBestLoopTopHelper(OldTop, L, LoopBlockSet);
2163  if (NewTop != OldTop)
2164  ComputedEdges[NewTop] = { OldTop, false };
2165  }
2166  return NewTop;
2167 }
2168 
2169 /// Find the best loop exiting block for layout.
2170 ///
2171 /// This routine implements the logic to analyze the loop looking for the best
2172 /// block to layout at the top of the loop. Typically this is done to maximize
2173 /// fallthrough opportunities.
2175 MachineBlockPlacement::findBestLoopExit(const MachineLoop &L,
2176  const BlockFilterSet &LoopBlockSet,
2177  BlockFrequency &ExitFreq) {
2178  // We don't want to layout the loop linearly in all cases. If the loop header
2179  // is just a normal basic block in the loop, we want to look for what block
2180  // within the loop is the best one to layout at the top. However, if the loop
2181  // header has be pre-merged into a chain due to predecessors not having
2182  // analyzable branches, *and* the predecessor it is merged with is *not* part
2183  // of the loop, rotating the header into the middle of the loop will create
2184  // a non-contiguous range of blocks which is Very Bad. So start with the
2185  // header and only rotate if safe.
2186  BlockChain &HeaderChain = *BlockToChain[L.getHeader()];
2187  if (!LoopBlockSet.count(*HeaderChain.begin()))
2188  return nullptr;
2189 
2190  BlockFrequency BestExitEdgeFreq;
2191  unsigned BestExitLoopDepth = 0;
2192  MachineBasicBlock *ExitingBB = nullptr;
2193  // If there are exits to outer loops, loop rotation can severely limit
2194  // fallthrough opportunities unless it selects such an exit. Keep a set of
2195  // blocks where rotating to exit with that block will reach an outer loop.
2196  SmallPtrSet<MachineBasicBlock *, 4> BlocksExitingToOuterLoop;
2197 
2198  LLVM_DEBUG(dbgs() << "Finding best loop exit for: "
2199  << getBlockName(L.getHeader()) << "\n");
2200  for (MachineBasicBlock *MBB : L.getBlocks()) {
2201  BlockChain &Chain = *BlockToChain[MBB];
2202  // Ensure that this block is at the end of a chain; otherwise it could be
2203  // mid-way through an inner loop or a successor of an unanalyzable branch.
2204  if (MBB != *std::prev(Chain.end()))
2205  continue;
2206 
2207  // Now walk the successors. We need to establish whether this has a viable
2208  // exiting successor and whether it has a viable non-exiting successor.
2209  // We store the old exiting state and restore it if a viable looping
2210  // successor isn't found.
2211  MachineBasicBlock *OldExitingBB = ExitingBB;
2212  BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq;
2213  bool HasLoopingSucc = false;
2214  for (MachineBasicBlock *Succ : MBB->successors()) {
2215  if (Succ->isEHPad())
2216  continue;
2217  if (Succ == MBB)
2218  continue;
2219  BlockChain &SuccChain = *BlockToChain[Succ];
2220  // Don't split chains, either this chain or the successor's chain.
2221  if (&Chain == &SuccChain) {
2222  LLVM_DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
2223  << getBlockName(Succ) << " (chain conflict)\n");
2224  continue;
2225  }
2226 
2227  auto SuccProb = MBPI->getEdgeProbability(MBB, Succ);
2228  if (LoopBlockSet.count(Succ)) {
2229  LLVM_DEBUG(dbgs() << " looping: " << getBlockName(MBB) << " -> "
2230  << getBlockName(Succ) << " (" << SuccProb << ")\n");
2231  HasLoopingSucc = true;
2232  continue;
2233  }
2234 
2235  unsigned SuccLoopDepth = 0;
2236  if (MachineLoop *ExitLoop = MLI->getLoopFor(Succ)) {
2237  SuccLoopDepth = ExitLoop->getLoopDepth();
2238  if (ExitLoop->contains(&L))
2239  BlocksExitingToOuterLoop.insert(MBB);
2240  }
2241 
2242  BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(MBB) * SuccProb;
2243  LLVM_DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
2244  << getBlockName(Succ) << " [L:" << SuccLoopDepth
2245  << "] (";
2246  MBFI->printBlockFreq(dbgs(), ExitEdgeFreq) << ")\n");
2247  // Note that we bias this toward an existing layout successor to retain
2248  // incoming order in the absence of better information. The exit must have
2249  // a frequency higher than the current exit before we consider breaking
2250  // the layout.
2251  BranchProbability Bias(100 - ExitBlockBias, 100);
2252  if (!ExitingBB || SuccLoopDepth > BestExitLoopDepth ||
2253  ExitEdgeFreq > BestExitEdgeFreq ||
2254  (MBB->isLayoutSuccessor(Succ) &&
2255  !(ExitEdgeFreq < BestExitEdgeFreq * Bias))) {
2256  BestExitEdgeFreq = ExitEdgeFreq;
2257  ExitingBB = MBB;
2258  }
2259  }
2260 
2261  if (!HasLoopingSucc) {
2262  // Restore the old exiting state, no viable looping successor was found.
2263  ExitingBB = OldExitingBB;
2264  BestExitEdgeFreq = OldBestExitEdgeFreq;
2265  }
2266  }
2267  // Without a candidate exiting block or with only a single block in the
2268  // loop, just use the loop header to layout the loop.
2269  if (!ExitingBB) {
2270  LLVM_DEBUG(
2271  dbgs() << " No other candidate exit blocks, using loop header\n");
2272  return nullptr;
2273  }
2274  if (L.getNumBlocks() == 1) {
2275  LLVM_DEBUG(dbgs() << " Loop has 1 block, using loop header as exit\n");
2276  return nullptr;
2277  }
2278 
2279  // Also, if we have exit blocks which lead to outer loops but didn't select
2280  // one of them as the exiting block we are rotating toward, disable loop
2281  // rotation altogether.
2282  if (!BlocksExitingToOuterLoop.empty() &&
2283  !BlocksExitingToOuterLoop.count(ExitingBB))
2284  return nullptr;
2285 
2286  LLVM_DEBUG(dbgs() << " Best exiting block: " << getBlockName(ExitingBB)
2287  << "\n");
2288  ExitFreq = BestExitEdgeFreq;
2289  return ExitingBB;
2290 }
2291 
2292 /// Check if there is a fallthrough to loop header Top.
2293 ///
2294 /// 1. Look for a Pred that can be layout before Top.
2295 /// 2. Check if Top is the most possible successor of Pred.
2296 bool
2297 MachineBlockPlacement::hasViableTopFallthrough(
2298  const MachineBasicBlock *Top,
2299  const BlockFilterSet &LoopBlockSet) {
2300  for (MachineBasicBlock *Pred : Top->predecessors()) {
2301  BlockChain *PredChain = BlockToChain[Pred];
2302  if (!LoopBlockSet.count(Pred) &&
2303  (!PredChain || Pred == *std::prev(PredChain->end()))) {
2304  // Found a Pred block can be placed before Top.
2305  // Check if Top is the best successor of Pred.
2306  auto TopProb = MBPI->getEdgeProbability(Pred, Top);
2307  bool TopOK = true;
2308  for (MachineBasicBlock *Succ : Pred->successors()) {
2309  auto SuccProb = MBPI->getEdgeProbability(Pred, Succ);
2310  BlockChain *SuccChain = BlockToChain[Succ];
2311  // Check if Succ can be placed after Pred.
2312  // Succ should not be in any chain, or it is the head of some chain.
2313  if ((!SuccChain || Succ == *SuccChain->begin()) && SuccProb > TopProb) {
2314  TopOK = false;
2315  break;
2316  }
2317  }
2318  if (TopOK)
2319  return true;
2320  }
2321  }
2322  return false;
2323 }
2324 
2325 /// Attempt to rotate an exiting block to the bottom of the loop.
2326 ///
2327 /// Once we have built a chain, try to rotate it to line up the hot exit block
2328 /// with fallthrough out of the loop if doing so doesn't introduce unnecessary
2329 /// branches. For example, if the loop has fallthrough into its header and out
2330 /// of its bottom already, don't rotate it.
2331 void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
2332  const MachineBasicBlock *ExitingBB,
2333  BlockFrequency ExitFreq,
2334  const BlockFilterSet &LoopBlockSet) {
2335  if (!ExitingBB)
2336  return;
2337 
2338  MachineBasicBlock *Top = *LoopChain.begin();
2339  MachineBasicBlock *Bottom = *std::prev(LoopChain.end());
2340 
2341  // If ExitingBB is already the last one in a chain then nothing to do.
2342  if (Bottom == ExitingBB)
2343  return;
2344 
2345  // The entry block should always be the first BB in a function.
2346  if (Top->isEntryBlock())
2347  return;
2348 
2349  bool ViableTopFallthrough = hasViableTopFallthrough(Top, LoopBlockSet);
2350 
2351  // If the header has viable fallthrough, check whether the current loop
2352  // bottom is a viable exiting block. If so, bail out as rotating will
2353  // introduce an unnecessary branch.
2354  if (ViableTopFallthrough) {
2355  for (MachineBasicBlock *Succ : Bottom->successors()) {
2356  BlockChain *SuccChain = BlockToChain[Succ];
2357  if (!LoopBlockSet.count(Succ) &&
2358  (!SuccChain || Succ == *SuccChain->begin()))
2359  return;
2360  }
2361 
2362  // Rotate will destroy the top fallthrough, we need to ensure the new exit
2363  // frequency is larger than top fallthrough.
2364  BlockFrequency FallThrough2Top = TopFallThroughFreq(Top, LoopBlockSet);
2365  if (FallThrough2Top >= ExitFreq)
2366  return;
2367  }
2368 
2369  BlockChain::iterator ExitIt = llvm::find(LoopChain, ExitingBB);
2370  if (ExitIt == LoopChain.end())
2371  return;
2372 
2373  // Rotating a loop exit to the bottom when there is a fallthrough to top
2374  // trades the entry fallthrough for an exit fallthrough.
2375  // If there is no bottom->top edge, but the chosen exit block does have
2376  // a fallthrough, we break that fallthrough for nothing in return.
2377 
2378  // Let's consider an example. We have a built chain of basic blocks
2379  // B1, B2, ..., Bn, where Bk is a ExitingBB - chosen exit block.
2380  // By doing a rotation we get
2381  // Bk+1, ..., Bn, B1, ..., Bk
2382  // Break of fallthrough to B1 is compensated by a fallthrough from Bk.
2383  // If we had a fallthrough Bk -> Bk+1 it is broken now.
2384  // It might be compensated by fallthrough Bn -> B1.
2385  // So we have a condition to avoid creation of extra branch by loop rotation.
2386  // All below must be true to avoid loop rotation:
2387  // If there is a fallthrough to top (B1)
2388  // There was fallthrough from chosen exit block (Bk) to next one (Bk+1)
2389  // There is no fallthrough from bottom (Bn) to top (B1).
2390  // Please note that there is no exit fallthrough from Bn because we checked it
2391  // above.
2392  if (ViableTopFallthrough) {
2393  assert(std::next(ExitIt) != LoopChain.end() &&
2394  "Exit should not be last BB");
2395  MachineBasicBlock *NextBlockInChain = *std::next(ExitIt);
2396  if (ExitingBB->isSuccessor(NextBlockInChain))
2397  if (!Bottom->isSuccessor(Top))
2398  return;
2399  }
2400 
2401  LLVM_DEBUG(dbgs() << "Rotating loop to put exit " << getBlockName(ExitingBB)
2402  << " at bottom\n");
2403  std::rotate(LoopChain.begin(), std::next(ExitIt), LoopChain.end());
2404 }
2405 
2406 /// Attempt to rotate a loop based on profile data to reduce branch cost.
2407 ///
2408 /// With profile data, we can determine the cost in terms of missed fall through
2409 /// opportunities when rotating a loop chain and select the best rotation.
2410 /// Basically, there are three kinds of cost to consider for each rotation:
2411 /// 1. The possibly missed fall through edge (if it exists) from BB out of
2412 /// the loop to the loop header.
2413 /// 2. The possibly missed fall through edges (if they exist) from the loop
2414 /// exits to BB out of the loop.
2415 /// 3. The missed fall through edge (if it exists) from the last BB to the
2416 /// first BB in the loop chain.
2417 /// Therefore, the cost for a given rotation is the sum of costs listed above.
2418 /// We select the best rotation with the smallest cost.
2419 void MachineBlockPlacement::rotateLoopWithProfile(
2420  BlockChain &LoopChain, const MachineLoop &L,
2421  const BlockFilterSet &LoopBlockSet) {
2422  auto RotationPos = LoopChain.end();
2423  MachineBasicBlock *ChainHeaderBB = *LoopChain.begin();
2424 
2425  // The entry block should always be the first BB in a function.
2426  if (ChainHeaderBB->isEntryBlock())
2427  return;
2428 
2429  BlockFrequency SmallestRotationCost = BlockFrequency::getMaxFrequency();
2430 
2431  // A utility lambda that scales up a block frequency by dividing it by a
2432  // branch probability which is the reciprocal of the scale.
2433  auto ScaleBlockFrequency = [](BlockFrequency Freq,
2434  unsigned Scale) -> BlockFrequency {
2435  if (Scale == 0)
2436  return 0;
2437  // Use operator / between BlockFrequency and BranchProbability to implement
2438  // saturating multiplication.
2439  return Freq / BranchProbability(1, Scale);
2440  };
2441 
2442  // Compute the cost of the missed fall-through edge to the loop header if the
2443  // chain head is not the loop header. As we only consider natural loops with
2444  // single header, this computation can be done only once.
2445  BlockFrequency HeaderFallThroughCost(0);
2446  for (auto *Pred : ChainHeaderBB->predecessors()) {
2447  BlockChain *PredChain = BlockToChain[Pred];
2448  if (!LoopBlockSet.count(Pred) &&
2449  (!PredChain || Pred == *std::prev(PredChain->end()))) {
2450  auto EdgeFreq = MBFI->getBlockFreq(Pred) *
2451  MBPI->getEdgeProbability(Pred, ChainHeaderBB);
2452  auto FallThruCost = ScaleBlockFrequency(EdgeFreq, MisfetchCost);
2453  // If the predecessor has only an unconditional jump to the header, we
2454  // need to consider the cost of this jump.
2455  if (Pred->succ_size() == 1)
2456  FallThruCost += ScaleBlockFrequency(EdgeFreq, JumpInstCost);
2457  HeaderFallThroughCost = std::max(HeaderFallThroughCost, FallThruCost);
2458  }
2459  }
2460 
2461  // Here we collect all exit blocks in the loop, and for each exit we find out
2462  // its hottest exit edge. For each loop rotation, we define the loop exit cost
2463  // as the sum of frequencies of exit edges we collect here, excluding the exit
2464  // edge from the tail of the loop chain.
2466  for (auto *BB : LoopChain) {
2467  auto LargestExitEdgeProb = BranchProbability::getZero();
2468  for (auto *Succ : BB->successors()) {
2469  BlockChain *SuccChain = BlockToChain[Succ];
2470  if (!LoopBlockSet.count(Succ) &&
2471  (!SuccChain || Succ == *SuccChain->begin())) {
2472  auto SuccProb = MBPI->getEdgeProbability(BB, Succ);
2473  LargestExitEdgeProb = std::max(LargestExitEdgeProb, SuccProb);
2474  }
2475  }
2476  if (LargestExitEdgeProb > BranchProbability::getZero()) {
2477  auto ExitFreq = MBFI->getBlockFreq(BB) * LargestExitEdgeProb;
2478  ExitsWithFreq.emplace_back(BB, ExitFreq);
2479  }
2480  }
2481 
2482  // In this loop we iterate every block in the loop chain and calculate the
2483  // cost assuming the block is the head of the loop chain. When the loop ends,
2484  // we should have found the best candidate as the loop chain's head.
2485  for (auto Iter = LoopChain.begin(), TailIter = std::prev(LoopChain.end()),
2486  EndIter = LoopChain.end();
2487  Iter != EndIter; Iter++, TailIter++) {
2488  // TailIter is used to track the tail of the loop chain if the block we are
2489  // checking (pointed by Iter) is the head of the chain.
2490  if (TailIter == LoopChain.end())
2491  TailIter = LoopChain.begin();
2492 
2493  auto TailBB = *TailIter;
2494 
2495  // Calculate the cost by putting this BB to the top.
2496  BlockFrequency Cost = 0;
2497 
2498  // If the current BB is the loop header, we need to take into account the
2499  // cost of the missed fall through edge from outside of the loop to the
2500  // header.
2501  if (Iter != LoopChain.begin())
2502  Cost += HeaderFallThroughCost;
2503 
2504  // Collect the loop exit cost by summing up frequencies of all exit edges
2505  // except the one from the chain tail.
2506  for (auto &ExitWithFreq : ExitsWithFreq)
2507  if (TailBB != ExitWithFreq.first)
2508  Cost += ExitWithFreq.second;
2509 
2510  // The cost of breaking the once fall-through edge from the tail to the top
2511  // of the loop chain. Here we need to consider three cases:
2512  // 1. If the tail node has only one successor, then we will get an
2513  // additional jmp instruction. So the cost here is (MisfetchCost +
2514  // JumpInstCost) * tail node frequency.
2515  // 2. If the tail node has two successors, then we may still get an
2516  // additional jmp instruction if the layout successor after the loop
2517  // chain is not its CFG successor. Note that the more frequently executed
2518  // jmp instruction will be put ahead of the other one. Assume the
2519  // frequency of those two branches are x and y, where x is the frequency
2520  // of the edge to the chain head, then the cost will be
2521  // (x * MisfetechCost + min(x, y) * JumpInstCost) * tail node frequency.
2522  // 3. If the tail node has more than two successors (this rarely happens),
2523  // we won't consider any additional cost.
2524  if (TailBB->isSuccessor(*Iter)) {
2525  auto TailBBFreq = MBFI->getBlockFreq(TailBB);
2526  if (TailBB->succ_size() == 1)
2527  Cost += ScaleBlockFrequency(TailBBFreq.getFrequency(),
2529  else if (TailBB->succ_size() == 2) {
2530  auto TailToHeadProb = MBPI->getEdgeProbability(TailBB, *Iter);
2531  auto TailToHeadFreq = TailBBFreq * TailToHeadProb;
2532  auto ColderEdgeFreq = TailToHeadProb > BranchProbability(1, 2)
2533  ? TailBBFreq * TailToHeadProb.getCompl()
2534  : TailToHeadFreq;
2535  Cost += ScaleBlockFrequency(TailToHeadFreq, MisfetchCost) +
2536  ScaleBlockFrequency(ColderEdgeFreq, JumpInstCost);
2537  }
2538  }
2539 
2540  LLVM_DEBUG(dbgs() << "The cost of loop rotation by making "
2541  << getBlockName(*Iter)
2542  << " to the top: " << Cost.getFrequency() << "\n");
2543 
2544  if (Cost < SmallestRotationCost) {
2545  SmallestRotationCost = Cost;
2546  RotationPos = Iter;
2547  }
2548  }
2549 
2550  if (RotationPos != LoopChain.end()) {
2551  LLVM_DEBUG(dbgs() << "Rotate loop by making " << getBlockName(*RotationPos)
2552  << " to the top\n");
2553  std::rotate(LoopChain.begin(), RotationPos, LoopChain.end());
2554  }
2555 }
2556 
2557 /// Collect blocks in the given loop that are to be placed.
2558 ///
2559 /// When profile data is available, exclude cold blocks from the returned set;
2560 /// otherwise, collect all blocks in the loop.
2562 MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) {
2563  BlockFilterSet LoopBlockSet;
2564 
2565  // Filter cold blocks off from LoopBlockSet when profile data is available.
2566  // Collect the sum of frequencies of incoming edges to the loop header from
2567  // outside. If we treat the loop as a super block, this is the frequency of
2568  // the loop. Then for each block in the loop, we calculate the ratio between
2569  // its frequency and the frequency of the loop block. When it is too small,
2570  // don't add it to the loop chain. If there are outer loops, then this block
2571  // will be merged into the first outer loop chain for which this block is not
2572  // cold anymore. This needs precise profile data and we only do this when
2573  // profile data is available.
2574  if (F->getFunction().hasProfileData() || ForceLoopColdBlock) {
2575  BlockFrequency LoopFreq(0);
2576  for (auto *LoopPred : L.getHeader()->predecessors())
2577  if (!L.contains(LoopPred))
2578  LoopFreq += MBFI->getBlockFreq(LoopPred) *
2579  MBPI->getEdgeProbability(LoopPred, L.getHeader());
2580 
2581  for (MachineBasicBlock *LoopBB : L.getBlocks()) {
2582  if (LoopBlockSet.count(LoopBB))
2583  continue;
2584  auto Freq = MBFI->getBlockFreq(LoopBB).getFrequency();
2585  if (Freq == 0 || LoopFreq.getFrequency() / Freq > LoopToColdBlockRatio)
2586  continue;
2587  BlockChain *Chain = BlockToChain[LoopBB];
2588  for (MachineBasicBlock *ChainBB : *Chain)
2589  LoopBlockSet.insert(ChainBB);
2590  }
2591  } else
2592  LoopBlockSet.insert(L.block_begin(), L.block_end());
2593 
2594  return LoopBlockSet;
2595 }
2596 
2597 /// Forms basic block chains from the natural loop structures.
2598 ///
2599 /// These chains are designed to preserve the existing *structure* of the code
2600 /// as much as possible. We can then stitch the chains together in a way which
2601 /// both preserves the topological structure and minimizes taken conditional
2602 /// branches.
2603 void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {
2604  // First recurse through any nested loops, building chains for those inner
2605  // loops.
2606  for (const MachineLoop *InnerLoop : L)
2607  buildLoopChains(*InnerLoop);
2608 
2609  assert(BlockWorkList.empty() &&
2610  "BlockWorkList not empty when starting to build loop chains.");
2611  assert(EHPadWorkList.empty() &&
2612  "EHPadWorkList not empty when starting to build loop chains.");
2613  BlockFilterSet LoopBlockSet = collectLoopBlockSet(L);
2614 
2615  // Check if we have profile data for this function. If yes, we will rotate
2616  // this loop by modeling costs more precisely which requires the profile data
2617  // for better layout.
2618  bool RotateLoopWithProfile =
2620  (PreciseRotationCost && F->getFunction().hasProfileData());
2621 
2622  // First check to see if there is an obviously preferable top block for the
2623  // loop. This will default to the header, but may end up as one of the
2624  // predecessors to the header if there is one which will result in strictly
2625  // fewer branches in the loop body.
2626  MachineBasicBlock *LoopTop = findBestLoopTop(L, LoopBlockSet);
2627 
2628  // If we selected just the header for the loop top, look for a potentially
2629  // profitable exit block in the event that rotating the loop can eliminate
2630  // branches by placing an exit edge at the bottom.
2631  //
2632  // Loops are processed innermost to uttermost, make sure we clear
2633  // PreferredLoopExit before processing a new loop.
2634  PreferredLoopExit = nullptr;
2635  BlockFrequency ExitFreq;
2636  if (!RotateLoopWithProfile && LoopTop == L.getHeader())
2637  PreferredLoopExit = findBestLoopExit(L, LoopBlockSet, ExitFreq);
2638 
2639  BlockChain &LoopChain = *BlockToChain[LoopTop];
2640 
2641  // FIXME: This is a really lame way of walking the chains in the loop: we
2642  // walk the blocks, and use a set to prevent visiting a particular chain
2643  // twice.
2644  SmallPtrSet<BlockChain *, 4> UpdatedPreds;
2645  assert(LoopChain.UnscheduledPredecessors == 0 &&
2646  "LoopChain should not have unscheduled predecessors.");
2647  UpdatedPreds.insert(&LoopChain);
2648 
2649  for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2650  fillWorkLists(LoopBB, UpdatedPreds, &LoopBlockSet);
2651 
2652  buildChain(LoopTop, LoopChain, &LoopBlockSet);
2653 
2654  if (RotateLoopWithProfile)
2655  rotateLoopWithProfile(LoopChain, L, LoopBlockSet);
2656  else
2657  rotateLoop(LoopChain, PreferredLoopExit, ExitFreq, LoopBlockSet);
2658 
2659  LLVM_DEBUG({
2660  // Crash at the end so we get all of the debugging output first.
2661  bool BadLoop = false;
2662  if (LoopChain.UnscheduledPredecessors) {
2663  BadLoop = true;
2664  dbgs() << "Loop chain contains a block without its preds placed!\n"
2665  << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2666  << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n";
2667  }
2668  for (MachineBasicBlock *ChainBB : LoopChain) {
2669  dbgs() << " ... " << getBlockName(ChainBB) << "\n";
2670  if (!LoopBlockSet.remove(ChainBB)) {
2671  // We don't mark the loop as bad here because there are real situations
2672  // where this can occur. For example, with an unanalyzable fallthrough
2673  // from a loop block to a non-loop block or vice versa.
2674  dbgs() << "Loop chain contains a block not contained by the loop!\n"
2675  << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2676  << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2677  << " Bad block: " << getBlockName(ChainBB) << "\n";
2678  }
2679  }
2680 
2681  if (!LoopBlockSet.empty()) {
2682  BadLoop = true;
2683  for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2684  dbgs() << "Loop contains blocks never placed into a chain!\n"
2685  << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2686  << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2687  << " Bad block: " << getBlockName(LoopBB) << "\n";
2688  }
2689  assert(!BadLoop && "Detected problems with the placement of this loop.");
2690  });
2691 
2692  BlockWorkList.clear();
2693  EHPadWorkList.clear();
2694 }
2695 
2696 void MachineBlockPlacement::buildCFGChains() {
2697  // Ensure that every BB in the function has an associated chain to simplify
2698  // the assumptions of the remaining algorithm.
2699  SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
2700  for (MachineFunction::iterator FI = F->begin(), FE = F->end(); FI != FE;
2701  ++FI) {
2702  MachineBasicBlock *BB = &*FI;
2703  BlockChain *Chain =
2704  new (ChainAllocator.Allocate()) BlockChain(BlockToChain, BB);
2705  // Also, merge any blocks which we cannot reason about and must preserve
2706  // the exact fallthrough behavior for.
2707  while (true) {
2708  Cond.clear();
2709  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2710  if (!TII->analyzeBranch(*BB, TBB, FBB, Cond) || !FI->canFallThrough())
2711  break;
2712 
2713  MachineFunction::iterator NextFI = std::next(FI);
2714  MachineBasicBlock *NextBB = &*NextFI;
2715  // Ensure that the layout successor is a viable block, as we know that
2716  // fallthrough is a possibility.
2717  assert(NextFI != FE && "Can't fallthrough past the last block.");
2718  LLVM_DEBUG(dbgs() << "Pre-merging due to unanalyzable fallthrough: "
2719  << getBlockName(BB) << " -> " << getBlockName(NextBB)
2720  << "\n");
2721  Chain->merge(NextBB, nullptr);
2722 #ifndef NDEBUG
2723  BlocksWithUnanalyzableExits.insert(&*BB);
2724 #endif
2725  FI = NextFI;
2726  BB = NextBB;
2727  }
2728  }
2729 
2730  // Build any loop-based chains.
2731  PreferredLoopExit = nullptr;
2732  for (MachineLoop *L : *MLI)
2733  buildLoopChains(*L);
2734 
2735  assert(BlockWorkList.empty() &&
2736  "BlockWorkList should be empty before building final chain.");
2737  assert(EHPadWorkList.empty() &&
2738  "EHPadWorkList should be empty before building final chain.");
2739 
2740  SmallPtrSet<BlockChain *, 4> UpdatedPreds;
2741  for (MachineBasicBlock &MBB : *F)
2742  fillWorkLists(&MBB, UpdatedPreds);
2743 
2744  BlockChain &FunctionChain = *BlockToChain[&F->front()];
2745  buildChain(&F->front(), FunctionChain);
2746 
2747 #ifndef NDEBUG
2748  using FunctionBlockSetType = SmallPtrSet<MachineBasicBlock *, 16>;
2749 #endif
2750  LLVM_DEBUG({
2751  // Crash at the end so we get all of the debugging output first.
2752  bool BadFunc = false;
2753  FunctionBlockSetType FunctionBlockSet;
2754  for (MachineBasicBlock &MBB : *F)
2755  FunctionBlockSet.insert(&MBB);
2756 
2757  for (MachineBasicBlock *ChainBB : FunctionChain)
2758  if (!FunctionBlockSet.erase(ChainBB)) {
2759  BadFunc = true;
2760  dbgs() << "Function chain contains a block not in the function!\n"
2761  << " Bad block: " << getBlockName(ChainBB) << "\n";
2762  }
2763 
2764  if (!FunctionBlockSet.empty()) {
2765  BadFunc = true;
2766  for (MachineBasicBlock *RemainingBB : FunctionBlockSet)
2767  dbgs() << "Function contains blocks never placed into a chain!\n"
2768  << " Bad block: " << getBlockName(RemainingBB) << "\n";
2769  }
2770  assert(!BadFunc && "Detected problems with the block placement.");
2771  });
2772 
2773  // Remember original layout ordering, so we can update terminators after
2774  // reordering to point to the original layout successor.
2775  SmallVector<MachineBasicBlock *, 4> OriginalLayoutSuccessors(
2776  F->getNumBlockIDs());
2777  {
2778  MachineBasicBlock *LastMBB = nullptr;
2779  for (auto &MBB : *F) {
2780  if (LastMBB != nullptr)
2781  OriginalLayoutSuccessors[LastMBB->getNumber()] = &MBB;
2782  LastMBB = &MBB;
2783  }
2784  OriginalLayoutSuccessors[F->back().getNumber()] = nullptr;
2785  }
2786 
2787  // Splice the blocks into place.
2788  MachineFunction::iterator InsertPos = F->begin();
2789  LLVM_DEBUG(dbgs() << "[MBP] Function: " << F->getName() << "\n");
2790  for (MachineBasicBlock *ChainBB : FunctionChain) {
2791  LLVM_DEBUG(dbgs() << (ChainBB == *FunctionChain.begin() ? "Placing chain "
2792  : " ... ")
2793  << getBlockName(ChainBB) << "\n");
2794  if (InsertPos != MachineFunction::iterator(ChainBB))
2795  F->splice(InsertPos, ChainBB);
2796  else
2797  ++InsertPos;
2798 
2799  // Update the terminator of the previous block.
2800  if (ChainBB == *FunctionChain.begin())
2801  continue;
2802  MachineBasicBlock *PrevBB = &*std::prev(MachineFunction::iterator(ChainBB));
2803 
2804  // FIXME: It would be awesome of updateTerminator would just return rather
2805  // than assert when the branch cannot be analyzed in order to remove this
2806  // boiler plate.
2807  Cond.clear();
2808  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2809 
2810 #ifndef NDEBUG
2811  if (!BlocksWithUnanalyzableExits.count(PrevBB)) {
2812  // Given the exact block placement we chose, we may actually not _need_ to
2813  // be able to edit PrevBB's terminator sequence, but not being _able_ to
2814  // do that at this point is a bug.
2815  assert((!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond) ||
2816  !PrevBB->canFallThrough()) &&
2817  "Unexpected block with un-analyzable fallthrough!");
2818  Cond.clear();
2819  TBB = FBB = nullptr;
2820  }
2821 #endif
2822 
2823  // The "PrevBB" is not yet updated to reflect current code layout, so,
2824  // o. it may fall-through to a block without explicit "goto" instruction
2825  // before layout, and no longer fall-through it after layout; or
2826  // o. just opposite.
2827  //
2828  // analyzeBranch() may return erroneous value for FBB when these two
2829  // situations take place. For the first scenario FBB is mistakenly set NULL;
2830  // for the 2nd scenario, the FBB, which is expected to be NULL, is
2831  // mistakenly pointing to "*BI".
2832  // Thus, if the future change needs to use FBB before the layout is set, it
2833  // has to correct FBB first by using the code similar to the following:
2834  //
2835  // if (!Cond.empty() && (!FBB || FBB == ChainBB)) {
2836  // PrevBB->updateTerminator();
2837  // Cond.clear();
2838  // TBB = FBB = nullptr;
2839  // if (TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) {
2840  // // FIXME: This should never take place.
2841  // TBB = FBB = nullptr;
2842  // }
2843  // }
2844  if (!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) {
2845  PrevBB->updateTerminator(OriginalLayoutSuccessors[PrevBB->getNumber()]);
2846  }
2847  }
2848 
2849  // Fixup the last block.
2850  Cond.clear();
2851  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2852  if (!TII->analyzeBranch(F->back(), TBB, FBB, Cond)) {
2853  MachineBasicBlock *PrevBB = &F->back();
2854  PrevBB->updateTerminator(OriginalLayoutSuccessors[PrevBB->getNumber()]);
2855  }
2856 
2857  BlockWorkList.clear();
2858  EHPadWorkList.clear();
2859 }
2860 
2861 void MachineBlockPlacement::optimizeBranches() {
2862  BlockChain &FunctionChain = *BlockToChain[&F->front()];
2863  SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
2864 
2865  // Now that all the basic blocks in the chain have the proper layout,
2866  // make a final call to analyzeBranch with AllowModify set.
2867  // Indeed, the target may be able to optimize the branches in a way we
2868  // cannot because all branches may not be analyzable.
2869  // E.g., the target may be able to remove an unconditional branch to
2870  // a fallthrough when it occurs after predicated terminators.
2871  for (MachineBasicBlock *ChainBB : FunctionChain) {
2872  Cond.clear();
2873  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2874  if (!TII->analyzeBranch(*ChainBB, TBB, FBB, Cond, /*AllowModify*/ true)) {
2875  // If PrevBB has a two-way branch, try to re-order the branches
2876  // such that we branch to the successor with higher probability first.
2877  if (TBB && !Cond.empty() && FBB &&
2878  MBPI->getEdgeProbability(ChainBB, FBB) >
2879  MBPI->getEdgeProbability(ChainBB, TBB) &&
2881  LLVM_DEBUG(dbgs() << "Reverse order of the two branches: "
2882  << getBlockName(ChainBB) << "\n");
2883  LLVM_DEBUG(dbgs() << " Edge probability: "
2884  << MBPI->getEdgeProbability(ChainBB, FBB) << " vs "
2885  << MBPI->getEdgeProbability(ChainBB, TBB) << "\n");
2886  DebugLoc dl; // FIXME: this is nowhere
2887  TII->removeBranch(*ChainBB);
2888  TII->insertBranch(*ChainBB, FBB, TBB, Cond, dl);
2889  }
2890  }
2891  }
2892 }
2893 
2894 void MachineBlockPlacement::alignBlocks() {
2895  // Walk through the backedges of the function now that we have fully laid out
2896  // the basic blocks and align the destination of each backedge. We don't rely
2897  // exclusively on the loop info here so that we can align backedges in
2898  // unnatural CFGs and backedges that were introduced purely because of the
2899  // loop rotations done during this layout pass.
2900  if (F->getFunction().hasMinSize() ||
2901  (F->getFunction().hasOptSize() && !TLI->alignLoopsWithOptSize()))
2902  return;
2903  BlockChain &FunctionChain = *BlockToChain[&F->front()];
2904  if (FunctionChain.begin() == FunctionChain.end())
2905  return; // Empty chain.
2906 
2907  const BranchProbability ColdProb(1, 5); // 20%
2908  BlockFrequency EntryFreq = MBFI->getBlockFreq(&F->front());
2909  BlockFrequency WeightedEntryFreq = EntryFreq * ColdProb;
2910  for (MachineBasicBlock *ChainBB : FunctionChain) {
2911  if (ChainBB == *FunctionChain.begin())
2912  continue;
2913 
2914  // Don't align non-looping basic blocks. These are unlikely to execute
2915  // enough times to matter in practice. Note that we'll still handle
2916  // unnatural CFGs inside of a natural outer loop (the common case) and
2917  // rotated loops.
2918  MachineLoop *L = MLI->getLoopFor(ChainBB);
2919  if (!L)
2920  continue;
2921 
2922  const Align Align = TLI->getPrefLoopAlignment(L);
2923  if (Align == 1)
2924  continue; // Don't care about loop alignment.
2925 
2926  // If the block is cold relative to the function entry don't waste space
2927  // aligning it.
2928  BlockFrequency Freq = MBFI->getBlockFreq(ChainBB);
2929  if (Freq < WeightedEntryFreq)
2930  continue;
2931 
2932  // If the block is cold relative to its loop header, don't align it
2933  // regardless of what edges into the block exist.
2934  MachineBasicBlock *LoopHeader = L->getHeader();
2935  BlockFrequency LoopHeaderFreq = MBFI->getBlockFreq(LoopHeader);
2936  if (Freq < (LoopHeaderFreq * ColdProb))
2937  continue;
2938 
2939  // If the global profiles indicates so, don't align it.
2940  if (llvm::shouldOptimizeForSize(ChainBB, PSI, MBFI.get()) &&
2941  !TLI->alignLoopsWithOptSize())
2942  continue;
2943 
2944  // Check for the existence of a non-layout predecessor which would benefit
2945  // from aligning this block.
2946  MachineBasicBlock *LayoutPred =
2947  &*std::prev(MachineFunction::iterator(ChainBB));
2948 
2949  auto DetermineMaxAlignmentPadding = [&]() {
2950  // Set the maximum bytes allowed to be emitted for alignment.
2951  unsigned MaxBytes;
2952  if (MaxBytesForAlignmentOverride.getNumOccurrences() > 0)
2953  MaxBytes = MaxBytesForAlignmentOverride;
2954  else
2955  MaxBytes = TLI->getMaxPermittedBytesForAlignment(ChainBB);
2956  ChainBB->setMaxBytesForAlignment(MaxBytes);
2957  };
2958 
2959  // Force alignment if all the predecessors are jumps. We already checked
2960  // that the block isn't cold above.
2961  if (!LayoutPred->isSuccessor(ChainBB)) {
2962  ChainBB->setAlignment(Align);
2963  DetermineMaxAlignmentPadding();
2964  continue;
2965  }
2966 
2967  // Align this block if the layout predecessor's edge into this block is
2968  // cold relative to the block. When this is true, other predecessors make up
2969  // all of the hot entries into the block and thus alignment is likely to be
2970  // important.
2971  BranchProbability LayoutProb =
2972  MBPI->getEdgeProbability(LayoutPred, ChainBB);
2973  BlockFrequency LayoutEdgeFreq = MBFI->getBlockFreq(LayoutPred) * LayoutProb;
2974  if (LayoutEdgeFreq <= (Freq * ColdProb)) {
2975  ChainBB->setAlignment(Align);
2976  DetermineMaxAlignmentPadding();
2977  }
2978  }
2979 }
2980 
2981 /// Tail duplicate \p BB into (some) predecessors if profitable, repeating if
2982 /// it was duplicated into its chain predecessor and removed.
2983 /// \p BB - Basic block that may be duplicated.
2984 ///
2985 /// \p LPred - Chosen layout predecessor of \p BB.
2986 /// Updated to be the chain end if LPred is removed.
2987 /// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
2988 /// \p BlockFilter - Set of blocks that belong to the loop being laid out.
2989 /// Used to identify which blocks to update predecessor
2990 /// counts.
2991 /// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
2992 /// chosen in the given order due to unnatural CFG
2993 /// only needed if \p BB is removed and
2994 /// \p PrevUnplacedBlockIt pointed to \p BB.
2995 /// @return true if \p BB was removed.
2996 bool MachineBlockPlacement::repeatedlyTailDuplicateBlock(
2998  const MachineBasicBlock *LoopHeaderBB,
2999  BlockChain &Chain, BlockFilterSet *BlockFilter,
3000  MachineFunction::iterator &PrevUnplacedBlockIt) {
3001  bool Removed, DuplicatedToLPred;
3002  bool DuplicatedToOriginalLPred;
3003  Removed = maybeTailDuplicateBlock(BB, LPred, Chain, BlockFilter,
3004  PrevUnplacedBlockIt,
3005  DuplicatedToLPred);
3006  if (!Removed)
3007  return false;
3008  DuplicatedToOriginalLPred = DuplicatedToLPred;
3009  // Iteratively try to duplicate again. It can happen that a block that is
3010  // duplicated into is still small enough to be duplicated again.
3011  // No need to call markBlockSuccessors in this case, as the blocks being
3012  // duplicated from here on are already scheduled.
3013  while (DuplicatedToLPred && Removed) {
3014  MachineBasicBlock *DupBB, *DupPred;
3015  // The removal callback causes Chain.end() to be updated when a block is
3016  // removed. On the first pass through the loop, the chain end should be the
3017  // same as it was on function entry. On subsequent passes, because we are
3018  // duplicating the block at the end of the chain, if it is removed the
3019  // chain will have shrunk by one block.
3020  BlockChain::iterator ChainEnd = Chain.end();
3021  DupBB = *(--ChainEnd);
3022  // Now try to duplicate again.
3023  if (ChainEnd == Chain.begin())
3024  break;
3025  DupPred = *std::prev(ChainEnd);
3026  Removed = maybeTailDuplicateBlock(DupBB, DupPred, Chain, BlockFilter,
3027  PrevUnplacedBlockIt,
3028  DuplicatedToLPred);
3029  }
3030  // If BB was duplicated into LPred, it is now scheduled. But because it was
3031  // removed, markChainSuccessors won't be called for its chain. Instead we
3032  // call markBlockSuccessors for LPred to achieve the same effect. This must go
3033  // at the end because repeating the tail duplication can increase the number
3034  // of unscheduled predecessors.
3035  LPred = *std::prev(Chain.end());
3036  if (DuplicatedToOriginalLPred)
3037  markBlockSuccessors(Chain, LPred, LoopHeaderBB, BlockFilter);
3038  return true;
3039 }
3040 
3041 /// Tail duplicate \p BB into (some) predecessors if profitable.
3042 /// \p BB - Basic block that may be duplicated
3043 /// \p LPred - Chosen layout predecessor of \p BB
3044 /// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
3045 /// \p BlockFilter - Set of blocks that belong to the loop being laid out.
3046 /// Used to identify which blocks to update predecessor
3047 /// counts.
3048 /// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
3049 /// chosen in the given order due to unnatural CFG
3050 /// only needed if \p BB is removed and
3051 /// \p PrevUnplacedBlockIt pointed to \p BB.
3052 /// \p DuplicatedToLPred - True if the block was duplicated into LPred.
3053 /// \return - True if the block was duplicated into all preds and removed.
3054 bool MachineBlockPlacement::maybeTailDuplicateBlock(
3056  BlockChain &Chain, BlockFilterSet *BlockFilter,
3057  MachineFunction::iterator &PrevUnplacedBlockIt,
3058  bool &DuplicatedToLPred) {
3059  DuplicatedToLPred = false;
3060  if (!shouldTailDuplicate(BB))
3061  return false;
3062 
3063  LLVM_DEBUG(dbgs() << "Redoing tail duplication for Succ#" << BB->getNumber()
3064  << "\n");
3065 
3066  // This has to be a callback because none of it can be done after
3067  // BB is deleted.
3068  bool Removed = false;
3069  auto RemovalCallback =
3070  [&](MachineBasicBlock *RemBB) {
3071  // Signal to outer function
3072  Removed = true;
3073 
3074  // Conservative default.
3075  bool InWorkList = true;
3076  // Remove from the Chain and Chain Map
3077  if (BlockToChain.count(RemBB)) {
3078  BlockChain *Chain = BlockToChain[RemBB];
3079  InWorkList = Chain->UnscheduledPredecessors == 0;
3080  Chain->remove(RemBB);
3081  BlockToChain.erase(RemBB);
3082  }
3083 
3084  // Handle the unplaced block iterator
3085  if (&(*PrevUnplacedBlockIt) == RemBB) {
3086  PrevUnplacedBlockIt++;
3087  }
3088 
3089  // Handle the Work Lists
3090  if (InWorkList) {
3091  SmallVectorImpl<MachineBasicBlock *> &RemoveList = BlockWorkList;
3092  if (RemBB->isEHPad())
3093  RemoveList = EHPadWorkList;
3094  llvm::erase_value(RemoveList, RemBB);
3095  }
3096 
3097  // Handle the filter set
3098  if (BlockFilter) {
3099  BlockFilter->remove(RemBB);
3100  }
3101 
3102  // Remove the block from loop info.
3103  MLI->removeBlock(RemBB);
3104  if (RemBB == PreferredLoopExit)
3105  PreferredLoopExit = nullptr;
3106 
3107  LLVM_DEBUG(dbgs() << "TailDuplicator deleted block: "
3108  << getBlockName(RemBB) << "\n");
3109  };
3110  auto RemovalCallbackRef =
3111  function_ref<void(MachineBasicBlock*)>(RemovalCallback);
3112 
3113  SmallVector<MachineBasicBlock *, 8> DuplicatedPreds;
3114  bool IsSimple = TailDup.isSimpleBB(BB);
3115  SmallVector<MachineBasicBlock *, 8> CandidatePreds;
3116  SmallVectorImpl<MachineBasicBlock *> *CandidatePtr = nullptr;
3117  if (F->getFunction().hasProfileData()) {
3118  // We can do partial duplication with precise profile information.
3119  findDuplicateCandidates(CandidatePreds, BB, BlockFilter);
3120  if (CandidatePreds.size() == 0)
3121  return false;
3122  if (CandidatePreds.size() < BB->pred_size())
3123  CandidatePtr = &CandidatePreds;
3124  }
3125  TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred, &DuplicatedPreds,
3126  &RemovalCallbackRef, CandidatePtr);
3127 
3128  // Update UnscheduledPredecessors to reflect tail-duplication.
3129  DuplicatedToLPred = false;
3130  for (MachineBasicBlock *Pred : DuplicatedPreds) {
3131  // We're only looking for unscheduled predecessors that match the filter.
3132  BlockChain* PredChain = BlockToChain[Pred];
3133  if (Pred == LPred)
3134  DuplicatedToLPred = true;
3135  if (Pred == LPred || (BlockFilter && !BlockFilter->count(Pred))
3136  || PredChain == &Chain)
3137  continue;
3138  for (MachineBasicBlock *NewSucc : Pred->successors()) {
3139  if (BlockFilter && !BlockFilter->count(NewSucc))
3140  continue;
3141  BlockChain *NewChain = BlockToChain[NewSucc];
3142  if (NewChain != &Chain && NewChain != PredChain)
3143  NewChain->UnscheduledPredecessors++;
3144  }
3145  }
3146  return Removed;
3147 }
3148 
3149 // Count the number of actual machine instructions.
3151  uint64_t InstrCount = 0;
3152  for (MachineInstr &MI : *MBB) {
3153  if (!MI.isPHI() && !MI.isMetaInstruction())
3154  InstrCount += 1;
3155  }
3156  return InstrCount;
3157 }
3158 
3159 // The size cost of duplication is the instruction size of the duplicated block.
3160 // So we should scale the threshold accordingly. But the instruction size is not
3161 // available on all targets, so we use the number of instructions instead.
3162 BlockFrequency MachineBlockPlacement::scaleThreshold(MachineBasicBlock *BB) {
3163  return DupThreshold.getFrequency() * countMBBInstruction(BB);
3164 }
3165 
3166 // Returns true if BB is Pred's best successor.
3167 bool MachineBlockPlacement::isBestSuccessor(MachineBasicBlock *BB,
3168  MachineBasicBlock *Pred,
3169  BlockFilterSet *BlockFilter) {
3170  if (BB == Pred)
3171  return false;
3172  if (BlockFilter && !BlockFilter->count(Pred))
3173  return false;
3174  BlockChain *PredChain = BlockToChain[Pred];
3175  if (PredChain && (Pred != *std::prev(PredChain->end())))
3176  return false;
3177 
3178  // Find the successor with largest probability excluding BB.
3180  for (MachineBasicBlock *Succ : Pred->successors())
3181  if (Succ != BB) {
3182  if (BlockFilter && !BlockFilter->count(Succ))
3183  continue;
3184  BlockChain *SuccChain = BlockToChain[Succ];
3185  if (SuccChain && (Succ != *SuccChain->begin()))
3186  continue;
3187  BranchProbability SuccProb = MBPI->getEdgeProbability(Pred, Succ);
3188  if (SuccProb > BestProb)
3189  BestProb = SuccProb;
3190  }
3191 
3192  BranchProbability BBProb = MBPI->getEdgeProbability(Pred, BB);
3193  if (BBProb <= BestProb)
3194  return false;
3195 
3196  // Compute the number of reduced taken branches if Pred falls through to BB
3197  // instead of another successor. Then compare it with threshold.
3198  BlockFrequency PredFreq = getBlockCountOrFrequency(Pred);
3199  BlockFrequency Gain = PredFreq * (BBProb - BestProb);
3200  return Gain > scaleThreshold(BB);
3201 }
3202 
3203 // Find out the predecessors of BB and BB can be beneficially duplicated into
3204 // them.
3205 void MachineBlockPlacement::findDuplicateCandidates(
3208  BlockFilterSet *BlockFilter) {
3209  MachineBasicBlock *Fallthrough = nullptr;
3210  BranchProbability DefaultBranchProb = BranchProbability::getZero();
3211  BlockFrequency BBDupThreshold(scaleThreshold(BB));
3212  SmallVector<MachineBasicBlock *, 8> Preds(BB->predecessors());
3213  SmallVector<MachineBasicBlock *, 8> Succs(BB->successors());
3214 
3215  // Sort for highest frequency.
3216  auto CmpSucc = [&](MachineBasicBlock *A, MachineBasicBlock *B) {
3217  return MBPI->getEdgeProbability(BB, A) > MBPI->getEdgeProbability(BB, B);
3218  };
3219  auto CmpPred = [&](MachineBasicBlock *A, MachineBasicBlock *B) {
3220  return MBFI->getBlockFreq(A) > MBFI->getBlockFreq(B);
3221  };
3222  llvm::stable_sort(Succs, CmpSucc);
3223  llvm::stable_sort(Preds, CmpPred);
3224 
3225  auto SuccIt = Succs.begin();
3226  if (SuccIt != Succs.end()) {
3227  DefaultBranchProb = MBPI->getEdgeProbability(BB, *SuccIt).getCompl();
3228  }
3229 
3230  // For each predecessors of BB, compute the benefit of duplicating BB,
3231  // if it is larger than the threshold, add it into Candidates.
3232  //
3233  // If we have following control flow.
3234  //
3235  // PB1 PB2 PB3 PB4
3236  // \ | / /\
3237  // \ | / / \
3238  // \ |/ / \
3239  // BB----/ OB
3240  // /\
3241  // / \
3242  // SB1 SB2
3243  //
3244  // And it can be partially duplicated as
3245  //
3246  // PB2+BB
3247  // | PB1 PB3 PB4
3248  // | | / /\
3249  // | | / / \
3250  // | |/ / \
3251  // | BB----/ OB
3252  // |\ /|
3253  // | X |
3254  // |/ \|
3255  // SB2 SB1
3256  //
3257  // The benefit of duplicating into a predecessor is defined as
3258  // Orig_taken_branch - Duplicated_taken_branch
3259  //
3260  // The Orig_taken_branch is computed with the assumption that predecessor
3261  // jumps to BB and the most possible successor is laid out after BB.
3262  //
3263  // The Duplicated_taken_branch is computed with the assumption that BB is
3264  // duplicated into PB, and one successor is layout after it (SB1 for PB1 and
3265  // SB2 for PB2 in our case). If there is no available successor, the combined
3266  // block jumps to all BB's successor, like PB3 in this example.
3267  //
3268  // If a predecessor has multiple successors, so BB can't be duplicated into
3269  // it. But it can beneficially fall through to BB, and duplicate BB into other
3270  // predecessors.
3271  for (MachineBasicBlock *Pred : Preds) {
3272  BlockFrequency PredFreq = getBlockCountOrFrequency(Pred);
3273 
3274  if (!TailDup.canTailDuplicate(BB, Pred)) {
3275  // BB can't be duplicated into Pred, but it is possible to be layout
3276  // below Pred.
3277  if (!Fallthrough && isBestSuccessor(BB, Pred, BlockFilter)) {
3278  Fallthrough = Pred;
3279  if (SuccIt != Succs.end())
3280  SuccIt++;
3281  }
3282  continue;
3283  }
3284 
3285  BlockFrequency OrigCost = PredFreq + PredFreq * DefaultBranchProb;
3286  BlockFrequency DupCost;
3287  if (SuccIt == Succs.end()) {
3288  // Jump to all successors;
3289  if (Succs.size() > 0)
3290  DupCost += PredFreq;
3291  } else {
3292  // Fallthrough to *SuccIt, jump to all other successors;
3293  DupCost += PredFreq;
3294  DupCost -= PredFreq * MBPI->getEdgeProbability(BB, *SuccIt);
3295  }
3296 
3297  assert(OrigCost >= DupCost);
3298  OrigCost -= DupCost;
3299  if (OrigCost > BBDupThreshold) {
3300  Candidates.push_back(Pred);
3301  if (SuccIt != Succs.end())
3302  SuccIt++;
3303  }
3304  }
3305 
3306  // No predecessors can optimally fallthrough to BB.
3307  // So we can change one duplication into fallthrough.
3308  if (!Fallthrough) {
3309  if ((Candidates.size() < Preds.size()) && (Candidates.size() > 0)) {
3310  Candidates[0] = Candidates.back();
3311  Candidates.pop_back();
3312  }
3313  }
3314 }
3315 
3316 void MachineBlockPlacement::initDupThreshold() {
3317  DupThreshold = 0;
3318  if (!F->getFunction().hasProfileData())
3319  return;
3320 
3321  // We prefer to use prifile count.
3322  uint64_t HotThreshold = PSI->getOrCompHotCountThreshold();
3323  if (HotThreshold != UINT64_MAX) {
3324  UseProfileCount = true;
3325  DupThreshold = HotThreshold * TailDupProfilePercentThreshold / 100;
3326  return;
3327  }
3328 
3329  // Profile count is not available, we can use block frequency instead.
3330  BlockFrequency MaxFreq = 0;
3331  for (MachineBasicBlock &MBB : *F) {
3332  BlockFrequency Freq = MBFI->getBlockFreq(&MBB);
3333  if (Freq > MaxFreq)
3334  MaxFreq = Freq;
3335  }
3336 
3337  BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
3338  DupThreshold = MaxFreq * ThresholdProb;
3339  UseProfileCount = false;
3340 }
3341 
3342 bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
3343  if (skipFunction(MF.getFunction()))
3344  return false;
3345 
3346  // Check for single-block functions and skip them.
3347  if (std::next(MF.begin()) == MF.end())
3348  return false;
3349 
3350  F = &MF;
3351  MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
3352  MBFI = std::make_unique<MBFIWrapper>(
3353  getAnalysis<MachineBlockFrequencyInfo>());
3354  MLI = &getAnalysis<MachineLoopInfo>();
3355  TII = MF.getSubtarget().getInstrInfo();
3356  TLI = MF.getSubtarget().getTargetLowering();
3357  MPDT = nullptr;
3358  PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
3359 
3360  initDupThreshold();
3361 
3362  // Initialize PreferredLoopExit to nullptr here since it may never be set if
3363  // there are no MachineLoops.
3364  PreferredLoopExit = nullptr;
3365 
3366  assert(BlockToChain.empty() &&
3367  "BlockToChain map should be empty before starting placement.");
3368  assert(ComputedEdges.empty() &&
3369  "Computed Edge map should be empty before starting placement.");
3370 
3371  unsigned TailDupSize = TailDupPlacementThreshold;
3372  // If only the aggressive threshold is explicitly set, use it.
3373  if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 &&
3374  TailDupPlacementThreshold.getNumOccurrences() == 0)
3376 
3377  TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
3378  // For aggressive optimization, we can adjust some thresholds to be less
3379  // conservative.
3380  if (PassConfig->getOptLevel() >= CodeGenOpt::Aggressive) {
3381  // At O3 we should be more willing to copy blocks for tail duplication. This
3382  // increases size pressure, so we only do it at O3
3383  // Do this unless only the regular threshold is explicitly set.
3384  if (TailDupPlacementThreshold.getNumOccurrences() == 0 ||
3385  TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0)
3387  }
3388 
3389  // If there's no threshold provided through options, query the target
3390  // information for a threshold instead.
3391  if (TailDupPlacementThreshold.getNumOccurrences() == 0 &&
3392  (PassConfig->getOptLevel() < CodeGenOpt::Aggressive ||
3393  TailDupPlacementAggressiveThreshold.getNumOccurrences() == 0))
3394  TailDupSize = TII->getTailDuplicateSize(PassConfig->getOptLevel());
3395 
3396  if (allowTailDupPlacement()) {
3397  MPDT = &getAnalysis<MachinePostDominatorTree>();
3398  bool OptForSize = MF.getFunction().hasOptSize() ||
3399  llvm::shouldOptimizeForSize(&MF, PSI, &MBFI->getMBFI());
3400  if (OptForSize)
3401  TailDupSize = 1;
3402  bool PreRegAlloc = false;
3403  TailDup.initMF(MF, PreRegAlloc, MBPI, MBFI.get(), PSI,
3404  /* LayoutMode */ true, TailDupSize);
3405  precomputeTriangleChains();
3406  }
3407 
3408  buildCFGChains();
3409 
3410  // Changing the layout can create new tail merging opportunities.
3411  // TailMerge can create jump into if branches that make CFG irreducible for
3412  // HW that requires structured CFG.
3413  bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
3414  PassConfig->getEnableTailMerge() &&
3416  // No tail merging opportunities if the block number is less than four.
3417  if (MF.size() > 3 && EnableTailMerge) {
3418  unsigned TailMergeSize = TailDupSize + 1;
3419  BranchFolder BF(/*DefaultEnableTailMerge=*/true, /*CommonHoist=*/false,
3420  *MBFI, *MBPI, PSI, TailMergeSize);
3421 
3422  if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(), MLI,
3423  /*AfterPlacement=*/true)) {
3424  // Redo the layout if tail merging creates/removes/moves blocks.
3425  BlockToChain.clear();
3426  ComputedEdges.clear();
3427  // Must redo the post-dominator tree if blocks were changed.
3428  if (MPDT)
3429  MPDT->runOnMachineFunction(MF);
3430  ChainAllocator.DestroyAll();
3431  buildCFGChains();
3432  }
3433  }
3434 
3435  // Apply a post-processing optimizing block placement.
3436  if (MF.size() >= 3 && EnableExtTspBlockPlacement &&
3438  // Find a new placement and modify the layout of the blocks in the function.
3439  applyExtTsp();
3440 
3441  // Re-create CFG chain so that we can optimizeBranches and alignBlocks.
3442  createCFGChainExtTsp();
3443  }
3444 
3445  optimizeBranches();
3446  alignBlocks();
3447 
3448  BlockToChain.clear();
3449  ComputedEdges.clear();
3450  ChainAllocator.DestroyAll();
3451 
3452  bool HasMaxBytesOverride =
3453  MaxBytesForAlignmentOverride.getNumOccurrences() > 0;
3454 
3455  if (AlignAllBlock)
3456  // Align all of the blocks in the function to a specific alignment.
3457  for (MachineBasicBlock &MBB : MF) {
3458  if (HasMaxBytesOverride)
3461  else
3462  MBB.setAlignment(Align(1ULL << AlignAllBlock));
3463  }
3464  else if (AlignAllNonFallThruBlocks) {
3465  // Align all of the blocks that have no fall-through predecessors to a
3466  // specific alignment.
3467  for (auto MBI = std::next(MF.begin()), MBE = MF.end(); MBI != MBE; ++MBI) {
3468  auto LayoutPred = std::prev(MBI);
3469  if (!LayoutPred->isSuccessor(&*MBI)) {
3470  if (HasMaxBytesOverride)
3471  MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks),
3473  else
3474  MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks));
3475  }
3476  }
3477  }
3479  (ViewBlockFreqFuncName.empty() ||
3480  F->getFunction().getName().equals(ViewBlockFreqFuncName))) {
3482  MF.RenumberBlocks();
3483  MBFI->view("MBP." + MF.getName(), false);
3484  }
3485 
3486  // We always return true as we have no way to track whether the final order
3487  // differs from the original order.
3488  return true;
3489 }
3490 
3491 void MachineBlockPlacement::applyExtTsp() {
3492  // Prepare data; blocks are indexed by their index in the current ordering.
3494  BlockIndex.reserve(F->size());
3495  std::vector<const MachineBasicBlock *> CurrentBlockOrder;
3496  CurrentBlockOrder.reserve(F->size());
3497  size_t NumBlocks = 0;
3498  for (const MachineBasicBlock &MBB : *F) {
3499  BlockIndex[&MBB] = NumBlocks++;
3500  CurrentBlockOrder.push_back(&MBB);
3501  }
3502 
3503  auto BlockSizes = std::vector<uint64_t>(F->size());
3504  auto BlockCounts = std::vector<uint64_t>(F->size());
3505  std::vector<EdgeCountT> JumpCounts;
3506  for (MachineBasicBlock &MBB : *F) {
3507  // Getting the block frequency.
3508  BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
3509  BlockCounts[BlockIndex[&MBB]] = BlockFreq.getFrequency();
3510  // Getting the block size:
3511  // - approximate the size of an instruction by 4 bytes, and
3512  // - ignore debug instructions.
3513  // Note: getting the exact size of each block is target-dependent and can be
3514  // done by extending the interface of MCCodeEmitter. Experimentally we do
3515  // not see a perf improvement with the exact block sizes.
3516  auto NonDbgInsts =
3518  int NumInsts = std::distance(NonDbgInsts.begin(), NonDbgInsts.end());
3519  BlockSizes[BlockIndex[&MBB]] = 4 * NumInsts;
3520  // Getting jump frequencies.
3521  for (MachineBasicBlock *Succ : MBB.successors()) {
3522  auto EP = MBPI->getEdgeProbability(&MBB, Succ);
3523  BlockFrequency JumpFreq = BlockFreq * EP;
3524  auto Jump = std::make_pair(BlockIndex[&MBB], BlockIndex[Succ]);
3525  JumpCounts.push_back(std::make_pair(Jump, JumpFreq.getFrequency()));
3526  }
3527  }
3528 
3529  LLVM_DEBUG(dbgs() << "Applying ext-tsp layout for |V| = " << F->size()
3530  << " with profile = " << F->getFunction().hasProfileData()
3531  << " (" << F->getName().str() << ")"
3532  << "\n");
3533  LLVM_DEBUG(
3534  dbgs() << format(" original layout score: %0.2f\n",
3535  calcExtTspScore(BlockSizes, BlockCounts, JumpCounts)));
3536 
3537  // Run the layout algorithm.
3538  auto NewOrder = applyExtTspLayout(BlockSizes, BlockCounts, JumpCounts);
3539  std::vector<const MachineBasicBlock *> NewBlockOrder;
3540  NewBlockOrder.reserve(F->size());
3541  for (uint64_t Node : NewOrder) {
3542  NewBlockOrder.push_back(CurrentBlockOrder[Node]);
3543  }
3544  LLVM_DEBUG(dbgs() << format(" optimized layout score: %0.2f\n",
3545  calcExtTspScore(NewOrder, BlockSizes, BlockCounts,
3546  JumpCounts)));
3547 
3548  // Assign new block order.
3549  assignBlockOrder(NewBlockOrder);
3550 }
3551 
3552 void MachineBlockPlacement::assignBlockOrder(
3553  const std::vector<const MachineBasicBlock *> &NewBlockOrder) {
3554  assert(F->size() == NewBlockOrder.size() && "Incorrect size of block order");
3555  F->RenumberBlocks();
3556 
3557  bool HasChanges = false;
3558  for (size_t I = 0; I < NewBlockOrder.size(); I++) {
3559  if (NewBlockOrder[I] != F->getBlockNumbered(I)) {
3560  HasChanges = true;
3561  break;
3562  }
3563  }
3564  // Stop early if the new block order is identical to the existing one.
3565  if (!HasChanges)
3566  return;
3567 
3568  SmallVector<MachineBasicBlock *, 4> PrevFallThroughs(F->getNumBlockIDs());
3569  for (auto &MBB : *F) {
3570  PrevFallThroughs[MBB.getNumber()] = MBB.getFallThrough();
3571  }
3572 
3573  // Sort basic blocks in the function according to the computed order.
3575  for (const MachineBasicBlock *MBB : NewBlockOrder) {
3576  NewIndex[MBB] = NewIndex.size();
3577  }
3578  F->sort([&](MachineBasicBlock &L, MachineBasicBlock &R) {
3579  return NewIndex[&L] < NewIndex[&R];
3580  });
3581 
3582  // Update basic block branches by inserting explicit fallthrough branches
3583  // when required and re-optimize branches when possible.
3584  const TargetInstrInfo *TII = F->getSubtarget().getInstrInfo();
3586  for (auto &MBB : *F) {
3587  MachineFunction::iterator NextMBB = std::next(MBB.getIterator());
3589  auto *FTMBB = PrevFallThroughs[MBB.getNumber()];
3590  // If this block had a fallthrough before we need an explicit unconditional
3591  // branch to that block if the fallthrough block is not adjacent to the
3592  // block in the new order.
3593  if (FTMBB && (NextMBB == EndIt || &*NextMBB != FTMBB)) {
3594  TII->insertUnconditionalBranch(MBB, FTMBB, MBB.findBranchDebugLoc());
3595  }
3596 
3597  // It might be possible to optimize branches by flipping the condition.
3598  Cond.clear();
3599  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
3600  if (TII->analyzeBranch(MBB, TBB, FBB, Cond))
3601  continue;
3602  MBB.updateTerminator(FTMBB);
3603  }
3604 
3605 #ifndef NDEBUG
3606  // Make sure we correctly constructed all branches.
3607  F->verify(this, "After optimized block reordering");
3608 #endif
3609 }
3610 
3611 void MachineBlockPlacement::createCFGChainExtTsp() {
3612  BlockToChain.clear();
3613  ComputedEdges.clear();
3614  ChainAllocator.DestroyAll();
3615 
3616  MachineBasicBlock *HeadBB = &F->front();
3617  BlockChain *FunctionChain =
3618  new (ChainAllocator.Allocate()) BlockChain(BlockToChain, HeadBB);
3619 
3620  for (MachineBasicBlock &MBB : *F) {
3621  if (HeadBB == &MBB)
3622  continue; // Ignore head of the chain
3623  FunctionChain->merge(&MBB, nullptr);
3624  }
3625 }
3626 
3627 namespace {
3628 
3629 /// A pass to compute block placement statistics.
3630 ///
3631 /// A separate pass to compute interesting statistics for evaluating block
3632 /// placement. This is separate from the actual placement pass so that they can
3633 /// be computed in the absence of any placement transformations or when using
3634 /// alternative placement strategies.
3635 class MachineBlockPlacementStats : public MachineFunctionPass {
3636  /// A handle to the branch probability pass.
3637  const MachineBranchProbabilityInfo *MBPI;
3638 
3639  /// A handle to the function-wide block frequency pass.
3640  const MachineBlockFrequencyInfo *MBFI;
3641 
3642 public:
3643  static char ID; // Pass identification, replacement for typeid
3644 
3645  MachineBlockPlacementStats() : MachineFunctionPass(ID) {
3647  }
3648 
3649  bool runOnMachineFunction(MachineFunction &F) override;
3650 
3651  void getAnalysisUsage(AnalysisUsage &AU) const override {
3654  AU.setPreservesAll();
3656  }
3657 };
3658 
3659 } // end anonymous namespace
3660 
3662 
3664 
3665 INITIALIZE_PASS_BEGIN(MachineBlockPlacementStats, "block-placement-stats",
3666  "Basic Block Placement Stats", false, false)
3669 INITIALIZE_PASS_END(MachineBlockPlacementStats, "block-placement-stats",
3670  "Basic Block Placement Stats", false, false)
3671 
3672 bool MachineBlockPlacementStats::runOnMachineFunction(MachineFunction &F) {
3673  // Check for single-block functions and skip them.
3674  if (std::next(F.begin()) == F.end())
3675  return false;
3676 
3677  if (!isFunctionInPrintList(F.getName()))
3678  return false;
3679 
3680  MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
3681  MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
3682 
3683  for (MachineBasicBlock &MBB : F) {
3684  BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
3685  Statistic &NumBranches =
3686  (MBB.succ_size() > 1) ? NumCondBranches : NumUncondBranches;
3687  Statistic &BranchTakenFreq =
3688  (MBB.succ_size() > 1) ? CondBranchTakenFreq : UncondBranchTakenFreq;
3689  for (MachineBasicBlock *Succ : MBB.successors()) {
3690  // Skip if this successor is a fallthrough.
3691  if (MBB.isLayoutSuccessor(Succ))
3692  continue;
3693 
3694  BlockFrequency EdgeFreq =
3695  BlockFreq * MBPI->getEdgeProbability(&MBB, Succ);
3696  ++NumBranches;
3697  BranchTakenFreq += EdgeFreq.getFrequency();
3698  }
3699  }
3700 
3701  return false;
3702 }
llvm::TargetMachine::requiresStructuredCFG
bool requiresStructuredCFG() const
Definition: TargetMachine.h:215
llvm::BranchFolder
Definition: BranchFolding.h:31
i
i
Definition: README.txt:29
block
we get the following basic block
Definition: README_ALTIVEC.txt:95
llvm::MachineBasicBlock::succ_size
unsigned succ_size() const
Definition: MachineBasicBlock.h:381
llvm::MachineBasicBlock::pred_begin
pred_iterator pred_begin()
Definition: MachineBasicBlock.h:353
MI
IRTranslator LLVM IR MI
Definition: IRTranslator.cpp:108
LLVM_DUMP_METHOD
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:492
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
llvm::SmallVectorImpl::erase
iterator erase(const_iterator CI)
Definition: SmallVector.h:741
llvm::TargetLoweringBase
This base class for TargetLowering contains the SelectionDAG-independent parts that can be used from ...
Definition: TargetLowering.h:191
llvm::TailDuplicator::initMF
void initMF(MachineFunction &MF, bool PreRegAlloc, const MachineBranchProbabilityInfo *MBPI, MBFIWrapper *MBFI, ProfileSummaryInfo *PSI, bool LayoutMode, unsigned TailDupSize=0)
Prepare to run on a specific machine function.
Definition: TailDuplicator.cpp:79
llvm::MachineLoopInfo::getLoopFor
MachineLoop * getLoopFor(const MachineBasicBlock *BB) const
Return the innermost loop that BB lives in.
Definition: MachineLoopInfo.h:126
llvm::AArch64PACKey::ID
ID
Definition: AArch64BaseInfo.h:818
llvm::HexagonInstrInfo::removeBranch
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
Remove the branching code at the end of the specific MBB.
Definition: HexagonInstrInfo.cpp:603
RenumberBlocksBeforeView
static cl::opt< bool > RenumberBlocksBeforeView("renumber-blocks-before-view", cl::desc("If true, basic blocks are re-numbered before MBP layout is printed " "into a dot graph. Only used when a function is being printed."), cl::init(false), cl::Hidden)
llvm::DenseMapBase< DenseMap< KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::lookup
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:197
llvm::BranchProbability::getNumerator
uint32_t getNumerator() const
Definition: BranchProbability.h:65
P
This currently compiles esp xmm0 movsd esp eax eax esp ret We should use not the dag combiner This is because dagcombine2 needs to be able to see through the X86ISD::Wrapper which DAGCombine can t really do The code for turning x load into a single vector load is target independent and should be moved to the dag combiner The code for turning x load into a vector load can only handle a direct load from a global or a direct load from the stack It should be generalized to handle any load from P
Definition: README-SSE.txt:411
Pass.h
llvm::LoopBase::contains
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
Definition: LoopInfo.h:139
llvm::TargetSubtargetInfo::getInstrInfo
virtual const TargetInstrInfo * getInstrInfo() const
Definition: TargetSubtargetInfo.h:95
llvm::HexagonInstrInfo::analyzeBranch
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const override
Analyze the branching code at the end of MBB, returning true if it cannot be understood (e....
Definition: HexagonInstrInfo.cpp:433
llvm::raw_string_ostream
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:629
getAdjustedProbability
static BranchProbability getAdjustedProbability(BranchProbability OrigProb, BranchProbability AdjustedSumProb)
The helper function returns the branch probability that is adjusted or normalized over the new total ...
Definition: MachineBlockPlacement.cpp:750
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1199
Statistic.h
llvm::MachineFunction::end
iterator end()
Definition: MachineFunction.h:856
llvm::ProfileSummaryInfo::getOrCompHotCountThreshold
uint64_t getOrCompHotCountThreshold() const
Returns HotCountThreshold if set.
Definition: ProfileSummaryInfo.cpp:319
llvm::printMBBReference
Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
Definition: MachineBasicBlock.cpp:117
Allocator.h
llvm::erase_if
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition: STLExtras.h:1997
llvm::MachineBasicBlock::isEntryBlock
bool isEntryBlock() const
Returns true if this is the entry block of the function.
Definition: MachineBasicBlock.cpp:287
llvm::MachineFunctionPass
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
Definition: MachineFunctionPass.h:30
MachineSizeOpts.h
llvm::initializeMachineBlockPlacementPass
void initializeMachineBlockPlacementPass(PassRegistry &)
MachineBasicBlock.h
llvm::TargetLoweringBase::getPrefLoopAlignment
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
Definition: TargetLoweringBase.cpp:2018
llvm::TargetSubtargetInfo::getRegisterInfo
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
Definition: TargetSubtargetInfo.h:127
EnableExtTspBlockPlacement
cl::opt< bool > EnableExtTspBlockPlacement
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:140
llvm::DenseMapBase< DenseMap< KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::erase
bool erase(const KeyT &Val)
Definition: DenseMap.h:302
AlignAllBlock
static cl::opt< unsigned > AlignAllBlock("align-all-blocks", cl::desc("Force the alignment of all blocks in the function in log2 format " "(e.g 4 means align on 16B boundaries)."), cl::init(0), cl::Hidden)
llvm::DenseMapBase< DenseMap< KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::begin
iterator begin()
Definition: DenseMap.h:75
InstrCount
static unsigned InstrCount
Definition: DFAPacketizer.cpp:53
llvm::SpecificBumpPtrAllocator
A BumpPtrAllocator that allows only elements of a specific type to be allocated.
Definition: Allocator.h:382
llvm::BlockFrequency::getMaxFrequency
static uint64_t getMaxFrequency()
Returns the maximum possible frequency, the saturation value.
Definition: BlockFrequency.h:30
BlockFrequency.h
DenseMap.h
llvm::BranchProbability::getZero
static BranchProbability getZero()
Definition: BranchProbability.h:49
TargetInstrInfo.h
llvm::sys::path::end
const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:235
llvm::sys::path::begin
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:226
llvm::TailDuplicator::shouldTailDuplicate
bool shouldTailDuplicate(bool IsSimple, MachineBasicBlock &TailBB)
Determine if it is profitable to duplicate this block.
Definition: TailDuplicator.cpp:551
getLayoutSuccessorProbThreshold
static BranchProbability getLayoutSuccessorProbThreshold(const MachineBasicBlock *BB)
Definition: MachineBlockPlacement.cpp:1379
MBFIWrapper.h
llvm::DenseMapBase< DenseMap< KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::count
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:145
llvm::SmallPtrSet
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:450
llvm::max
Expected< ExpressionValue > max(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:337
llvm::dump
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Definition: SparseBitVector.h:877
STLExtras.h
llvm::MachineBasicBlock::back
MachineInstr & back()
Definition: MachineBasicBlock.h:285
llvm::applyExtTspLayout
std::vector< uint64_t > applyExtTspLayout(const std::vector< uint64_t > &NodeSizes, const std::vector< uint64_t > &NodeCounts, const std::vector< EdgeCountT > &EdgeCounts)
Find a layout of nodes (basic blocks) of a given CFG optimizing jump locality and thus processor I-ca...
llvm::TargetLoweringBase::alignLoopsWithOptSize
virtual bool alignLoopsWithOptSize() const
Should loops be aligned even when the function is marked OptSize (but not MinSize).
Definition: TargetLowering.h:1870
llvm::LoopBase::getNumBlocks
unsigned getNumBlocks() const
Get the number of blocks in this loop in constant time.
Definition: LoopInfo.h:202
llvm::MachineBlockFrequencyInfo::getBlockFreq
BlockFrequency getBlockFreq(const MachineBasicBlock *MBB) const
getblockFreq - Return block frequency.
Definition: MachineBlockFrequencyInfo.cpp:230
llvm::MachineFunctionPass::getAnalysisUsage
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
Definition: MachineFunctionPass.cpp:167
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
F
#define F(x, y, z)
Definition: MD5.cpp:55
llvm::RISCVFenceField::R
@ R
Definition: RISCVBaseInfo.h:265
llvm::MachineLoopInfo
Definition: MachineLoopInfo.h:89
llvm::MachineBasicBlock::succ_rbegin
succ_reverse_iterator succ_rbegin()
Definition: MachineBasicBlock.h:373
llvm::LoopBase::block_end
block_iterator block_end() const
Definition: LoopInfo.h:194
llvm::MachineBasicBlock::dump
void dump() const
Definition: MachineBasicBlock.cpp:292
llvm::TailDuplicator::isSimpleBB
static bool isSimpleBB(MachineBasicBlock *TailBB)
True if this BB has only one unconditional jump.
Definition: TailDuplicator.cpp:681
llvm::sys::path::append
void append(SmallVectorImpl< char > &path, const Twine &a, const Twine &b="", const Twine &c="", const Twine &d="")
Append to path.
Definition: Path.cpp:456
TailDupPlacementThreshold
static cl::opt< unsigned > TailDupPlacementThreshold("tail-dup-placement-threshold", cl::desc("Instruction cutoff for tail duplication during layout. " "Tail merging during layout is forced to have a threshold " "that won't conflict."), cl::init(2), cl::Hidden)
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
CommandLine.h
llvm::MachineBasicBlock::pred_size
unsigned pred_size() const
Definition: MachineBasicBlock.h:365
TargetLowering.h
llvm::shouldOptimizeForSize
bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
Definition: MachineSizeOpts.cpp:183
BlockFrequencyInfoImpl.h
llvm::TargetInstrInfo
TargetInstrInfo - Interface to description of machine instruction set.
Definition: TargetInstrInfo.h:98
MachineLoopInfo.h
TargetMachine.h
llvm::PassRegistry::getPassRegistry
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Definition: PassRegistry.cpp:24
llvm::MutableArrayRef
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:28
llvm::MachineBlockFrequencyInfo
MachineBlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate machine basic b...
Definition: MachineBlockFrequencyInfo.h:33
llvm::MachineBranchProbabilityInfo
Definition: MachineBranchProbabilityInfo.h:22
InlinePriorityMode::Cost
@ Cost
E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
llvm::ARM_PROC::A
@ A
Definition: ARMBaseInfo.h:34
llvm::MachineBasicBlock::isSuccessor
bool isSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB is a successor of this block.
Definition: MachineBasicBlock.cpp:924
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
llvm::ms_demangle::QualifierMangleMode::Result
@ Result
TBB
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
Definition: RISCVRedundantCopyElimination.cpp:76
llvm::TargetPassConfig::getEnableTailMerge
bool getEnableTailMerge() const
Definition: TargetPassConfig.h:176
UINT64_MAX
#define UINT64_MAX
Definition: DataTypes.h:77
llvm::TargetLoweringBase::getMaxPermittedBytesForAlignment
virtual unsigned getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const
Return the maximum amount of bytes allowed to be emitted when padding for alignment.
Definition: TargetLoweringBase.cpp:2024
false
Definition: StackSlotColoring.cpp:141
TII
const HexagonInstrInfo * TII
Definition: HexagonCopyToCombine.cpp:125
merge
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
Definition: LoopDeletion.cpp:53
llvm::MachineFunction::size
unsigned size() const
Definition: MachineFunction.h:864
B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
llvm::LoopBase::getBlocks
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:188
llvm::StaticLikelyProb
cl::opt< unsigned > StaticLikelyProb
llvm::STATISTIC
STATISTIC(NumFunctions, "Total number of functions")
AlignAllNonFallThruBlocks
static cl::opt< unsigned > AlignAllNonFallThruBlocks("align-all-nofallthru-blocks", cl::desc("Force the alignment of all blocks that have no fall-through " "predecessors (i.e. don't add nops that are executed). In log2 " "format (e.g 4 means align on 16B boundaries)."), cl::init(0), cl::Hidden)
llvm::HexagonInstrInfo::insertBranch
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
Insert branch code into the end of the specified MachineBasicBlock.
Definition: HexagonInstrInfo.cpp:626
llvm::raw_ostream::flush
void flush()
Definition: raw_ostream.h:186
llvm::MachineFunction::begin
iterator begin()
Definition: MachineFunction.h:854
llvm::ProfileLikelyProb
cl::opt< unsigned > ProfileLikelyProb
DebugLoc.h
SmallPtrSet.h
JumpInstCost
static cl::opt< unsigned > JumpInstCost("jump-inst-cost", cl::desc("Cost of jump instructions."), cl::init(1), cl::Hidden)
BranchFolding.h
TailDupPlacement
static cl::opt< bool > TailDupPlacement("tail-dup-placement", cl::desc("Perform tail duplication during placement. " "Creates more fallthrough opportunites in " "outline branches."), cl::init(true), cl::Hidden)
Align
uint64_t Align
Definition: ELFObjHandler.cpp:82
CodeLayout.h
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
INITIALIZE_PASS_BEGIN
INITIALIZE_PASS_BEGIN(MachineBlockPlacement, DEBUG_TYPE, "Branch Probability Basic Block Placement", false, false) INITIALIZE_PASS_END(MachineBlockPlacement
PrintPasses.h
llvm::calcExtTspScore
double calcExtTspScore(const std::vector< uint64_t > &Order, const std::vector< uint64_t > &NodeSizes, const std::vector< uint64_t > &NodeCounts, const std::vector< EdgeCountT > &EdgeCounts)
Estimate the "quality" of a given node order in CFG.
llvm::CallingConv::ID
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
llvm::erase_value
void erase_value(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2005
BranchProbability.h
llvm::MachineBasicBlock
Definition: MachineBasicBlock.h:94
INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:58
llvm::ProfileSummaryInfo
Analysis providing profile information.
Definition: ProfileSummaryInfo.h:40
BranchFoldPlacement
static cl::opt< bool > BranchFoldPlacement("branch-fold-placement", cl::desc("Perform branch folding during placement. " "Reduces code size."), cl::init(true), cl::Hidden)
llvm::MachineBasicBlock::setAlignment
void setAlignment(Align A)
Set alignment of the basic block.
Definition: MachineBasicBlock.h:559
llvm::LoopBase::block_begin
block_iterator block_begin() const
Definition: LoopInfo.h:193
llvm::function_ref
An efficient, type-erasing, non-owning reference to a callable.
Definition: STLFunctionalExtras.h:36
llvm::TargetPassConfig
Target-Independent Code Generator Pass Configuration Options.
Definition: TargetPassConfig.h:84
llvm::SmallPtrSetImplBase::empty
bool empty() const
Definition: SmallPtrSet.h:92
llvm::MachineFunction::getSubtarget
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Definition: MachineFunction.h:657
llvm::cl::opt
Definition: CommandLine.h:1412
TriangleChainCount
static cl::opt< unsigned > TriangleChainCount("triangle-chain-count", cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the " "triangle tail duplication heuristic to kick in. 0 to disable."), cl::init(2), cl::Hidden)
TailDupProfilePercentThreshold
static cl::opt< unsigned > TailDupProfilePercentThreshold("tail-dup-profile-percent-threshold", cl::desc("If profile count information is used in tail duplication cost " "model, the gained fall through number from tail duplication " "should be at least this percent of hot count."), cl::init(50), cl::Hidden)
llvm::MachineLoop
Definition: MachineLoopInfo.h:44
llvm::BlockFrequency
Definition: BlockFrequency.h:23
TailMergeSize
static cl::opt< unsigned > TailMergeSize("tail-merge-size", cl::desc("Min number of instructions to consider tail merging"), cl::init(3), cl::Hidden)
llvm::DenseMapBase< DenseMap< KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::clear
void clear()
Definition: DenseMap.h:110
llvm::count
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1895
MaxBytesForAlignmentOverride
static cl::opt< unsigned > MaxBytesForAlignmentOverride("max-bytes-for-alignment", cl::desc("Forces the maximum bytes allowed to be emitted when padding for " "alignment"), cl::init(0), cl::Hidden)
llvm::MachineInstr
Representation of each machine instruction.
Definition: MachineInstr.h:66
uint64_t
ProfileSummaryInfo.h
llvm::ViewBlockLayoutWithBFI
cl::opt< GVDAGType > ViewBlockLayoutWithBFI("view-block-layout-with-bfi", cl::Hidden, cl::desc("Pop up a window to show a dag displaying MBP layout and associated " "block frequencies of the CFG."), cl::values(clEnumValN(GVDT_None, "none", "do not display graphs."), clEnumValN(GVDT_Fraction, "fraction", "display a graph using the " "fractional block frequency representation."), clEnumValN(GVDT_Integer, "integer", "display a graph using the raw " "integer fractional block frequency representation."), clEnumValN(GVDT_Count, "count", "display a graph using the real " "profile count if available.")))
Definition: MachineBlockPlacement.cpp:226
llvm::find
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1754
ExitBlockBias
static cl::opt< unsigned > ExitBlockBias("block-placement-exit-block-bias", cl::desc("Block frequency percentage a loop exit block needs " "over the original exit to be considered the new exit."), cl::init(0), cl::Hidden)
INITIALIZE_PASS_DEPENDENCY
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
move
compiles ldr LCPI1_0 ldr ldr mov lsr tst moveq r1 ldr LCPI1_1 and r0 bx lr It would be better to do something like to fold the shift into the conditional move
Definition: README.txt:546
llvm::DenseMap
Definition: DenseMap.h:714
llvm::TailDuplicator::tailDuplicateAndUpdate
bool tailDuplicateAndUpdate(bool IsSimple, MachineBasicBlock *MBB, MachineBasicBlock *ForcedLayoutPred, SmallVectorImpl< MachineBasicBlock * > *DuplicatedPreds=nullptr, function_ref< void(MachineBasicBlock *)> *RemovalCallback=nullptr, SmallVectorImpl< MachineBasicBlock * > *CandidatePtr=nullptr)
Tail duplicate a single basic block into its predecessors, and then clean up.
Definition: TailDuplicator.cpp:155
I
#define I(x, y, z)
Definition: MD5.cpp:58
llvm::BlockFrequency::getFrequency
uint64_t getFrequency() const
Returns the frequency as a fixpoint number scaled by the entry frequency.
Definition: BlockFrequency.h:34
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:447
greaterWithBias
static bool greaterWithBias(BlockFrequency A, BlockFrequency B, uint64_t EntryFreq)
Compare 2 BlockFrequency's with a small penalty for A.
Definition: MachineBlockPlacement.cpp:798
llvm::ProfileSummaryInfoWrapperPass
An analysis pass based on legacy pass manager to deliver ProfileSummaryInfo.
Definition: ProfileSummaryInfo.h:194
ArrayRef.h
llvm::isFunctionInPrintList
bool isFunctionInPrintList(StringRef FunctionName)
Definition: PrintPasses.cpp:151
TargetPassConfig.h
MachineFunctionPass.h
llvm::DenseMapBase< DenseMap< KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::find
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:150
llvm::SmallVectorImpl::const_iterator
typename SuperClass::const_iterator const_iterator
Definition: SmallVector.h:582
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::MachineBasicBlock::canFallThrough
bool canFallThrough()
Return true if the block can implicitly transfer control to the block after it by falling off the end...
Definition: MachineBasicBlock.cpp:981
LoopToColdBlockRatio
static cl::opt< unsigned > LoopToColdBlockRatio("loop-to-cold-block-ratio", cl::desc("Outline loop blocks from loop chain if (frequency of loop) / " "(frequency of block) is greater than this ratio"), cl::init(5), cl::Hidden)
MachineBranchProbabilityInfo.h
llvm::MachineBasicBlock::succ_begin
succ_iterator succ_begin()
Definition: MachineBasicBlock.h:369
std::swap
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:853
llvm::MachineBasicBlock::getParent
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
Definition: MachineBasicBlock.h:261
hasSameSuccessors
static bool hasSameSuccessors(MachineBasicBlock &BB, SmallPtrSetImpl< const MachineBasicBlock * > &Successors)
Check if BB has exactly the successors in Successors.
Definition: MachineBlockPlacement.cpp:765
MachinePostDominators.h
llvm::MachineBasicBlock::predecessors
iterator_range< pred_iterator > predecessors()
Definition: MachineBasicBlock.h:386
llvm::MachineBranchProbabilityInfo::getEdgeProbability
BranchProbability getEdgeProbability(const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const
Definition: MachineBranchProbabilityInfo.cpp:56
llvm::DenseMapBase< DenseMap< KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::empty
bool empty() const
Definition: DenseMap.h:98
llvm::CodeGenOpt::Aggressive
@ Aggressive
Definition: CodeGen.h:56
llvm::MachineBasicBlock::instr_begin
instr_iterator instr_begin()
Definition: MachineBasicBlock.h:289
llvm::MachineBlockPlacementStatsID
char & MachineBlockPlacementStatsID
MachineBlockPlacementStats - This pass collects statistics about the basic block placement using bran...
Definition: MachineBlockPlacement.cpp:3663
MisfetchCost
static cl::opt< unsigned > MisfetchCost("misfetch-cost", cl::desc("Cost that models the probabilistic risk of an instruction " "misfetch due to a jump comparing to falling through, whose cost " "is zero."), cl::init(1), cl::Hidden)
llvm::SmallPtrSetImpl::count
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:383
llvm::MachineBasicBlock::instr_end
instr_iterator instr_end()
Definition: MachineBasicBlock.h:291
llvm::MachineFunction
Definition: MachineFunction.h:257
llvm::BranchProbability::getOne
static BranchProbability getOne()
Definition: BranchProbability.h:50
llvm::MachineBasicBlock::succ_empty
bool succ_empty() const
Definition: MachineBasicBlock.h:384
llvm::sys::fs::remove
std::error_code remove(const Twine &path, bool IgnoreNonExisting=true)
Remove path.
llvm::MachineBasicBlock::getNumber
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFuncti...
Definition: MachineBasicBlock.h:1115
llvm::min
Expected< ExpressionValue > min(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:357
Cond
SmallVector< MachineOperand, 4 > Cond
Definition: BasicBlockSections.cpp:138
llvm::MachineBasicBlock::successors
iterator_range< succ_iterator > successors()
Definition: MachineBasicBlock.h:392
llvm::MachineBasicBlock::isEHPad
bool isEHPad() const
Returns true if the block is a landing pad.
Definition: MachineBasicBlock.h:576
llvm::MachineBasicBlock::findBranchDebugLoc
DebugLoc findBranchDebugLoc()
Find and return the merged DebugLoc of the branch instructions of the block.
Definition: MachineBasicBlock.cpp:1427
if
if(llvm_vc STREQUAL "") set(fake_version_inc "$
Definition: CMakeLists.txt:14
uint32_t
llvm::BranchProbability
Definition: BranchProbability.h:30
Compiler.h
llvm::ilist_node_impl::getIterator
self_iterator getIterator()
Definition: ilist_node.h:82
TargetSubtargetInfo.h
llvm::MachineBlockPlacementID
char & MachineBlockPlacementID
MachineBlockPlacement - This pass places basic blocks based on branch probabilities.
Definition: MachineBlockPlacement.cpp:622
PreciseRotationCost
static cl::opt< bool > PreciseRotationCost("precise-rotation-cost", cl::desc("Model the cost of loop rotation more " "precisely by using profile data."), cl::init(false), cl::Hidden)
llvm::format
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:124
llvm::TrackingStatistic
Definition: Statistic.h:50
for
this could be done in SelectionDAGISel along with other special for
Definition: README.txt:104
llvm::DenseMapBase< DenseMap< KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::insert
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:207
llvm::MachinePostDominatorTree
MachinePostDominatorTree - an analysis pass wrapper for DominatorTree used to compute the post-domina...
Definition: MachinePostDominators.h:27
llvm::SpecificBumpPtrAllocator::Allocate
T * Allocate(size_t num=1)
Allocate space for an array of objects without constructing them.
Definition: Allocator.h:432
llvm::SmallPtrSetImplBase::size
size_type size() const
Definition: SmallPtrSet.h:93
llvm::Function::hasOptSize
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:645
MBB
MachineBasicBlock & MBB
Definition: AArch64SLSHardening.cpp:74
llvm::MachinePostDominatorTree::dominates
bool dominates(const MachineDomTreeNode *A, const MachineDomTreeNode *B) const
Definition: MachinePostDominators.h:54
getBlockName
Branch Probability Basic Block static false std::string getBlockName(const MachineBasicBlock *BB)
Helper to print the name of a MBB.
Definition: MachineBlockPlacement.cpp:638
stats
Definition: MCAssembler.cpp:53
llvm::stable_sort
void stable_sort(R &&Range)
Definition: STLExtras.h:1947
llvm::MachineFunction::getFunction
Function & getFunction()
Return the LLVM function that this machine code represents.
Definition: MachineFunction.h:623
Stats
block placement Basic Block Placement Stats
Definition: MachineBlockPlacement.cpp:3670
llvm::AnalysisUsage::setPreservesAll
void setPreservesAll()
Set by analyses that do not transform their input at all.
Definition: PassAnalysisSupport.h:130
ForcePreciseRotationCost
static cl::opt< bool > ForcePreciseRotationCost("force-precise-rotation-cost", cl::desc("Force the use of precise cost " "loop rotation strategy."), cl::init(false), cl::Hidden)
llvm::MachineFunction::getTarget
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Definition: MachineFunction.h:653
rotate
The same transformation can work with an even modulo with the addition of a rotate
Definition: README.txt:680
llvm::DenseMapBase< DenseMap< KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::end
iterator end()
Definition: DenseMap.h:84
llvm::X86::FirstMacroFusionInstKind::Cmp
@ Cmp
llvm::ilist_iterator
Iterator for intrusive lists based on ilist_node.
Definition: ilist_iterator.h:57
llvm::MCID::Branch
@ Branch
Definition: MCInstrDesc.h:158
Function.h
llvm::LoopBase::getHeader
BlockT * getHeader() const
Definition: LoopInfo.h:105
llvm::DenseMapBase< DenseMap< KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::size
unsigned size() const
Definition: DenseMap.h:99
llvm::initializeMachineBlockPlacementStatsPass
void initializeMachineBlockPlacementStatsPass(PassRegistry &)
llvm::Function::hasProfileData
bool hasProfileData(bool IncludeSynthetic=false) const
Return true if the function is annotated with profile data.
Definition: Function.h:290
llvm::SmallVectorImpl::clear
void clear()
Definition: SmallVector.h:614
llvm::MachineBasicBlock::isLayoutSuccessor
bool isLayoutSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB will be emitted immediately after this block, such that if this bloc...
Definition: MachineBasicBlock.cpp:928
ForceLoopColdBlock
static cl::opt< bool > ForceLoopColdBlock("force-loop-cold-block", cl::desc("Force outlining cold blocks from loops."), cl::init(false), cl::Hidden)
llvm::BranchProbability::getCompl
BranchProbability getCompl() const
Definition: BranchProbability.h:69
llvm::MachineBasicBlock::getFallThrough
MachineBasicBlock * getFallThrough()
Return the fallthrough block if the block can implicitly transfer control to the block after it by fa...
Definition: MachineBasicBlock.cpp:937
llvm::MachinePostDominatorTree::runOnMachineFunction
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
Definition: MachinePostDominators.cpp:40
CodeGen.h
llvm::TailDuplicator::canTailDuplicate
bool canTailDuplicate(MachineBasicBlock *TailBB, MachineBasicBlock *PredBB)
Returns true if TailBB can successfully be duplicated into PredBB.
Definition: TailDuplicator.cpp:789
llvm::HexagonInstrInfo::reverseBranchCondition
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
Reverses the branch condition of the specified condition list, returning false on success and true if...
Definition: HexagonInstrInfo.cpp:1626
llvm::TargetSubtargetInfo::getTargetLowering
virtual const TargetLowering * getTargetLowering() const
Definition: TargetSubtargetInfo.h:99
Placement
Branch Probability Basic Block Placement
Definition: MachineBlockPlacement.cpp:632
SmallVector.h
llvm::TailDuplicator
Utility class to perform tail duplication.
Definition: TailDuplicator.h:39
llvm::GVDT_None
@ GVDT_None
Definition: BlockFrequencyInfoImpl.h:1806
DEBUG_TYPE
#define DEBUG_TYPE
Definition: MachineBlockPlacement.cpp:78
llvm::SmallVectorImpl::iterator
typename SuperClass::iterator iterator
Definition: SmallVector.h:581
llvm::DenseMapBase< DenseMap< KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::try_emplace
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&... Args)
Definition: DenseMap.h:222
TailDuplicator.h
llvm::DenseMapBase< DenseMap< KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::reserve
void reserve(size_type NumEntries)
Grow the densemap so that it can contain at least NumEntries items before resizing again.
Definition: DenseMap.h:103
TailDupPlacementPenalty
static cl::opt< unsigned > TailDupPlacementPenalty("tail-dup-placement-penalty", cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. " "Copying can increase fallthrough, but it also increases icache " "pressure. This parameter controls the penalty to account for that. " "Percent as integer."), cl::init(2), cl::Hidden)
llvm::SmallVectorImpl
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: APFloat.h:42
ApplyExtTspWithoutProfile
cl::opt< bool > ApplyExtTspWithoutProfile
llvm::reverse
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:485
llvm::SmallPtrSetImpl
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:344
llvm::SmallSetVector
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:307
BB
Common register allocation spilling lr str ldr sxth r3 ldr mla r4 can lr mov lr str ldr sxth r3 mla r4 and then merge mul and lr str ldr sxth r3 mla r4 It also increase the likelihood the store may become dead bb27 Successors according to LLVM BB
Definition: README.txt:39
llvm::ViewBlockFreqFuncName
cl::opt< std::string > ViewBlockFreqFuncName("view-bfi-func-name", cl::Hidden, cl::desc("The option to specify " "the name of the function " "whose CFG will be displayed."))
Definition: MachineBlockFrequencyInfo.cpp:66
llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition: PassAnalysisSupport.h:75
llvm::DebugLoc
A debug info location.
Definition: DebugLoc.h:33
llvm::cl::desc
Definition: CommandLine.h:413
llvm::MachineBasicBlock::updateTerminator
void updateTerminator(MachineBasicBlock *PreviousLayoutSuccessor)
Update the terminator instructions in block to account for changes to block layout which may have bee...
Definition: MachineBasicBlock.cpp:657
raw_ostream.h
TailDupPlacementAggressiveThreshold
static cl::opt< unsigned > TailDupPlacementAggressiveThreshold("tail-dup-placement-aggressive-threshold", cl::desc("Instruction cutoff for aggressive tail duplication during " "layout. Used at -O3. Tail merging during layout is forced to " "have a threshold that won't conflict."), cl::init(4), cl::Hidden)
MachineFunction.h
InitializePasses.h
MachineBlockFrequencyInfo.h
Debug.h
llvm::TargetPassConfig::getOptLevel
CodeGenOpt::Level getOptLevel() const
Definition: TargetPassConfig.cpp:645
llvm::instructionsWithoutDebug
auto instructionsWithoutDebug(IterT It, IterT End, bool SkipPseudoOp=true)
Construct a range iterator which begins at It and moves forwards until End is reached,...
Definition: MachineBasicBlock.h:1303
SetVector.h
llvm::SpecificBumpPtrAllocator::DestroyAll
void DestroyAll()
Call the destructor of each allocated object and deallocate all but the current slab and reset the cu...
Definition: Allocator.h:403
countMBBInstruction
static uint64_t countMBBInstruction(MachineBasicBlock *MBB)
Definition: MachineBlockPlacement.cpp:3150
llvm::SmallVectorImpl::emplace_back
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:941
llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:365