LLVM  13.0.0git
MachineBlockPlacement.cpp
Go to the documentation of this file.
1 //===- MachineBlockPlacement.cpp - Basic Block Code Layout optimization ---===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements basic block placement transformations using the CFG
10 // structure and branch probability estimates.
11 //
12 // The pass strives to preserve the structure of the CFG (that is, retain
13 // a topological ordering of basic blocks) in the absence of a *strong* signal
14 // to the contrary from probabilities. However, within the CFG structure, it
15 // attempts to choose an ordering which favors placing more likely sequences of
16 // blocks adjacent to each other.
17 //
18 // The algorithm works from the inner-most loop within a function outward, and
19 // at each stage walks through the basic blocks, trying to coalesce them into
20 // sequential chains where allowed by the CFG (or demanded by heavy
21 // probabilities). Finally, it walks the blocks in topological order, and the
22 // first time it reaches a chain of basic blocks, it schedules them in the
23 // function in-order.
24 //
25 //===----------------------------------------------------------------------===//
26 
27 #include "BranchFolding.h"
28 #include "llvm/ADT/ArrayRef.h"
29 #include "llvm/ADT/DenseMap.h"
30 #include "llvm/ADT/STLExtras.h"
31 #include "llvm/ADT/SetVector.h"
32 #include "llvm/ADT/SmallPtrSet.h"
33 #include "llvm/ADT/SmallVector.h"
34 #include "llvm/ADT/Statistic.h"
51 #include "llvm/IR/DebugLoc.h"
52 #include "llvm/IR/Function.h"
53 #include "llvm/InitializePasses.h"
54 #include "llvm/Pass.h"
55 #include "llvm/Support/Allocator.h"
58 #include "llvm/Support/CodeGen.h"
60 #include "llvm/Support/Compiler.h"
61 #include "llvm/Support/Debug.h"
64 #include <algorithm>
65 #include <cassert>
66 #include <cstdint>
67 #include <iterator>
68 #include <memory>
69 #include <string>
70 #include <tuple>
71 #include <utility>
72 #include <vector>
73 
74 using namespace llvm;
75 
76 #define DEBUG_TYPE "block-placement"
77 
78 STATISTIC(NumCondBranches, "Number of conditional branches");
79 STATISTIC(NumUncondBranches, "Number of unconditional branches");
80 STATISTIC(CondBranchTakenFreq,
81  "Potential frequency of taking conditional branches");
82 STATISTIC(UncondBranchTakenFreq,
83  "Potential frequency of taking unconditional branches");
84 
86  "align-all-blocks",
87  cl::desc("Force the alignment of all blocks in the function in log2 format "
88  "(e.g 4 means align on 16B boundaries)."),
89  cl::init(0), cl::Hidden);
90 
92  "align-all-nofallthru-blocks",
93  cl::desc("Force the alignment of all blocks that have no fall-through "
94  "predecessors (i.e. don't add nops that are executed). In log2 "
95  "format (e.g 4 means align on 16B boundaries)."),
96  cl::init(0), cl::Hidden);
97 
98 // FIXME: Find a good default for this flag and remove the flag.
100  "block-placement-exit-block-bias",
101  cl::desc("Block frequency percentage a loop exit block needs "
102  "over the original exit to be considered the new exit."),
103  cl::init(0), cl::Hidden);
104 
105 // Definition:
106 // - Outlining: placement of a basic block outside the chain or hot path.
107 
109  "loop-to-cold-block-ratio",
110  cl::desc("Outline loop blocks from loop chain if (frequency of loop) / "
111  "(frequency of block) is greater than this ratio"),
112  cl::init(5), cl::Hidden);
113 
115  "force-loop-cold-block",
116  cl::desc("Force outlining cold blocks from loops."),
117  cl::init(false), cl::Hidden);
118 
119 static cl::opt<bool>
120  PreciseRotationCost("precise-rotation-cost",
121  cl::desc("Model the cost of loop rotation more "
122  "precisely by using profile data."),
123  cl::init(false), cl::Hidden);
124 
125 static cl::opt<bool>
126  ForcePreciseRotationCost("force-precise-rotation-cost",
127  cl::desc("Force the use of precise cost "
128  "loop rotation strategy."),
129  cl::init(false), cl::Hidden);
130 
132  "misfetch-cost",
133  cl::desc("Cost that models the probabilistic risk of an instruction "
134  "misfetch due to a jump comparing to falling through, whose cost "
135  "is zero."),
136  cl::init(1), cl::Hidden);
137 
138 static cl::opt<unsigned> JumpInstCost("jump-inst-cost",
139  cl::desc("Cost of jump instructions."),
140  cl::init(1), cl::Hidden);
141 static cl::opt<bool>
142 TailDupPlacement("tail-dup-placement",
143  cl::desc("Perform tail duplication during placement. "
144  "Creates more fallthrough opportunites in "
145  "outline branches."),
146  cl::init(true), cl::Hidden);
147 
148 static cl::opt<bool>
149 BranchFoldPlacement("branch-fold-placement",
150  cl::desc("Perform branch folding during placement. "
151  "Reduces code size."),
152  cl::init(true), cl::Hidden);
153 
154 // Heuristic for tail duplication.
156  "tail-dup-placement-threshold",
157  cl::desc("Instruction cutoff for tail duplication during layout. "
158  "Tail merging during layout is forced to have a threshold "
159  "that won't conflict."), cl::init(2),
160  cl::Hidden);
161 
162 // Heuristic for aggressive tail duplication.
164  "tail-dup-placement-aggressive-threshold",
165  cl::desc("Instruction cutoff for aggressive tail duplication during "
166  "layout. Used at -O3. Tail merging during layout is forced to "
167  "have a threshold that won't conflict."), cl::init(4),
168  cl::Hidden);
169 
170 // Heuristic for tail duplication.
172  "tail-dup-placement-penalty",
173  cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. "
174  "Copying can increase fallthrough, but it also increases icache "
175  "pressure. This parameter controls the penalty to account for that. "
176  "Percent as integer."),
177  cl::init(2),
178  cl::Hidden);
179 
180 // Heuristic for tail duplication if profile count is used in cost model.
182  "tail-dup-profile-percent-threshold",
183  cl::desc("If profile count information is used in tail duplication cost "
184  "model, the gained fall through number from tail duplication "
185  "should be at least this percent of hot count."),
186  cl::init(50), cl::Hidden);
187 
188 // Heuristic for triangle chains.
190  "triangle-chain-count",
191  cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the "
192  "triangle tail duplication heuristic to kick in. 0 to disable."),
193  cl::init(2),
194  cl::Hidden);
195 
196 namespace llvm {
199 
200 // Internal option used to control BFI display only after MBP pass.
201 // Defined in CodeGen/MachineBlockFrequencyInfo.cpp:
202 // -view-block-layout-with-bfi=
204 
205 // Command line option to specify the name of the function for CFG dump
206 // Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name=
208 } // namespace llvm
209 
210 namespace {
211 
212 class BlockChain;
213 
214 /// Type for our function-wide basic block -> block chain mapping.
215 using BlockToChainMapType = DenseMap<const MachineBasicBlock *, BlockChain *>;
216 
217 /// A chain of blocks which will be laid out contiguously.
218 ///
219 /// This is the datastructure representing a chain of consecutive blocks that
220 /// are profitable to layout together in order to maximize fallthrough
221 /// probabilities and code locality. We also can use a block chain to represent
222 /// a sequence of basic blocks which have some external (correctness)
223 /// requirement for sequential layout.
224 ///
225 /// Chains can be built around a single basic block and can be merged to grow
226 /// them. They participate in a block-to-chain mapping, which is updated
227 /// automatically as chains are merged together.
228 class BlockChain {
229  /// The sequence of blocks belonging to this chain.
230  ///
231  /// This is the sequence of blocks for a particular chain. These will be laid
232  /// out in-order within the function.
234 
235  /// A handle to the function-wide basic block to block chain mapping.
236  ///
237  /// This is retained in each block chain to simplify the computation of child
238  /// block chains for SCC-formation and iteration. We store the edges to child
239  /// basic blocks, and map them back to their associated chains using this
240  /// structure.
241  BlockToChainMapType &BlockToChain;
242 
243 public:
244  /// Construct a new BlockChain.
245  ///
246  /// This builds a new block chain representing a single basic block in the
247  /// function. It also registers itself as the chain that block participates
248  /// in with the BlockToChain mapping.
249  BlockChain(BlockToChainMapType &BlockToChain, MachineBasicBlock *BB)
250  : Blocks(1, BB), BlockToChain(BlockToChain) {
251  assert(BB && "Cannot create a chain with a null basic block");
252  BlockToChain[BB] = this;
253  }
254 
255  /// Iterator over blocks within the chain.
258 
259  /// Beginning of blocks within the chain.
260  iterator begin() { return Blocks.begin(); }
261  const_iterator begin() const { return Blocks.begin(); }
262 
263  /// End of blocks within the chain.
264  iterator end() { return Blocks.end(); }
265  const_iterator end() const { return Blocks.end(); }
266 
267  bool remove(MachineBasicBlock* BB) {
268  for(iterator i = begin(); i != end(); ++i) {
269  if (*i == BB) {
270  Blocks.erase(i);
271  return true;
272  }
273  }
274  return false;
275  }
276 
277  /// Merge a block chain into this one.
278  ///
279  /// This routine merges a block chain into this one. It takes care of forming
280  /// a contiguous sequence of basic blocks, updating the edge list, and
281  /// updating the block -> chain mapping. It does not free or tear down the
282  /// old chain, but the old chain's block list is no longer valid.
283  void merge(MachineBasicBlock *BB, BlockChain *Chain) {
284  assert(BB && "Can't merge a null block.");
285  assert(!Blocks.empty() && "Can't merge into an empty chain.");
286 
287  // Fast path in case we don't have a chain already.
288  if (!Chain) {
289  assert(!BlockToChain[BB] &&
290  "Passed chain is null, but BB has entry in BlockToChain.");
291  Blocks.push_back(BB);
292  BlockToChain[BB] = this;
293  return;
294  }
295 
296  assert(BB == *Chain->begin() && "Passed BB is not head of Chain.");
297  assert(Chain->begin() != Chain->end());
298 
299  // Update the incoming blocks to point to this chain, and add them to the
300  // chain structure.
301  for (MachineBasicBlock *ChainBB : *Chain) {
302  Blocks.push_back(ChainBB);
303  assert(BlockToChain[ChainBB] == Chain && "Incoming blocks not in chain.");
304  BlockToChain[ChainBB] = this;
305  }
306  }
307 
308 #ifndef NDEBUG
309  /// Dump the blocks in this chain.
310  LLVM_DUMP_METHOD void dump() {
311  for (MachineBasicBlock *MBB : *this)
312  MBB->dump();
313  }
314 #endif // NDEBUG
315 
316  /// Count of predecessors of any block within the chain which have not
317  /// yet been scheduled. In general, we will delay scheduling this chain
318  /// until those predecessors are scheduled (or we find a sufficiently good
319  /// reason to override this heuristic.) Note that when forming loop chains,
320  /// blocks outside the loop are ignored and treated as if they were already
321  /// scheduled.
322  ///
323  /// Note: This field is reinitialized multiple times - once for each loop,
324  /// and then once for the function as a whole.
325  unsigned UnscheduledPredecessors = 0;
326 };
327 
328 class MachineBlockPlacement : public MachineFunctionPass {
329  /// A type for a block filter set.
330  using BlockFilterSet = SmallSetVector<const MachineBasicBlock *, 16>;
331 
332  /// Pair struct containing basic block and taildup profitability
333  struct BlockAndTailDupResult {
335  bool ShouldTailDup;
336  };
337 
338  /// Triple struct containing edge weight and the edge.
339  struct WeightedEdge {
340  BlockFrequency Weight;
341  MachineBasicBlock *Src;
342  MachineBasicBlock *Dest;
343  };
344 
345  /// work lists of blocks that are ready to be laid out
348 
349  /// Edges that have already been computed as optimal.
351 
352  /// Machine Function
354 
355  /// A handle to the branch probability pass.
356  const MachineBranchProbabilityInfo *MBPI;
357 
358  /// A handle to the function-wide block frequency pass.
359  std::unique_ptr<MBFIWrapper> MBFI;
360 
361  /// A handle to the loop info.
362  MachineLoopInfo *MLI;
363 
364  /// Preferred loop exit.
365  /// Member variable for convenience. It may be removed by duplication deep
366  /// in the call stack.
367  MachineBasicBlock *PreferredLoopExit;
368 
369  /// A handle to the target's instruction info.
370  const TargetInstrInfo *TII;
371 
372  /// A handle to the target's lowering info.
373  const TargetLoweringBase *TLI;
374 
375  /// A handle to the post dominator tree.
377 
378  ProfileSummaryInfo *PSI;
379 
380  /// Duplicator used to duplicate tails during placement.
381  ///
382  /// Placement decisions can open up new tail duplication opportunities, but
383  /// since tail duplication affects placement decisions of later blocks, it
384  /// must be done inline.
385  TailDuplicator TailDup;
386 
387  /// Partial tail duplication threshold.
388  BlockFrequency DupThreshold;
389 
390  /// True: use block profile count to compute tail duplication cost.
391  /// False: use block frequency to compute tail duplication cost.
392  bool UseProfileCount;
393 
394  /// Allocator and owner of BlockChain structures.
395  ///
396  /// We build BlockChains lazily while processing the loop structure of
397  /// a function. To reduce malloc traffic, we allocate them using this
398  /// slab-like allocator, and destroy them after the pass completes. An
399  /// important guarantee is that this allocator produces stable pointers to
400  /// the chains.
402 
403  /// Function wide BasicBlock to BlockChain mapping.
404  ///
405  /// This mapping allows efficiently moving from any given basic block to the
406  /// BlockChain it participates in, if any. We use it to, among other things,
407  /// allow implicitly defining edges between chains as the existing edges
408  /// between basic blocks.
410 
411 #ifndef NDEBUG
412  /// The set of basic blocks that have terminators that cannot be fully
413  /// analyzed. These basic blocks cannot be re-ordered safely by
414  /// MachineBlockPlacement, and we must preserve physical layout of these
415  /// blocks and their successors through the pass.
416  SmallPtrSet<MachineBasicBlock *, 4> BlocksWithUnanalyzableExits;
417 #endif
418 
419  /// Get block profile count or frequency according to UseProfileCount.
420  /// The return value is used to model tail duplication cost.
421  BlockFrequency getBlockCountOrFrequency(const MachineBasicBlock *BB) {
422  if (UseProfileCount) {
423  auto Count = MBFI->getBlockProfileCount(BB);
424  if (Count)
425  return *Count;
426  else
427  return 0;
428  } else
429  return MBFI->getBlockFreq(BB);
430  }
431 
432  /// Scale the DupThreshold according to basic block size.
433  BlockFrequency scaleThreshold(MachineBasicBlock *BB);
434  void initDupThreshold();
435 
436  /// Decrease the UnscheduledPredecessors count for all blocks in chain, and
437  /// if the count goes to 0, add them to the appropriate work list.
438  void markChainSuccessors(
439  const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
440  const BlockFilterSet *BlockFilter = nullptr);
441 
442  /// Decrease the UnscheduledPredecessors count for a single block, and
443  /// if the count goes to 0, add them to the appropriate work list.
444  void markBlockSuccessors(
445  const BlockChain &Chain, const MachineBasicBlock *BB,
446  const MachineBasicBlock *LoopHeaderBB,
447  const BlockFilterSet *BlockFilter = nullptr);
448 
450  collectViableSuccessors(
451  const MachineBasicBlock *BB, const BlockChain &Chain,
452  const BlockFilterSet *BlockFilter,
454  bool isBestSuccessor(MachineBasicBlock *BB, MachineBasicBlock *Pred,
455  BlockFilterSet *BlockFilter);
456  void findDuplicateCandidates(SmallVectorImpl<MachineBasicBlock *> &Candidates,
458  BlockFilterSet *BlockFilter);
459  bool repeatedlyTailDuplicateBlock(
461  const MachineBasicBlock *LoopHeaderBB,
462  BlockChain &Chain, BlockFilterSet *BlockFilter,
463  MachineFunction::iterator &PrevUnplacedBlockIt);
464  bool maybeTailDuplicateBlock(
466  BlockChain &Chain, BlockFilterSet *BlockFilter,
467  MachineFunction::iterator &PrevUnplacedBlockIt,
468  bool &DuplicatedToLPred);
469  bool hasBetterLayoutPredecessor(
470  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
471  const BlockChain &SuccChain, BranchProbability SuccProb,
472  BranchProbability RealSuccProb, const BlockChain &Chain,
473  const BlockFilterSet *BlockFilter);
474  BlockAndTailDupResult selectBestSuccessor(
475  const MachineBasicBlock *BB, const BlockChain &Chain,
476  const BlockFilterSet *BlockFilter);
477  MachineBasicBlock *selectBestCandidateBlock(
478  const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList);
479  MachineBasicBlock *getFirstUnplacedBlock(
480  const BlockChain &PlacedChain,
481  MachineFunction::iterator &PrevUnplacedBlockIt,
482  const BlockFilterSet *BlockFilter);
483 
484  /// Add a basic block to the work list if it is appropriate.
485  ///
486  /// If the optional parameter BlockFilter is provided, only MBB
487  /// present in the set will be added to the worklist. If nullptr
488  /// is provided, no filtering occurs.
489  void fillWorkLists(const MachineBasicBlock *MBB,
490  SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
491  const BlockFilterSet *BlockFilter);
492 
493  void buildChain(const MachineBasicBlock *BB, BlockChain &Chain,
494  BlockFilterSet *BlockFilter = nullptr);
495  bool canMoveBottomBlockToTop(const MachineBasicBlock *BottomBlock,
496  const MachineBasicBlock *OldTop);
497  bool hasViableTopFallthrough(const MachineBasicBlock *Top,
498  const BlockFilterSet &LoopBlockSet);
499  BlockFrequency TopFallThroughFreq(const MachineBasicBlock *Top,
500  const BlockFilterSet &LoopBlockSet);
501  BlockFrequency FallThroughGains(const MachineBasicBlock *NewTop,
502  const MachineBasicBlock *OldTop,
503  const MachineBasicBlock *ExitBB,
504  const BlockFilterSet &LoopBlockSet);
505  MachineBasicBlock *findBestLoopTopHelper(MachineBasicBlock *OldTop,
506  const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
507  MachineBasicBlock *findBestLoopTop(
508  const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
509  MachineBasicBlock *findBestLoopExit(
510  const MachineLoop &L, const BlockFilterSet &LoopBlockSet,
511  BlockFrequency &ExitFreq);
512  BlockFilterSet collectLoopBlockSet(const MachineLoop &L);
513  void buildLoopChains(const MachineLoop &L);
514  void rotateLoop(
515  BlockChain &LoopChain, const MachineBasicBlock *ExitingBB,
516  BlockFrequency ExitFreq, const BlockFilterSet &LoopBlockSet);
517  void rotateLoopWithProfile(
518  BlockChain &LoopChain, const MachineLoop &L,
519  const BlockFilterSet &LoopBlockSet);
520  void buildCFGChains();
521  void optimizeBranches();
522  void alignBlocks();
523  /// Returns true if a block should be tail-duplicated to increase fallthrough
524  /// opportunities.
525  bool shouldTailDuplicate(MachineBasicBlock *BB);
526  /// Check the edge frequencies to see if tail duplication will increase
527  /// fallthroughs.
528  bool isProfitableToTailDup(
529  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
530  BranchProbability QProb,
531  const BlockChain &Chain, const BlockFilterSet *BlockFilter);
532 
533  /// Check for a trellis layout.
534  bool isTrellis(const MachineBasicBlock *BB,
535  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
536  const BlockChain &Chain, const BlockFilterSet *BlockFilter);
537 
538  /// Get the best successor given a trellis layout.
539  BlockAndTailDupResult getBestTrellisSuccessor(
540  const MachineBasicBlock *BB,
541  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
542  BranchProbability AdjustedSumProb, const BlockChain &Chain,
543  const BlockFilterSet *BlockFilter);
544 
545  /// Get the best pair of non-conflicting edges.
546  static std::pair<WeightedEdge, WeightedEdge> getBestNonConflictingEdges(
547  const MachineBasicBlock *BB,
549 
550  /// Returns true if a block can tail duplicate into all unplaced
551  /// predecessors. Filters based on loop.
552  bool canTailDuplicateUnplacedPreds(
553  const MachineBasicBlock *BB, MachineBasicBlock *Succ,
554  const BlockChain &Chain, const BlockFilterSet *BlockFilter);
555 
556  /// Find chains of triangles to tail-duplicate where a global analysis works,
557  /// but a local analysis would not find them.
558  void precomputeTriangleChains();
559 
560 public:
561  static char ID; // Pass identification, replacement for typeid
562 
563  MachineBlockPlacement() : MachineFunctionPass(ID) {
565  }
566 
567  bool runOnMachineFunction(MachineFunction &F) override;
568 
569  bool allowTailDupPlacement() const {
570  assert(F);
571  return TailDupPlacement && !F->getTarget().requiresStructuredCFG();
572  }
573 
574  void getAnalysisUsage(AnalysisUsage &AU) const override {
577  if (TailDupPlacement)
583  }
584 };
585 
586 } // end anonymous namespace
587 
589 
591 
592 INITIALIZE_PASS_BEGIN(MachineBlockPlacement, DEBUG_TYPE,
593  "Branch Probability Basic Block Placement", false, false)
599 INITIALIZE_PASS_END(MachineBlockPlacement, DEBUG_TYPE,
600  "Branch Probability Basic Block Placement", false, false)
601 
602 #ifndef NDEBUG
603 /// Helper to print the name of a MBB.
604 ///
605 /// Only used by debug logging.
606 static std::string getBlockName(const MachineBasicBlock *BB) {
607  std::string Result;
608  raw_string_ostream OS(Result);
609  OS << printMBBReference(*BB);
610  OS << " ('" << BB->getName() << "')";
611  OS.flush();
612  return Result;
613 }
614 #endif
615 
616 /// Mark a chain's successors as having one fewer preds.
617 ///
618 /// When a chain is being merged into the "placed" chain, this routine will
619 /// quickly walk the successors of each block in the chain and mark them as
620 /// having one fewer active predecessor. It also adds any successors of this
621 /// chain which reach the zero-predecessor state to the appropriate worklist.
622 void MachineBlockPlacement::markChainSuccessors(
623  const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
624  const BlockFilterSet *BlockFilter) {
625  // Walk all the blocks in this chain, marking their successors as having
626  // a predecessor placed.
627  for (MachineBasicBlock *MBB : Chain) {
628  markBlockSuccessors(Chain, MBB, LoopHeaderBB, BlockFilter);
629  }
630 }
631 
632 /// Mark a single block's successors as having one fewer preds.
633 ///
634 /// Under normal circumstances, this is only called by markChainSuccessors,
635 /// but if a block that was to be placed is completely tail-duplicated away,
636 /// and was duplicated into the chain end, we need to redo markBlockSuccessors
637 /// for just that block.
638 void MachineBlockPlacement::markBlockSuccessors(
639  const BlockChain &Chain, const MachineBasicBlock *MBB,
640  const MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter) {
641  // Add any successors for which this is the only un-placed in-loop
642  // predecessor to the worklist as a viable candidate for CFG-neutral
643  // placement. No subsequent placement of this block will violate the CFG
644  // shape, so we get to use heuristics to choose a favorable placement.
645  for (MachineBasicBlock *Succ : MBB->successors()) {
646  if (BlockFilter && !BlockFilter->count(Succ))
647  continue;
648  BlockChain &SuccChain = *BlockToChain[Succ];
649  // Disregard edges within a fixed chain, or edges to the loop header.
650  if (&Chain == &SuccChain || Succ == LoopHeaderBB)
651  continue;
652 
653  // This is a cross-chain edge that is within the loop, so decrement the
654  // loop predecessor count of the destination chain.
655  if (SuccChain.UnscheduledPredecessors == 0 ||
656  --SuccChain.UnscheduledPredecessors > 0)
657  continue;
658 
659  auto *NewBB = *SuccChain.begin();
660  if (NewBB->isEHPad())
661  EHPadWorkList.push_back(NewBB);
662  else
663  BlockWorkList.push_back(NewBB);
664  }
665 }
666 
667 /// This helper function collects the set of successors of block
668 /// \p BB that are allowed to be its layout successors, and return
669 /// the total branch probability of edges from \p BB to those
670 /// blocks.
671 BranchProbability MachineBlockPlacement::collectViableSuccessors(
672  const MachineBasicBlock *BB, const BlockChain &Chain,
673  const BlockFilterSet *BlockFilter,
675  // Adjust edge probabilities by excluding edges pointing to blocks that is
676  // either not in BlockFilter or is already in the current chain. Consider the
677  // following CFG:
678  //
679  // --->A
680  // | / \
681  // | B C
682  // | \ / \
683  // ----D E
684  //
685  // Assume A->C is very hot (>90%), and C->D has a 50% probability, then after
686  // A->C is chosen as a fall-through, D won't be selected as a successor of C
687  // due to CFG constraint (the probability of C->D is not greater than
688  // HotProb to break topo-order). If we exclude E that is not in BlockFilter
689  // when calculating the probability of C->D, D will be selected and we
690  // will get A C D B as the layout of this loop.
691  auto AdjustedSumProb = BranchProbability::getOne();
692  for (MachineBasicBlock *Succ : BB->successors()) {
693  bool SkipSucc = false;
694  if (Succ->isEHPad() || (BlockFilter && !BlockFilter->count(Succ))) {
695  SkipSucc = true;
696  } else {
697  BlockChain *SuccChain = BlockToChain[Succ];
698  if (SuccChain == &Chain) {
699  SkipSucc = true;
700  } else if (Succ != *SuccChain->begin()) {
701  LLVM_DEBUG(dbgs() << " " << getBlockName(Succ)
702  << " -> Mid chain!\n");
703  continue;
704  }
705  }
706  if (SkipSucc)
707  AdjustedSumProb -= MBPI->getEdgeProbability(BB, Succ);
708  else
709  Successors.push_back(Succ);
710  }
711 
712  return AdjustedSumProb;
713 }
714 
715 /// The helper function returns the branch probability that is adjusted
716 /// or normalized over the new total \p AdjustedSumProb.
717 static BranchProbability
719  BranchProbability AdjustedSumProb) {
720  BranchProbability SuccProb;
721  uint32_t SuccProbN = OrigProb.getNumerator();
722  uint32_t SuccProbD = AdjustedSumProb.getNumerator();
723  if (SuccProbN >= SuccProbD)
724  SuccProb = BranchProbability::getOne();
725  else
726  SuccProb = BranchProbability(SuccProbN, SuccProbD);
727 
728  return SuccProb;
729 }
730 
731 /// Check if \p BB has exactly the successors in \p Successors.
732 static bool
735  if (BB.succ_size() != Successors.size())
736  return false;
737  // We don't want to count self-loops
738  if (Successors.count(&BB))
739  return false;
740  for (MachineBasicBlock *Succ : BB.successors())
741  if (!Successors.count(Succ))
742  return false;
743  return true;
744 }
745 
746 /// Check if a block should be tail duplicated to increase fallthrough
747 /// opportunities.
748 /// \p BB Block to check.
749 bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
750  // Blocks with single successors don't create additional fallthrough
751  // opportunities. Don't duplicate them. TODO: When conditional exits are
752  // analyzable, allow them to be duplicated.
753  bool IsSimple = TailDup.isSimpleBB(BB);
754 
755  if (BB->succ_size() == 1)
756  return false;
757  return TailDup.shouldTailDuplicate(IsSimple, *BB);
758 }
759 
760 /// Compare 2 BlockFrequency's with a small penalty for \p A.
761 /// In order to be conservative, we apply a X% penalty to account for
762 /// increased icache pressure and static heuristics. For small frequencies
763 /// we use only the numerators to improve accuracy. For simplicity, we assume the
764 /// penalty is less than 100%
765 /// TODO(iteratee): Use 64-bit fixed point edge frequencies everywhere.
767  uint64_t EntryFreq) {
768  BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
769  BlockFrequency Gain = A - B;
770  return (Gain / ThresholdProb).getFrequency() >= EntryFreq;
771 }
772 
773 /// Check the edge frequencies to see if tail duplication will increase
774 /// fallthroughs. It only makes sense to call this function when
775 /// \p Succ would not be chosen otherwise. Tail duplication of \p Succ is
776 /// always locally profitable if we would have picked \p Succ without
777 /// considering duplication.
778 bool MachineBlockPlacement::isProfitableToTailDup(
779  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
780  BranchProbability QProb,
781  const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
782  // We need to do a probability calculation to make sure this is profitable.
783  // First: does succ have a successor that post-dominates? This affects the
784  // calculation. The 2 relevant cases are:
785  // BB BB
786  // | \Qout | \Qout
787  // P| C |P C
788  // = C' = C'
789  // | /Qin | /Qin
790  // | / | /
791  // Succ Succ
792  // / \ | \ V
793  // U/ =V |U \
794  // / \ = D
795  // D E | /
796  // | /
797  // |/
798  // PDom
799  // '=' : Branch taken for that CFG edge
800  // In the second case, Placing Succ while duplicating it into C prevents the
801  // fallthrough of Succ into either D or PDom, because they now have C as an
802  // unplaced predecessor
803 
804  // Start by figuring out which case we fall into
805  MachineBasicBlock *PDom = nullptr;
807  // Only scan the relevant successors
808  auto AdjustedSuccSumProb =
809  collectViableSuccessors(Succ, Chain, BlockFilter, SuccSuccs);
810  BranchProbability PProb = MBPI->getEdgeProbability(BB, Succ);
811  auto BBFreq = MBFI->getBlockFreq(BB);
812  auto SuccFreq = MBFI->getBlockFreq(Succ);
813  BlockFrequency P = BBFreq * PProb;
814  BlockFrequency Qout = BBFreq * QProb;
815  uint64_t EntryFreq = MBFI->getEntryFreq();
816  // If there are no more successors, it is profitable to copy, as it strictly
817  // increases fallthrough.
818  if (SuccSuccs.size() == 0)
819  return greaterWithBias(P, Qout, EntryFreq);
820 
821  auto BestSuccSucc = BranchProbability::getZero();
822  // Find the PDom or the best Succ if no PDom exists.
823  for (MachineBasicBlock *SuccSucc : SuccSuccs) {
824  auto Prob = MBPI->getEdgeProbability(Succ, SuccSucc);
825  if (Prob > BestSuccSucc)
826  BestSuccSucc = Prob;
827  if (PDom == nullptr)
828  if (MPDT->dominates(SuccSucc, Succ)) {
829  PDom = SuccSucc;
830  break;
831  }
832  }
833  // For the comparisons, we need to know Succ's best incoming edge that isn't
834  // from BB.
835  auto SuccBestPred = BlockFrequency(0);
836  for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
837  if (SuccPred == Succ || SuccPred == BB
838  || BlockToChain[SuccPred] == &Chain
839  || (BlockFilter && !BlockFilter->count(SuccPred)))
840  continue;
841  auto Freq = MBFI->getBlockFreq(SuccPred)
842  * MBPI->getEdgeProbability(SuccPred, Succ);
843  if (Freq > SuccBestPred)
844  SuccBestPred = Freq;
845  }
846  // Qin is Succ's best unplaced incoming edge that isn't BB
847  BlockFrequency Qin = SuccBestPred;
848  // If it doesn't have a post-dominating successor, here is the calculation:
849  // BB BB
850  // | \Qout | \
851  // P| C | =
852  // = C' | C
853  // | /Qin | |
854  // | / | C' (+Succ)
855  // Succ Succ /|
856  // / \ | \/ |
857  // U/ =V | == |
858  // / \ | / \|
859  // D E D E
860  // '=' : Branch taken for that CFG edge
861  // Cost in the first case is: P + V
862  // For this calculation, we always assume P > Qout. If Qout > P
863  // The result of this function will be ignored at the caller.
864  // Let F = SuccFreq - Qin
865  // Cost in the second case is: Qout + min(Qin, F) * U + max(Qin, F) * V
866 
867  if (PDom == nullptr || !Succ->isSuccessor(PDom)) {
868  BranchProbability UProb = BestSuccSucc;
869  BranchProbability VProb = AdjustedSuccSumProb - UProb;
870  BlockFrequency F = SuccFreq - Qin;
871  BlockFrequency V = SuccFreq * VProb;
872  BlockFrequency QinU = std::min(Qin, F) * UProb;
873  BlockFrequency BaseCost = P + V;
874  BlockFrequency DupCost = Qout + QinU + std::max(Qin, F) * VProb;
875  return greaterWithBias(BaseCost, DupCost, EntryFreq);
876  }
877  BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom);
878  BranchProbability VProb = AdjustedSuccSumProb - UProb;
879  BlockFrequency U = SuccFreq * UProb;
880  BlockFrequency V = SuccFreq * VProb;
881  BlockFrequency F = SuccFreq - Qin;
882  // If there is a post-dominating successor, here is the calculation:
883  // BB BB BB BB
884  // | \Qout | \ | \Qout | \
885  // |P C | = |P C | =
886  // = C' |P C = C' |P C
887  // | /Qin | | | /Qin | |
888  // | / | C' (+Succ) | / | C' (+Succ)
889  // Succ Succ /| Succ Succ /|
890  // | \ V | \/ | | \ V | \/ |
891  // |U \ |U /\ =? |U = |U /\ |
892  // = D = = =?| | D | = =|
893  // | / |/ D | / |/ D
894  // | / | / | = | /
895  // |/ | / |/ | =
896  // Dom Dom Dom Dom
897  // '=' : Branch taken for that CFG edge
898  // The cost for taken branches in the first case is P + U
899  // Let F = SuccFreq - Qin
900  // The cost in the second case (assuming independence), given the layout:
901  // BB, Succ, (C+Succ), D, Dom or the layout:
902  // BB, Succ, D, Dom, (C+Succ)
903  // is Qout + max(F, Qin) * U + min(F, Qin)
904  // compare P + U vs Qout + P * U + Qin.
905  //
906  // The 3rd and 4th cases cover when Dom would be chosen to follow Succ.
907  //
908  // For the 3rd case, the cost is P + 2 * V
909  // For the 4th case, the cost is Qout + min(Qin, F) * U + max(Qin, F) * V + V
910  // We choose 4 over 3 when (P + V) > Qout + min(Qin, F) * U + max(Qin, F) * V
911  if (UProb > AdjustedSuccSumProb / 2 &&
912  !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom], UProb, UProb,
913  Chain, BlockFilter))
914  // Cases 3 & 4
915  return greaterWithBias(
916  (P + V), (Qout + std::max(Qin, F) * VProb + std::min(Qin, F) * UProb),
917  EntryFreq);
918  // Cases 1 & 2
919  return greaterWithBias((P + U),
920  (Qout + std::min(Qin, F) * AdjustedSuccSumProb +
921  std::max(Qin, F) * UProb),
922  EntryFreq);
923 }
924 
925 /// Check for a trellis layout. \p BB is the upper part of a trellis if its
926 /// successors form the lower part of a trellis. A successor set S forms the
927 /// lower part of a trellis if all of the predecessors of S are either in S or
928 /// have all of S as successors. We ignore trellises where BB doesn't have 2
929 /// successors because for fewer than 2, it's trivial, and for 3 or greater they
930 /// are very uncommon and complex to compute optimally. Allowing edges within S
931 /// is not strictly a trellis, but the same algorithm works, so we allow it.
932 bool MachineBlockPlacement::isTrellis(
933  const MachineBasicBlock *BB,
934  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
935  const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
936  // Technically BB could form a trellis with branching factor higher than 2.
937  // But that's extremely uncommon.
938  if (BB->succ_size() != 2 || ViableSuccs.size() != 2)
939  return false;
940 
941  SmallPtrSet<const MachineBasicBlock *, 2> Successors(BB->succ_begin(),
942  BB->succ_end());
943  // To avoid reviewing the same predecessors twice.
945 
946  for (MachineBasicBlock *Succ : ViableSuccs) {
947  int PredCount = 0;
948  for (auto SuccPred : Succ->predecessors()) {
949  // Allow triangle successors, but don't count them.
950  if (Successors.count(SuccPred)) {
951  // Make sure that it is actually a triangle.
952  for (MachineBasicBlock *CheckSucc : SuccPred->successors())
953  if (!Successors.count(CheckSucc))
954  return false;
955  continue;
956  }
957  const BlockChain *PredChain = BlockToChain[SuccPred];
958  if (SuccPred == BB || (BlockFilter && !BlockFilter->count(SuccPred)) ||
959  PredChain == &Chain || PredChain == BlockToChain[Succ])
960  continue;
961  ++PredCount;
962  // Perform the successor check only once.
963  if (!SeenPreds.insert(SuccPred).second)
964  continue;
965  if (!hasSameSuccessors(*SuccPred, Successors))
966  return false;
967  }
968  // If one of the successors has only BB as a predecessor, it is not a
969  // trellis.
970  if (PredCount < 1)
971  return false;
972  }
973  return true;
974 }
975 
976 /// Pick the highest total weight pair of edges that can both be laid out.
977 /// The edges in \p Edges[0] are assumed to have a different destination than
978 /// the edges in \p Edges[1]. Simple counting shows that the best pair is either
979 /// the individual highest weight edges to the 2 different destinations, or in
980 /// case of a conflict, one of them should be replaced with a 2nd best edge.
981 std::pair<MachineBlockPlacement::WeightedEdge,
982  MachineBlockPlacement::WeightedEdge>
983 MachineBlockPlacement::getBestNonConflictingEdges(
984  const MachineBasicBlock *BB,
986  Edges) {
987  // Sort the edges, and then for each successor, find the best incoming
988  // predecessor. If the best incoming predecessors aren't the same,
989  // then that is clearly the best layout. If there is a conflict, one of the
990  // successors will have to fallthrough from the second best predecessor. We
991  // compare which combination is better overall.
992 
993  // Sort for highest frequency.
994  auto Cmp = [](WeightedEdge A, WeightedEdge B) { return A.Weight > B.Weight; };
995 
996  llvm::stable_sort(Edges[0], Cmp);
997  llvm::stable_sort(Edges[1], Cmp);
998  auto BestA = Edges[0].begin();
999  auto BestB = Edges[1].begin();
1000  // Arrange for the correct answer to be in BestA and BestB
1001  // If the 2 best edges don't conflict, the answer is already there.
1002  if (BestA->Src == BestB->Src) {
1003  // Compare the total fallthrough of (Best + Second Best) for both pairs
1004  auto SecondBestA = std::next(BestA);
1005  auto SecondBestB = std::next(BestB);
1006  BlockFrequency BestAScore = BestA->Weight + SecondBestB->Weight;
1007  BlockFrequency BestBScore = BestB->Weight + SecondBestA->Weight;
1008  if (BestAScore < BestBScore)
1009  BestA = SecondBestA;
1010  else
1011  BestB = SecondBestB;
1012  }
1013  // Arrange for the BB edge to be in BestA if it exists.
1014  if (BestB->Src == BB)
1015  std::swap(BestA, BestB);
1016  return std::make_pair(*BestA, *BestB);
1017 }
1018 
1019 /// Get the best successor from \p BB based on \p BB being part of a trellis.
1020 /// We only handle trellises with 2 successors, so the algorithm is
1021 /// straightforward: Find the best pair of edges that don't conflict. We find
1022 /// the best incoming edge for each successor in the trellis. If those conflict,
1023 /// we consider which of them should be replaced with the second best.
1024 /// Upon return the two best edges will be in \p BestEdges. If one of the edges
1025 /// comes from \p BB, it will be in \p BestEdges[0]
1026 MachineBlockPlacement::BlockAndTailDupResult
1027 MachineBlockPlacement::getBestTrellisSuccessor(
1028  const MachineBasicBlock *BB,
1029  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
1030  BranchProbability AdjustedSumProb, const BlockChain &Chain,
1031  const BlockFilterSet *BlockFilter) {
1032 
1033  BlockAndTailDupResult Result = {nullptr, false};
1034  SmallPtrSet<const MachineBasicBlock *, 4> Successors(BB->succ_begin(),
1035  BB->succ_end());
1036 
1037  // We assume size 2 because it's common. For general n, we would have to do
1038  // the Hungarian algorithm, but it's not worth the complexity because more
1039  // than 2 successors is fairly uncommon, and a trellis even more so.
1040  if (Successors.size() != 2 || ViableSuccs.size() != 2)
1041  return Result;
1042 
1043  // Collect the edge frequencies of all edges that form the trellis.
1045  int SuccIndex = 0;
1046  for (auto Succ : ViableSuccs) {
1047  for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
1048  // Skip any placed predecessors that are not BB
1049  if (SuccPred != BB)
1050  if ((BlockFilter && !BlockFilter->count(SuccPred)) ||
1051  BlockToChain[SuccPred] == &Chain ||
1052  BlockToChain[SuccPred] == BlockToChain[Succ])
1053  continue;
1054  BlockFrequency EdgeFreq = MBFI->getBlockFreq(SuccPred) *
1055  MBPI->getEdgeProbability(SuccPred, Succ);
1056  Edges[SuccIndex].push_back({EdgeFreq, SuccPred, Succ});
1057  }
1058  ++SuccIndex;
1059  }
1060 
1061  // Pick the best combination of 2 edges from all the edges in the trellis.
1062  WeightedEdge BestA, BestB;
1063  std::tie(BestA, BestB) = getBestNonConflictingEdges(BB, Edges);
1064 
1065  if (BestA.Src != BB) {
1066  // If we have a trellis, and BB doesn't have the best fallthrough edges,
1067  // we shouldn't choose any successor. We've already looked and there's a
1068  // better fallthrough edge for all the successors.
1069  LLVM_DEBUG(dbgs() << "Trellis, but not one of the chosen edges.\n");
1070  return Result;
1071  }
1072 
1073  // Did we pick the triangle edge? If tail-duplication is profitable, do
1074  // that instead. Otherwise merge the triangle edge now while we know it is
1075  // optimal.
1076  if (BestA.Dest == BestB.Src) {
1077  // The edges are BB->Succ1->Succ2, and we're looking to see if BB->Succ2
1078  // would be better.
1079  MachineBasicBlock *Succ1 = BestA.Dest;
1080  MachineBasicBlock *Succ2 = BestB.Dest;
1081  // Check to see if tail-duplication would be profitable.
1082  if (allowTailDupPlacement() && shouldTailDuplicate(Succ2) &&
1083  canTailDuplicateUnplacedPreds(BB, Succ2, Chain, BlockFilter) &&
1084  isProfitableToTailDup(BB, Succ2, MBPI->getEdgeProbability(BB, Succ1),
1085  Chain, BlockFilter)) {
1087  MBPI->getEdgeProbability(BB, Succ2), AdjustedSumProb);
1088  dbgs() << " Selected: " << getBlockName(Succ2)
1089  << ", probability: " << Succ2Prob
1090  << " (Tail Duplicate)\n");
1091  Result.BB = Succ2;
1092  Result.ShouldTailDup = true;
1093  return Result;
1094  }
1095  }
1096  // We have already computed the optimal edge for the other side of the
1097  // trellis.
1098  ComputedEdges[BestB.Src] = { BestB.Dest, false };
1099 
1100  auto TrellisSucc = BestA.Dest;
1102  MBPI->getEdgeProbability(BB, TrellisSucc), AdjustedSumProb);
1103  dbgs() << " Selected: " << getBlockName(TrellisSucc)
1104  << ", probability: " << SuccProb << " (Trellis)\n");
1105  Result.BB = TrellisSucc;
1106  return Result;
1107 }
1108 
1109 /// When the option allowTailDupPlacement() is on, this method checks if the
1110 /// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated
1111 /// into all of its unplaced, unfiltered predecessors, that are not BB.
1112 bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
1113  const MachineBasicBlock *BB, MachineBasicBlock *Succ,
1114  const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
1115  if (!shouldTailDuplicate(Succ))
1116  return false;
1117 
1118  // The result of canTailDuplicate.
1119  bool Duplicate = true;
1120  // Number of possible duplication.
1121  unsigned int NumDup = 0;
1122 
1123  // For CFG checking.
1124  SmallPtrSet<const MachineBasicBlock *, 4> Successors(BB->succ_begin(),
1125  BB->succ_end());
1126  for (MachineBasicBlock *Pred : Succ->predecessors()) {
1127  // Make sure all unplaced and unfiltered predecessors can be
1128  // tail-duplicated into.
1129  // Skip any blocks that are already placed or not in this loop.
1130  if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred))
1131  || BlockToChain[Pred] == &Chain)
1132  continue;
1133  if (!TailDup.canTailDuplicate(Succ, Pred)) {
1134  if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors))
1135  // This will result in a trellis after tail duplication, so we don't
1136  // need to copy Succ into this predecessor. In the presence
1137  // of a trellis tail duplication can continue to be profitable.
1138  // For example:
1139  // A A
1140  // |\ |\
1141  // | \ | \
1142  // | C | C+BB
1143  // | / | |
1144  // |/ | |
1145  // BB => BB |
1146  // |\ |\/|
1147  // | \ |/\|
1148  // | D | D
1149  // | / | /
1150  // |/ |/
1151  // Succ Succ
1152  //
1153  // After BB was duplicated into C, the layout looks like the one on the
1154  // right. BB and C now have the same successors. When considering
1155  // whether Succ can be duplicated into all its unplaced predecessors, we
1156  // ignore C.
1157  // We can do this because C already has a profitable fallthrough, namely
1158  // D. TODO(iteratee): ignore sufficiently cold predecessors for
1159  // duplication and for this test.
1160  //
1161  // This allows trellises to be laid out in 2 separate chains
1162  // (A,B,Succ,...) and later (C,D,...) This is a reasonable heuristic
1163  // because it allows the creation of 2 fallthrough paths with links
1164  // between them, and we correctly identify the best layout for these
1165  // CFGs. We want to extend trellises that the user created in addition
1166  // to trellises created by tail-duplication, so we just look for the
1167  // CFG.
1168  continue;
1169  Duplicate = false;
1170  continue;
1171  }
1172  NumDup++;
1173  }
1174 
1175  // No possible duplication in current filter set.
1176  if (NumDup == 0)
1177  return false;
1178 
1179  // If profile information is available, findDuplicateCandidates can do more
1180  // precise benefit analysis.
1181  if (F->getFunction().hasProfileData())
1182  return true;
1183 
1184  // This is mainly for function exit BB.
1185  // The integrated tail duplication is really designed for increasing
1186  // fallthrough from predecessors from Succ to its successors. We may need
1187  // other machanism to handle different cases.
1188  if (Succ->succ_size() == 0)
1189  return true;
1190 
1191  // Plus the already placed predecessor.
1192  NumDup++;
1193 
1194  // If the duplication candidate has more unplaced predecessors than
1195  // successors, the extra duplication can't bring more fallthrough.
1196  //
1197  // Pred1 Pred2 Pred3
1198  // \ | /
1199  // \ | /
1200  // \ | /
1201  // Dup
1202  // / \
1203  // / \
1204  // Succ1 Succ2
1205  //
1206  // In this example Dup has 2 successors and 3 predecessors, duplication of Dup
1207  // can increase the fallthrough from Pred1 to Succ1 and from Pred2 to Succ2,
1208  // but the duplication into Pred3 can't increase fallthrough.
1209  //
1210  // A small number of extra duplication may not hurt too much. We need a better
1211  // heuristic to handle it.
1212  if ((NumDup > Succ->succ_size()) || !Duplicate)
1213  return false;
1214 
1215  return true;
1216 }
1217 
1218 /// Find chains of triangles where we believe it would be profitable to
1219 /// tail-duplicate them all, but a local analysis would not find them.
1220 /// There are 3 ways this can be profitable:
1221 /// 1) The post-dominators marked 50% are actually taken 55% (This shrinks with
1222 /// longer chains)
1223 /// 2) The chains are statically correlated. Branch probabilities have a very
1224 /// U-shaped distribution.
1225 /// [http://nrs.harvard.edu/urn-3:HUL.InstRepos:24015805]
1226 /// If the branches in a chain are likely to be from the same side of the
1227 /// distribution as their predecessor, but are independent at runtime, this
1228 /// transformation is profitable. (Because the cost of being wrong is a small
1229 /// fixed cost, unlike the standard triangle layout where the cost of being
1230 /// wrong scales with the # of triangles.)
1231 /// 3) The chains are dynamically correlated. If the probability that a previous
1232 /// branch was taken positively influences whether the next branch will be
1233 /// taken
1234 /// We believe that 2 and 3 are common enough to justify the small margin in 1.
1235 void MachineBlockPlacement::precomputeTriangleChains() {
1236  struct TriangleChain {
1237  std::vector<MachineBasicBlock *> Edges;
1238 
1239  TriangleChain(MachineBasicBlock *src, MachineBasicBlock *dst)
1240  : Edges({src, dst}) {}
1241 
1242  void append(MachineBasicBlock *dst) {
1243  assert(getKey()->isSuccessor(dst) &&
1244  "Attempting to append a block that is not a successor.");
1245  Edges.push_back(dst);
1246  }
1247 
1248  unsigned count() const { return Edges.size() - 1; }
1249 
1250  MachineBasicBlock *getKey() const {
1251  return Edges.back();
1252  }
1253  };
1254 
1255  if (TriangleChainCount == 0)
1256  return;
1257 
1258  LLVM_DEBUG(dbgs() << "Pre-computing triangle chains.\n");
1259  // Map from last block to the chain that contains it. This allows us to extend
1260  // chains as we find new triangles.
1262  for (MachineBasicBlock &BB : *F) {
1263  // If BB doesn't have 2 successors, it doesn't start a triangle.
1264  if (BB.succ_size() != 2)
1265  continue;
1266  MachineBasicBlock *PDom = nullptr;
1267  for (MachineBasicBlock *Succ : BB.successors()) {
1268  if (!MPDT->dominates(Succ, &BB))
1269  continue;
1270  PDom = Succ;
1271  break;
1272  }
1273  // If BB doesn't have a post-dominating successor, it doesn't form a
1274  // triangle.
1275  if (PDom == nullptr)
1276  continue;
1277  // If PDom has a hint that it is low probability, skip this triangle.
1278  if (MBPI->getEdgeProbability(&BB, PDom) < BranchProbability(50, 100))
1279  continue;
1280  // If PDom isn't eligible for duplication, this isn't the kind of triangle
1281  // we're looking for.
1282  if (!shouldTailDuplicate(PDom))
1283  continue;
1284  bool CanTailDuplicate = true;
1285  // If PDom can't tail-duplicate into it's non-BB predecessors, then this
1286  // isn't the kind of triangle we're looking for.
1287  for (MachineBasicBlock* Pred : PDom->predecessors()) {
1288  if (Pred == &BB)
1289  continue;
1290  if (!TailDup.canTailDuplicate(PDom, Pred)) {
1291  CanTailDuplicate = false;
1292  break;
1293  }
1294  }
1295  // If we can't tail-duplicate PDom to its predecessors, then skip this
1296  // triangle.
1297  if (!CanTailDuplicate)
1298  continue;
1299 
1300  // Now we have an interesting triangle. Insert it if it's not part of an
1301  // existing chain.
1302  // Note: This cannot be replaced with a call insert() or emplace() because
1303  // the find key is BB, but the insert/emplace key is PDom.
1304  auto Found = TriangleChainMap.find(&BB);
1305  // If it is, remove the chain from the map, grow it, and put it back in the
1306  // map with the end as the new key.
1307  if (Found != TriangleChainMap.end()) {
1308  TriangleChain Chain = std::move(Found->second);
1309  TriangleChainMap.erase(Found);
1310  Chain.append(PDom);
1311  TriangleChainMap.insert(std::make_pair(Chain.getKey(), std::move(Chain)));
1312  } else {
1313  auto InsertResult = TriangleChainMap.try_emplace(PDom, &BB, PDom);
1314  assert(InsertResult.second && "Block seen twice.");
1315  (void)InsertResult;
1316  }
1317  }
1318 
1319  // Iterating over a DenseMap is safe here, because the only thing in the body
1320  // of the loop is inserting into another DenseMap (ComputedEdges).
1321  // ComputedEdges is never iterated, so this doesn't lead to non-determinism.
1322  for (auto &ChainPair : TriangleChainMap) {
1323  TriangleChain &Chain = ChainPair.second;
1324  // Benchmarking has shown that due to branch correlation duplicating 2 or
1325  // more triangles is profitable, despite the calculations assuming
1326  // independence.
1327  if (Chain.count() < TriangleChainCount)
1328  continue;
1329  MachineBasicBlock *dst = Chain.Edges.back();
1330  Chain.Edges.pop_back();
1331  for (MachineBasicBlock *src : reverse(Chain.Edges)) {
1332  LLVM_DEBUG(dbgs() << "Marking edge: " << getBlockName(src) << "->"
1333  << getBlockName(dst)
1334  << " as pre-computed based on triangles.\n");
1335 
1336  auto InsertResult = ComputedEdges.insert({src, {dst, true}});
1337  assert(InsertResult.second && "Block seen twice.");
1338  (void)InsertResult;
1339 
1340  dst = src;
1341  }
1342  }
1343 }
1344 
1345 // When profile is not present, return the StaticLikelyProb.
1346 // When profile is available, we need to handle the triangle-shape CFG.
1348  const MachineBasicBlock *BB) {
1349  if (!BB->getParent()->getFunction().hasProfileData())
1350  return BranchProbability(StaticLikelyProb, 100);
1351  if (BB->succ_size() == 2) {
1352  const MachineBasicBlock *Succ1 = *BB->succ_begin();
1353  const MachineBasicBlock *Succ2 = *(BB->succ_begin() + 1);
1354  if (Succ1->isSuccessor(Succ2) || Succ2->isSuccessor(Succ1)) {
1355  /* See case 1 below for the cost analysis. For BB->Succ to
1356  * be taken with smaller cost, the following needs to hold:
1357  * Prob(BB->Succ) > 2 * Prob(BB->Pred)
1358  * So the threshold T in the calculation below
1359  * (1-T) * Prob(BB->Succ) > T * Prob(BB->Pred)
1360  * So T / (1 - T) = 2, Yielding T = 2/3
1361  * Also adding user specified branch bias, we have
1362  * T = (2/3)*(ProfileLikelyProb/50)
1363  * = (2*ProfileLikelyProb)/150)
1364  */
1365  return BranchProbability(2 * ProfileLikelyProb, 150);
1366  }
1367  }
1368  return BranchProbability(ProfileLikelyProb, 100);
1369 }
1370 
1371 /// Checks to see if the layout candidate block \p Succ has a better layout
1372 /// predecessor than \c BB. If yes, returns true.
1373 /// \p SuccProb: The probability adjusted for only remaining blocks.
1374 /// Only used for logging
1375 /// \p RealSuccProb: The un-adjusted probability.
1376 /// \p Chain: The chain that BB belongs to and Succ is being considered for.
1377 /// \p BlockFilter: if non-null, the set of blocks that make up the loop being
1378 /// considered
1379 bool MachineBlockPlacement::hasBetterLayoutPredecessor(
1380  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
1381  const BlockChain &SuccChain, BranchProbability SuccProb,
1382  BranchProbability RealSuccProb, const BlockChain &Chain,
1383  const BlockFilterSet *BlockFilter) {
1384 
1385  // There isn't a better layout when there are no unscheduled predecessors.
1386  if (SuccChain.UnscheduledPredecessors == 0)
1387  return false;
1388 
1389  // There are two basic scenarios here:
1390  // -------------------------------------
1391  // Case 1: triangular shape CFG (if-then):
1392  // BB
1393  // | \
1394  // | \
1395  // | Pred
1396  // | /
1397  // Succ
1398  // In this case, we are evaluating whether to select edge -> Succ, e.g.
1399  // set Succ as the layout successor of BB. Picking Succ as BB's
1400  // successor breaks the CFG constraints (FIXME: define these constraints).
1401  // With this layout, Pred BB
1402  // is forced to be outlined, so the overall cost will be cost of the
1403  // branch taken from BB to Pred, plus the cost of back taken branch
1404  // from Pred to Succ, as well as the additional cost associated
1405  // with the needed unconditional jump instruction from Pred To Succ.
1406 
1407  // The cost of the topological order layout is the taken branch cost
1408  // from BB to Succ, so to make BB->Succ a viable candidate, the following
1409  // must hold:
1410  // 2 * freq(BB->Pred) * taken_branch_cost + unconditional_jump_cost
1411  // < freq(BB->Succ) * taken_branch_cost.
1412  // Ignoring unconditional jump cost, we get
1413  // freq(BB->Succ) > 2 * freq(BB->Pred), i.e.,
1414  // prob(BB->Succ) > 2 * prob(BB->Pred)
1415  //
1416  // When real profile data is available, we can precisely compute the
1417  // probability threshold that is needed for edge BB->Succ to be considered.
1418  // Without profile data, the heuristic requires the branch bias to be
1419  // a lot larger to make sure the signal is very strong (e.g. 80% default).
1420  // -----------------------------------------------------------------
1421  // Case 2: diamond like CFG (if-then-else):
1422  // S
1423  // / \
1424  // | \
1425  // BB Pred
1426  // \ /
1427  // Succ
1428  // ..
1429  //
1430  // The current block is BB and edge BB->Succ is now being evaluated.
1431  // Note that edge S->BB was previously already selected because
1432  // prob(S->BB) > prob(S->Pred).
1433  // At this point, 2 blocks can be placed after BB: Pred or Succ. If we
1434  // choose Pred, we will have a topological ordering as shown on the left
1435  // in the picture below. If we choose Succ, we have the solution as shown
1436  // on the right:
1437  //
1438  // topo-order:
1439  //
1440  // S----- ---S
1441  // | | | |
1442  // ---BB | | BB
1443  // | | | |
1444  // | Pred-- | Succ--
1445  // | | | |
1446  // ---Succ ---Pred--
1447  //
1448  // cost = freq(S->Pred) + freq(BB->Succ) cost = 2 * freq (S->Pred)
1449  // = freq(S->Pred) + freq(S->BB)
1450  //
1451  // If we have profile data (i.e, branch probabilities can be trusted), the
1452  // cost (number of taken branches) with layout S->BB->Succ->Pred is 2 *
1453  // freq(S->Pred) while the cost of topo order is freq(S->Pred) + freq(S->BB).
1454  // We know Prob(S->BB) > Prob(S->Pred), so freq(S->BB) > freq(S->Pred), which
1455  // means the cost of topological order is greater.
1456  // When profile data is not available, however, we need to be more
1457  // conservative. If the branch prediction is wrong, breaking the topo-order
1458  // will actually yield a layout with large cost. For this reason, we need
1459  // strong biased branch at block S with Prob(S->BB) in order to select
1460  // BB->Succ. This is equivalent to looking the CFG backward with backward
1461  // edge: Prob(Succ->BB) needs to >= HotProb in order to be selected (without
1462  // profile data).
1463  // --------------------------------------------------------------------------
1464  // Case 3: forked diamond
1465  // S
1466  // / \
1467  // / \
1468  // BB Pred
1469  // | \ / |
1470  // | \ / |
1471  // | X |
1472  // | / \ |
1473  // | / \ |
1474  // S1 S2
1475  //
1476  // The current block is BB and edge BB->S1 is now being evaluated.
1477  // As above S->BB was already selected because
1478  // prob(S->BB) > prob(S->Pred). Assume that prob(BB->S1) >= prob(BB->S2).
1479  //
1480  // topo-order:
1481  //
1482  // S-------| ---S
1483  // | | | |
1484  // ---BB | | BB
1485  // | | | |
1486  // | Pred----| | S1----
1487  // | | | |
1488  // --(S1 or S2) ---Pred--
1489  // |
1490  // S2
1491  //
1492  // topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2)
1493  // + min(freq(Pred->S1), freq(Pred->S2))
1494  // Non-topo-order cost:
1495  // non-topo-cost = 2 * freq(S->Pred) + freq(BB->S2).
1496  // To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2))
1497  // is 0. Then the non topo layout is better when
1498  // freq(S->Pred) < freq(BB->S1).
1499  // This is exactly what is checked below.
1500  // Note there are other shapes that apply (Pred may not be a single block,
1501  // but they all fit this general pattern.)
1503 
1504  // Make sure that a hot successor doesn't have a globally more
1505  // important predecessor.
1506  BlockFrequency CandidateEdgeFreq = MBFI->getBlockFreq(BB) * RealSuccProb;
1507  bool BadCFGConflict = false;
1508 
1509  for (MachineBasicBlock *Pred : Succ->predecessors()) {
1510  BlockChain *PredChain = BlockToChain[Pred];
1511  if (Pred == Succ || PredChain == &SuccChain ||
1512  (BlockFilter && !BlockFilter->count(Pred)) ||
1513  PredChain == &Chain || Pred != *std::prev(PredChain->end()) ||
1514  // This check is redundant except for look ahead. This function is
1515  // called for lookahead by isProfitableToTailDup when BB hasn't been
1516  // placed yet.
1517  (Pred == BB))
1518  continue;
1519  // Do backward checking.
1520  // For all cases above, we need a backward checking to filter out edges that
1521  // are not 'strongly' biased.
1522  // BB Pred
1523  // \ /
1524  // Succ
1525  // We select edge BB->Succ if
1526  // freq(BB->Succ) > freq(Succ) * HotProb
1527  // i.e. freq(BB->Succ) > freq(BB->Succ) * HotProb + freq(Pred->Succ) *
1528  // HotProb
1529  // i.e. freq((BB->Succ) * (1 - HotProb) > freq(Pred->Succ) * HotProb
1530  // Case 1 is covered too, because the first equation reduces to:
1531  // prob(BB->Succ) > HotProb. (freq(Succ) = freq(BB) for a triangle)
1532  BlockFrequency PredEdgeFreq =
1533  MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Succ);
1534  if (PredEdgeFreq * HotProb >= CandidateEdgeFreq * HotProb.getCompl()) {
1535  BadCFGConflict = true;
1536  break;
1537  }
1538  }
1539 
1540  if (BadCFGConflict) {
1541  LLVM_DEBUG(dbgs() << " Not a candidate: " << getBlockName(Succ) << " -> "
1542  << SuccProb << " (prob) (non-cold CFG conflict)\n");
1543  return true;
1544  }
1545 
1546  return false;
1547 }
1548 
1549 /// Select the best successor for a block.
1550 ///
1551 /// This looks across all successors of a particular block and attempts to
1552 /// select the "best" one to be the layout successor. It only considers direct
1553 /// successors which also pass the block filter. It will attempt to avoid
1554 /// breaking CFG structure, but cave and break such structures in the case of
1555 /// very hot successor edges.
1556 ///
1557 /// \returns The best successor block found, or null if none are viable, along
1558 /// with a boolean indicating if tail duplication is necessary.
1559 MachineBlockPlacement::BlockAndTailDupResult
1560 MachineBlockPlacement::selectBestSuccessor(
1561  const MachineBasicBlock *BB, const BlockChain &Chain,
1562  const BlockFilterSet *BlockFilter) {
1563  const BranchProbability HotProb(StaticLikelyProb, 100);
1564 
1565  BlockAndTailDupResult BestSucc = { nullptr, false };
1566  auto BestProb = BranchProbability::getZero();
1567 
1569  auto AdjustedSumProb =
1570  collectViableSuccessors(BB, Chain, BlockFilter, Successors);
1571 
1572  LLVM_DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB)
1573  << "\n");
1574 
1575  // if we already precomputed the best successor for BB, return that if still
1576  // applicable.
1577  auto FoundEdge = ComputedEdges.find(BB);
1578  if (FoundEdge != ComputedEdges.end()) {
1579  MachineBasicBlock *Succ = FoundEdge->second.BB;
1580  ComputedEdges.erase(FoundEdge);
1581  BlockChain *SuccChain = BlockToChain[Succ];
1582  if (BB->isSuccessor(Succ) && (!BlockFilter || BlockFilter->count(Succ)) &&
1583  SuccChain != &Chain && Succ == *SuccChain->begin())
1584  return FoundEdge->second;
1585  }
1586 
1587  // if BB is part of a trellis, Use the trellis to determine the optimal
1588  // fallthrough edges
1589  if (isTrellis(BB, Successors, Chain, BlockFilter))
1590  return getBestTrellisSuccessor(BB, Successors, AdjustedSumProb, Chain,
1591  BlockFilter);
1592 
1593  // For blocks with CFG violations, we may be able to lay them out anyway with
1594  // tail-duplication. We keep this vector so we can perform the probability
1595  // calculations the minimum number of times.
1597  DupCandidates;
1598  for (MachineBasicBlock *Succ : Successors) {
1599  auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);
1600  BranchProbability SuccProb =
1601  getAdjustedProbability(RealSuccProb, AdjustedSumProb);
1602 
1603  BlockChain &SuccChain = *BlockToChain[Succ];
1604  // Skip the edge \c BB->Succ if block \c Succ has a better layout
1605  // predecessor that yields lower global cost.
1606  if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb,
1607  Chain, BlockFilter)) {
1608  // If tail duplication would make Succ profitable, place it.
1609  if (allowTailDupPlacement() && shouldTailDuplicate(Succ))
1610  DupCandidates.emplace_back(SuccProb, Succ);
1611  continue;
1612  }
1613 
1614  LLVM_DEBUG(
1615  dbgs() << " Candidate: " << getBlockName(Succ)
1616  << ", probability: " << SuccProb
1617  << (SuccChain.UnscheduledPredecessors != 0 ? " (CFG break)" : "")
1618  << "\n");
1619 
1620  if (BestSucc.BB && BestProb >= SuccProb) {
1621  LLVM_DEBUG(dbgs() << " Not the best candidate, continuing\n");
1622  continue;
1623  }
1624 
1625  LLVM_DEBUG(dbgs() << " Setting it as best candidate\n");
1626  BestSucc.BB = Succ;
1627  BestProb = SuccProb;
1628  }
1629  // Handle the tail duplication candidates in order of decreasing probability.
1630  // Stop at the first one that is profitable. Also stop if they are less
1631  // profitable than BestSucc. Position is important because we preserve it and
1632  // prefer first best match. Here we aren't comparing in order, so we capture
1633  // the position instead.
1634  llvm::stable_sort(DupCandidates,
1635  [](std::tuple<BranchProbability, MachineBasicBlock *> L,
1636  std::tuple<BranchProbability, MachineBasicBlock *> R) {
1637  return std::get<0>(L) > std::get<0>(R);
1638  });
1639  for (auto &Tup : DupCandidates) {
1640  BranchProbability DupProb;
1641  MachineBasicBlock *Succ;
1642  std::tie(DupProb, Succ) = Tup;
1643  if (DupProb < BestProb)
1644  break;
1645  if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter)
1646  && (isProfitableToTailDup(BB, Succ, BestProb, Chain, BlockFilter))) {
1647  LLVM_DEBUG(dbgs() << " Candidate: " << getBlockName(Succ)
1648  << ", probability: " << DupProb
1649  << " (Tail Duplicate)\n");
1650  BestSucc.BB = Succ;
1651  BestSucc.ShouldTailDup = true;
1652  break;
1653  }
1654  }
1655 
1656  if (BestSucc.BB)
1657  LLVM_DEBUG(dbgs() << " Selected: " << getBlockName(BestSucc.BB) << "\n");
1658 
1659  return BestSucc;
1660 }
1661 
1662 /// Select the best block from a worklist.
1663 ///
1664 /// This looks through the provided worklist as a list of candidate basic
1665 /// blocks and select the most profitable one to place. The definition of
1666 /// profitable only really makes sense in the context of a loop. This returns
1667 /// the most frequently visited block in the worklist, which in the case of
1668 /// a loop, is the one most desirable to be physically close to the rest of the
1669 /// loop body in order to improve i-cache behavior.
1670 ///
1671 /// \returns The best block found, or null if none are viable.
1672 MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
1673  const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) {
1674  // Once we need to walk the worklist looking for a candidate, cleanup the
1675  // worklist of already placed entries.
1676  // FIXME: If this shows up on profiles, it could be folded (at the cost of
1677  // some code complexity) into the loop below.
1678  llvm::erase_if(WorkList, [&](MachineBasicBlock *BB) {
1679  return BlockToChain.lookup(BB) == &Chain;
1680  });
1681 
1682  if (WorkList.empty())
1683  return nullptr;
1684 
1685  bool IsEHPad = WorkList[0]->isEHPad();
1686 
1687  MachineBasicBlock *BestBlock = nullptr;
1688  BlockFrequency BestFreq;
1689  for (MachineBasicBlock *MBB : WorkList) {
1690  assert(MBB->isEHPad() == IsEHPad &&
1691  "EHPad mismatch between block and work list.");
1692 
1693  BlockChain &SuccChain = *BlockToChain[MBB];
1694  if (&SuccChain == &Chain)
1695  continue;
1696 
1697  assert(SuccChain.UnscheduledPredecessors == 0 &&
1698  "Found CFG-violating block");
1699 
1700  BlockFrequency CandidateFreq = MBFI->getBlockFreq(MBB);
1701  LLVM_DEBUG(dbgs() << " " << getBlockName(MBB) << " -> ";
1702  MBFI->printBlockFreq(dbgs(), CandidateFreq) << " (freq)\n");
1703 
1704  // For ehpad, we layout the least probable first as to avoid jumping back
1705  // from least probable landingpads to more probable ones.
1706  //
1707  // FIXME: Using probability is probably (!) not the best way to achieve
1708  // this. We should probably have a more principled approach to layout
1709  // cleanup code.
1710  //
1711  // The goal is to get:
1712  //
1713  // +--------------------------+
1714  // | V
1715  // InnerLp -> InnerCleanup OuterLp -> OuterCleanup -> Resume
1716  //
1717  // Rather than:
1718  //
1719  // +-------------------------------------+
1720  // V |
1721  // OuterLp -> OuterCleanup -> Resume InnerLp -> InnerCleanup
1722  if (BestBlock && (IsEHPad ^ (BestFreq >= CandidateFreq)))
1723  continue;
1724 
1725  BestBlock = MBB;
1726  BestFreq = CandidateFreq;
1727  }
1728 
1729  return BestBlock;
1730 }
1731 
1732 /// Retrieve the first unplaced basic block.
1733 ///
1734 /// This routine is called when we are unable to use the CFG to walk through
1735 /// all of the basic blocks and form a chain due to unnatural loops in the CFG.
1736 /// We walk through the function's blocks in order, starting from the
1737 /// LastUnplacedBlockIt. We update this iterator on each call to avoid
1738 /// re-scanning the entire sequence on repeated calls to this routine.
1739 MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
1740  const BlockChain &PlacedChain,
1741  MachineFunction::iterator &PrevUnplacedBlockIt,
1742  const BlockFilterSet *BlockFilter) {
1743  for (MachineFunction::iterator I = PrevUnplacedBlockIt, E = F->end(); I != E;
1744  ++I) {
1745  if (BlockFilter && !BlockFilter->count(&*I))
1746  continue;
1747  if (BlockToChain[&*I] != &PlacedChain) {
1748  PrevUnplacedBlockIt = I;
1749  // Now select the head of the chain to which the unplaced block belongs
1750  // as the block to place. This will force the entire chain to be placed,
1751  // and satisfies the requirements of merging chains.
1752  return *BlockToChain[&*I]->begin();
1753  }
1754  }
1755  return nullptr;
1756 }
1757 
1758 void MachineBlockPlacement::fillWorkLists(
1759  const MachineBasicBlock *MBB,
1760  SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
1761  const BlockFilterSet *BlockFilter = nullptr) {
1762  BlockChain &Chain = *BlockToChain[MBB];
1763  if (!UpdatedPreds.insert(&Chain).second)
1764  return;
1765 
1766  assert(
1767  Chain.UnscheduledPredecessors == 0 &&
1768  "Attempting to place block with unscheduled predecessors in worklist.");
1769  for (MachineBasicBlock *ChainBB : Chain) {
1770  assert(BlockToChain[ChainBB] == &Chain &&
1771  "Block in chain doesn't match BlockToChain map.");
1772  for (MachineBasicBlock *Pred : ChainBB->predecessors()) {
1773  if (BlockFilter && !BlockFilter->count(Pred))
1774  continue;
1775  if (BlockToChain[Pred] == &Chain)
1776  continue;
1777  ++Chain.UnscheduledPredecessors;
1778  }
1779  }
1780 
1781  if (Chain.UnscheduledPredecessors != 0)
1782  return;
1783 
1784  MachineBasicBlock *BB = *Chain.begin();
1785  if (BB->isEHPad())
1786  EHPadWorkList.push_back(BB);
1787  else
1788  BlockWorkList.push_back(BB);
1789 }
1790 
1791 void MachineBlockPlacement::buildChain(
1792  const MachineBasicBlock *HeadBB, BlockChain &Chain,
1793  BlockFilterSet *BlockFilter) {
1794  assert(HeadBB && "BB must not be null.\n");
1795  assert(BlockToChain[HeadBB] == &Chain && "BlockToChainMap mis-match.\n");
1796  MachineFunction::iterator PrevUnplacedBlockIt = F->begin();
1797 
1798  const MachineBasicBlock *LoopHeaderBB = HeadBB;
1799  markChainSuccessors(Chain, LoopHeaderBB, BlockFilter);
1800  MachineBasicBlock *BB = *std::prev(Chain.end());
1801  while (true) {
1802  assert(BB && "null block found at end of chain in loop.");
1803  assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match in loop.");
1804  assert(*std::prev(Chain.end()) == BB && "BB Not found at end of chain.");
1805 
1806 
1807  // Look for the best viable successor if there is one to place immediately
1808  // after this block.
1809  auto Result = selectBestSuccessor(BB, Chain, BlockFilter);
1810  MachineBasicBlock* BestSucc = Result.BB;
1811  bool ShouldTailDup = Result.ShouldTailDup;
1812  if (allowTailDupPlacement())
1813  ShouldTailDup |= (BestSucc && canTailDuplicateUnplacedPreds(BB, BestSucc,
1814  Chain,
1815  BlockFilter));
1816 
1817  // If an immediate successor isn't available, look for the best viable
1818  // block among those we've identified as not violating the loop's CFG at
1819  // this point. This won't be a fallthrough, but it will increase locality.
1820  if (!BestSucc)
1821  BestSucc = selectBestCandidateBlock(Chain, BlockWorkList);
1822  if (!BestSucc)
1823  BestSucc = selectBestCandidateBlock(Chain, EHPadWorkList);
1824 
1825  if (!BestSucc) {
1826  BestSucc = getFirstUnplacedBlock(Chain, PrevUnplacedBlockIt, BlockFilter);
1827  if (!BestSucc)
1828  break;
1829 
1830  LLVM_DEBUG(dbgs() << "Unnatural loop CFG detected, forcibly merging the "
1831  "layout successor until the CFG reduces\n");
1832  }
1833 
1834  // Placement may have changed tail duplication opportunities.
1835  // Check for that now.
1836  if (allowTailDupPlacement() && BestSucc && ShouldTailDup) {
1837  repeatedlyTailDuplicateBlock(BestSucc, BB, LoopHeaderBB, Chain,
1838  BlockFilter, PrevUnplacedBlockIt);
1839  // If the chosen successor was duplicated into BB, don't bother laying
1840  // it out, just go round the loop again with BB as the chain end.
1841  if (!BB->isSuccessor(BestSucc))
1842  continue;
1843  }
1844 
1845  // Place this block, updating the datastructures to reflect its placement.
1846  BlockChain &SuccChain = *BlockToChain[BestSucc];
1847  // Zero out UnscheduledPredecessors for the successor we're about to merge in case
1848  // we selected a successor that didn't fit naturally into the CFG.
1849  SuccChain.UnscheduledPredecessors = 0;
1850  LLVM_DEBUG(dbgs() << "Merging from " << getBlockName(BB) << " to "
1851  << getBlockName(BestSucc) << "\n");
1852  markChainSuccessors(SuccChain, LoopHeaderBB, BlockFilter);
1853  Chain.merge(BestSucc, &SuccChain);
1854  BB = *std::prev(Chain.end());
1855  }
1856 
1857  LLVM_DEBUG(dbgs() << "Finished forming chain for header block "
1858  << getBlockName(*Chain.begin()) << "\n");
1859 }
1860 
1861 // If bottom of block BB has only one successor OldTop, in most cases it is
1862 // profitable to move it before OldTop, except the following case:
1863 //
1864 // -->OldTop<-
1865 // | . |
1866 // | . |
1867 // | . |
1868 // ---Pred |
1869 // | |
1870 // BB-----
1871 //
1872 // If BB is moved before OldTop, Pred needs a taken branch to BB, and it can't
1873 // layout the other successor below it, so it can't reduce taken branch.
1874 // In this case we keep its original layout.
1875 bool
1876 MachineBlockPlacement::canMoveBottomBlockToTop(
1877  const MachineBasicBlock *BottomBlock,
1878  const MachineBasicBlock *OldTop) {
1879  if (BottomBlock->pred_size() != 1)
1880  return true;
1881  MachineBasicBlock *Pred = *BottomBlock->pred_begin();
1882  if (Pred->succ_size() != 2)
1883  return true;
1884 
1885  MachineBasicBlock *OtherBB = *Pred->succ_begin();
1886  if (OtherBB == BottomBlock)
1887  OtherBB = *Pred->succ_rbegin();
1888  if (OtherBB == OldTop)
1889  return false;
1890 
1891  return true;
1892 }
1893 
1894 // Find out the possible fall through frequence to the top of a loop.
1896 MachineBlockPlacement::TopFallThroughFreq(
1897  const MachineBasicBlock *Top,
1898  const BlockFilterSet &LoopBlockSet) {
1899  BlockFrequency MaxFreq = 0;
1900  for (MachineBasicBlock *Pred : Top->predecessors()) {
1901  BlockChain *PredChain = BlockToChain[Pred];
1902  if (!LoopBlockSet.count(Pred) &&
1903  (!PredChain || Pred == *std::prev(PredChain->end()))) {
1904  // Found a Pred block can be placed before Top.
1905  // Check if Top is the best successor of Pred.
1906  auto TopProb = MBPI->getEdgeProbability(Pred, Top);
1907  bool TopOK = true;
1908  for (MachineBasicBlock *Succ : Pred->successors()) {
1909  auto SuccProb = MBPI->getEdgeProbability(Pred, Succ);
1910  BlockChain *SuccChain = BlockToChain[Succ];
1911  // Check if Succ can be placed after Pred.
1912  // Succ should not be in any chain, or it is the head of some chain.
1913  if (!LoopBlockSet.count(Succ) && (SuccProb > TopProb) &&
1914  (!SuccChain || Succ == *SuccChain->begin())) {
1915  TopOK = false;
1916  break;
1917  }
1918  }
1919  if (TopOK) {
1920  BlockFrequency EdgeFreq = MBFI->getBlockFreq(Pred) *
1921  MBPI->getEdgeProbability(Pred, Top);
1922  if (EdgeFreq > MaxFreq)
1923  MaxFreq = EdgeFreq;
1924  }
1925  }
1926  }
1927  return MaxFreq;
1928 }
1929 
1930 // Compute the fall through gains when move NewTop before OldTop.
1931 //
1932 // In following diagram, edges marked as "-" are reduced fallthrough, edges
1933 // marked as "+" are increased fallthrough, this function computes
1934 //
1935 // SUM(increased fallthrough) - SUM(decreased fallthrough)
1936 //
1937 // |
1938 // | -
1939 // V
1940 // --->OldTop
1941 // | .
1942 // | .
1943 // +| . +
1944 // | Pred --->
1945 // | |-
1946 // | V
1947 // --- NewTop <---
1948 // |-
1949 // V
1950 //
1952 MachineBlockPlacement::FallThroughGains(
1953  const MachineBasicBlock *NewTop,
1954  const MachineBasicBlock *OldTop,
1955  const MachineBasicBlock *ExitBB,
1956  const BlockFilterSet &LoopBlockSet) {
1957  BlockFrequency FallThrough2Top = TopFallThroughFreq(OldTop, LoopBlockSet);
1958  BlockFrequency FallThrough2Exit = 0;
1959  if (ExitBB)
1960  FallThrough2Exit = MBFI->getBlockFreq(NewTop) *
1961  MBPI->getEdgeProbability(NewTop, ExitBB);
1962  BlockFrequency BackEdgeFreq = MBFI->getBlockFreq(NewTop) *
1963  MBPI->getEdgeProbability(NewTop, OldTop);
1964 
1965  // Find the best Pred of NewTop.
1966  MachineBasicBlock *BestPred = nullptr;
1967  BlockFrequency FallThroughFromPred = 0;
1968  for (MachineBasicBlock *Pred : NewTop->predecessors()) {
1969  if (!LoopBlockSet.count(Pred))
1970  continue;
1971  BlockChain *PredChain = BlockToChain[Pred];
1972  if (!PredChain || Pred == *std::prev(PredChain->end())) {
1973  BlockFrequency EdgeFreq = MBFI->getBlockFreq(Pred) *
1974  MBPI->getEdgeProbability(Pred, NewTop);
1975  if (EdgeFreq > FallThroughFromPred) {
1976  FallThroughFromPred = EdgeFreq;
1977  BestPred = Pred;
1978  }
1979  }
1980  }
1981 
1982  // If NewTop is not placed after Pred, another successor can be placed
1983  // after Pred.
1984  BlockFrequency NewFreq = 0;
1985  if (BestPred) {
1986  for (MachineBasicBlock *Succ : BestPred->successors()) {
1987  if ((Succ == NewTop) || (Succ == BestPred) || !LoopBlockSet.count(Succ))
1988  continue;
1989  if (ComputedEdges.find(Succ) != ComputedEdges.end())
1990  continue;
1991  BlockChain *SuccChain = BlockToChain[Succ];
1992  if ((SuccChain && (Succ != *SuccChain->begin())) ||
1993  (SuccChain == BlockToChain[BestPred]))
1994  continue;
1995  BlockFrequency EdgeFreq = MBFI->getBlockFreq(BestPred) *
1996  MBPI->getEdgeProbability(BestPred, Succ);
1997  if (EdgeFreq > NewFreq)
1998  NewFreq = EdgeFreq;
1999  }
2000  BlockFrequency OrigEdgeFreq = MBFI->getBlockFreq(BestPred) *
2001  MBPI->getEdgeProbability(BestPred, NewTop);
2002  if (NewFreq > OrigEdgeFreq) {
2003  // If NewTop is not the best successor of Pred, then Pred doesn't
2004  // fallthrough to NewTop. So there is no FallThroughFromPred and
2005  // NewFreq.
2006  NewFreq = 0;
2007  FallThroughFromPred = 0;
2008  }
2009  }
2010 
2011  BlockFrequency Result = 0;
2012  BlockFrequency Gains = BackEdgeFreq + NewFreq;
2013  BlockFrequency Lost = FallThrough2Top + FallThrough2Exit +
2014  FallThroughFromPred;
2015  if (Gains > Lost)
2016  Result = Gains - Lost;
2017  return Result;
2018 }
2019 
2020 /// Helper function of findBestLoopTop. Find the best loop top block
2021 /// from predecessors of old top.
2022 ///
2023 /// Look for a block which is strictly better than the old top for laying
2024 /// out before the old top of the loop. This looks for only two patterns:
2025 ///
2026 /// 1. a block has only one successor, the old loop top
2027 ///
2028 /// Because such a block will always result in an unconditional jump,
2029 /// rotating it in front of the old top is always profitable.
2030 ///
2031 /// 2. a block has two successors, one is old top, another is exit
2032 /// and it has more than one predecessors
2033 ///
2034 /// If it is below one of its predecessors P, only P can fall through to
2035 /// it, all other predecessors need a jump to it, and another conditional
2036 /// jump to loop header. If it is moved before loop header, all its
2037 /// predecessors jump to it, then fall through to loop header. So all its
2038 /// predecessors except P can reduce one taken branch.
2039 /// At the same time, move it before old top increases the taken branch
2040 /// to loop exit block, so the reduced taken branch will be compared with
2041 /// the increased taken branch to the loop exit block.
2043 MachineBlockPlacement::findBestLoopTopHelper(
2044  MachineBasicBlock *OldTop,
2045  const MachineLoop &L,
2046  const BlockFilterSet &LoopBlockSet) {
2047  // Check that the header hasn't been fused with a preheader block due to
2048  // crazy branches. If it has, we need to start with the header at the top to
2049  // prevent pulling the preheader into the loop body.
2050  BlockChain &HeaderChain = *BlockToChain[OldTop];
2051  if (!LoopBlockSet.count(*HeaderChain.begin()))
2052  return OldTop;
2053 
2054  LLVM_DEBUG(dbgs() << "Finding best loop top for: " << getBlockName(OldTop)
2055  << "\n");
2056 
2057  BlockFrequency BestGains = 0;
2058  MachineBasicBlock *BestPred = nullptr;
2059  for (MachineBasicBlock *Pred : OldTop->predecessors()) {
2060  if (!LoopBlockSet.count(Pred))
2061  continue;
2062  if (Pred == L.getHeader())
2063  continue;
2064  LLVM_DEBUG(dbgs() << " old top pred: " << getBlockName(Pred) << ", has "
2065  << Pred->succ_size() << " successors, ";
2066  MBFI->printBlockFreq(dbgs(), Pred) << " freq\n");
2067  if (Pred->succ_size() > 2)
2068  continue;
2069 
2070  MachineBasicBlock *OtherBB = nullptr;
2071  if (Pred->succ_size() == 2) {
2072  OtherBB = *Pred->succ_begin();
2073  if (OtherBB == OldTop)
2074  OtherBB = *Pred->succ_rbegin();
2075  }
2076 
2077  if (!canMoveBottomBlockToTop(Pred, OldTop))
2078  continue;
2079 
2080  BlockFrequency Gains = FallThroughGains(Pred, OldTop, OtherBB,
2081  LoopBlockSet);
2082  if ((Gains > 0) && (Gains > BestGains ||
2083  ((Gains == BestGains) && Pred->isLayoutSuccessor(OldTop)))) {
2084  BestPred = Pred;
2085  BestGains = Gains;
2086  }
2087  }
2088 
2089  // If no direct predecessor is fine, just use the loop header.
2090  if (!BestPred) {
2091  LLVM_DEBUG(dbgs() << " final top unchanged\n");
2092  return OldTop;
2093  }
2094 
2095  // Walk backwards through any straight line of predecessors.
2096  while (BestPred->pred_size() == 1 &&
2097  (*BestPred->pred_begin())->succ_size() == 1 &&
2098  *BestPred->pred_begin() != L.getHeader())
2099  BestPred = *BestPred->pred_begin();
2100 
2101  LLVM_DEBUG(dbgs() << " final top: " << getBlockName(BestPred) << "\n");
2102  return BestPred;
2103 }
2104 
2105 /// Find the best loop top block for layout.
2106 ///
2107 /// This function iteratively calls findBestLoopTopHelper, until no new better
2108 /// BB can be found.
2110 MachineBlockPlacement::findBestLoopTop(const MachineLoop &L,
2111  const BlockFilterSet &LoopBlockSet) {
2112  // Placing the latch block before the header may introduce an extra branch
2113  // that skips this block the first time the loop is executed, which we want
2114  // to avoid when optimising for size.
2115  // FIXME: in theory there is a case that does not introduce a new branch,
2116  // i.e. when the layout predecessor does not fallthrough to the loop header.
2117  // In practice this never happens though: there always seems to be a preheader
2118  // that can fallthrough and that is also placed before the header.
2119  bool OptForSize = F->getFunction().hasOptSize() ||
2120  llvm::shouldOptimizeForSize(L.getHeader(), PSI, MBFI.get());
2121  if (OptForSize)
2122  return L.getHeader();
2123 
2124  MachineBasicBlock *OldTop = nullptr;
2125  MachineBasicBlock *NewTop = L.getHeader();
2126  while (NewTop != OldTop) {
2127  OldTop = NewTop;
2128  NewTop = findBestLoopTopHelper(OldTop, L, LoopBlockSet);
2129  if (NewTop != OldTop)
2130  ComputedEdges[NewTop] = { OldTop, false };
2131  }
2132  return NewTop;
2133 }
2134 
2135 /// Find the best loop exiting block for layout.
2136 ///
2137 /// This routine implements the logic to analyze the loop looking for the best
2138 /// block to layout at the top of the loop. Typically this is done to maximize
2139 /// fallthrough opportunities.
2141 MachineBlockPlacement::findBestLoopExit(const MachineLoop &L,
2142  const BlockFilterSet &LoopBlockSet,
2143  BlockFrequency &ExitFreq) {
2144  // We don't want to layout the loop linearly in all cases. If the loop header
2145  // is just a normal basic block in the loop, we want to look for what block
2146  // within the loop is the best one to layout at the top. However, if the loop
2147  // header has be pre-merged into a chain due to predecessors not having
2148  // analyzable branches, *and* the predecessor it is merged with is *not* part
2149  // of the loop, rotating the header into the middle of the loop will create
2150  // a non-contiguous range of blocks which is Very Bad. So start with the
2151  // header and only rotate if safe.
2152  BlockChain &HeaderChain = *BlockToChain[L.getHeader()];
2153  if (!LoopBlockSet.count(*HeaderChain.begin()))
2154  return nullptr;
2155 
2156  BlockFrequency BestExitEdgeFreq;
2157  unsigned BestExitLoopDepth = 0;
2158  MachineBasicBlock *ExitingBB = nullptr;
2159  // If there are exits to outer loops, loop rotation can severely limit
2160  // fallthrough opportunities unless it selects such an exit. Keep a set of
2161  // blocks where rotating to exit with that block will reach an outer loop.
2162  SmallPtrSet<MachineBasicBlock *, 4> BlocksExitingToOuterLoop;
2163 
2164  LLVM_DEBUG(dbgs() << "Finding best loop exit for: "
2165  << getBlockName(L.getHeader()) << "\n");
2166  for (MachineBasicBlock *MBB : L.getBlocks()) {
2167  BlockChain &Chain = *BlockToChain[MBB];
2168  // Ensure that this block is at the end of a chain; otherwise it could be
2169  // mid-way through an inner loop or a successor of an unanalyzable branch.
2170  if (MBB != *std::prev(Chain.end()))
2171  continue;
2172 
2173  // Now walk the successors. We need to establish whether this has a viable
2174  // exiting successor and whether it has a viable non-exiting successor.
2175  // We store the old exiting state and restore it if a viable looping
2176  // successor isn't found.
2177  MachineBasicBlock *OldExitingBB = ExitingBB;
2178  BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq;
2179  bool HasLoopingSucc = false;
2180  for (MachineBasicBlock *Succ : MBB->successors()) {
2181  if (Succ->isEHPad())
2182  continue;
2183  if (Succ == MBB)
2184  continue;
2185  BlockChain &SuccChain = *BlockToChain[Succ];
2186  // Don't split chains, either this chain or the successor's chain.
2187  if (&Chain == &SuccChain) {
2188  LLVM_DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
2189  << getBlockName(Succ) << " (chain conflict)\n");
2190  continue;
2191  }
2192 
2193  auto SuccProb = MBPI->getEdgeProbability(MBB, Succ);
2194  if (LoopBlockSet.count(Succ)) {
2195  LLVM_DEBUG(dbgs() << " looping: " << getBlockName(MBB) << " -> "
2196  << getBlockName(Succ) << " (" << SuccProb << ")\n");
2197  HasLoopingSucc = true;
2198  continue;
2199  }
2200 
2201  unsigned SuccLoopDepth = 0;
2202  if (MachineLoop *ExitLoop = MLI->getLoopFor(Succ)) {
2203  SuccLoopDepth = ExitLoop->getLoopDepth();
2204  if (ExitLoop->contains(&L))
2205  BlocksExitingToOuterLoop.insert(MBB);
2206  }
2207 
2208  BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(MBB) * SuccProb;
2209  LLVM_DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
2210  << getBlockName(Succ) << " [L:" << SuccLoopDepth
2211  << "] (";
2212  MBFI->printBlockFreq(dbgs(), ExitEdgeFreq) << ")\n");
2213  // Note that we bias this toward an existing layout successor to retain
2214  // incoming order in the absence of better information. The exit must have
2215  // a frequency higher than the current exit before we consider breaking
2216  // the layout.
2217  BranchProbability Bias(100 - ExitBlockBias, 100);
2218  if (!ExitingBB || SuccLoopDepth > BestExitLoopDepth ||
2219  ExitEdgeFreq > BestExitEdgeFreq ||
2220  (MBB->isLayoutSuccessor(Succ) &&
2221  !(ExitEdgeFreq < BestExitEdgeFreq * Bias))) {
2222  BestExitEdgeFreq = ExitEdgeFreq;
2223  ExitingBB = MBB;
2224  }
2225  }
2226 
2227  if (!HasLoopingSucc) {
2228  // Restore the old exiting state, no viable looping successor was found.
2229  ExitingBB = OldExitingBB;
2230  BestExitEdgeFreq = OldBestExitEdgeFreq;
2231  }
2232  }
2233  // Without a candidate exiting block or with only a single block in the
2234  // loop, just use the loop header to layout the loop.
2235  if (!ExitingBB) {
2236  LLVM_DEBUG(
2237  dbgs() << " No other candidate exit blocks, using loop header\n");
2238  return nullptr;
2239  }
2240  if (L.getNumBlocks() == 1) {
2241  LLVM_DEBUG(dbgs() << " Loop has 1 block, using loop header as exit\n");
2242  return nullptr;
2243  }
2244 
2245  // Also, if we have exit blocks which lead to outer loops but didn't select
2246  // one of them as the exiting block we are rotating toward, disable loop
2247  // rotation altogether.
2248  if (!BlocksExitingToOuterLoop.empty() &&
2249  !BlocksExitingToOuterLoop.count(ExitingBB))
2250  return nullptr;
2251 
2252  LLVM_DEBUG(dbgs() << " Best exiting block: " << getBlockName(ExitingBB)
2253  << "\n");
2254  ExitFreq = BestExitEdgeFreq;
2255  return ExitingBB;
2256 }
2257 
2258 /// Check if there is a fallthrough to loop header Top.
2259 ///
2260 /// 1. Look for a Pred that can be layout before Top.
2261 /// 2. Check if Top is the most possible successor of Pred.
2262 bool
2263 MachineBlockPlacement::hasViableTopFallthrough(
2264  const MachineBasicBlock *Top,
2265  const BlockFilterSet &LoopBlockSet) {
2266  for (MachineBasicBlock *Pred : Top->predecessors()) {
2267  BlockChain *PredChain = BlockToChain[Pred];
2268  if (!LoopBlockSet.count(Pred) &&
2269  (!PredChain || Pred == *std::prev(PredChain->end()))) {
2270  // Found a Pred block can be placed before Top.
2271  // Check if Top is the best successor of Pred.
2272  auto TopProb = MBPI->getEdgeProbability(Pred, Top);
2273  bool TopOK = true;
2274  for (MachineBasicBlock *Succ : Pred->successors()) {
2275  auto SuccProb = MBPI->getEdgeProbability(Pred, Succ);
2276  BlockChain *SuccChain = BlockToChain[Succ];
2277  // Check if Succ can be placed after Pred.
2278  // Succ should not be in any chain, or it is the head of some chain.
2279  if ((!SuccChain || Succ == *SuccChain->begin()) && SuccProb > TopProb) {
2280  TopOK = false;
2281  break;
2282  }
2283  }
2284  if (TopOK)
2285  return true;
2286  }
2287  }
2288  return false;
2289 }
2290 
2291 /// Attempt to rotate an exiting block to the bottom of the loop.
2292 ///
2293 /// Once we have built a chain, try to rotate it to line up the hot exit block
2294 /// with fallthrough out of the loop if doing so doesn't introduce unnecessary
2295 /// branches. For example, if the loop has fallthrough into its header and out
2296 /// of its bottom already, don't rotate it.
2297 void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
2298  const MachineBasicBlock *ExitingBB,
2299  BlockFrequency ExitFreq,
2300  const BlockFilterSet &LoopBlockSet) {
2301  if (!ExitingBB)
2302  return;
2303 
2304  MachineBasicBlock *Top = *LoopChain.begin();
2305  MachineBasicBlock *Bottom = *std::prev(LoopChain.end());
2306 
2307  // If ExitingBB is already the last one in a chain then nothing to do.
2308  if (Bottom == ExitingBB)
2309  return;
2310 
2311  // The entry block should always be the first BB in a function.
2312  if (Top->isEntryBlock())
2313  return;
2314 
2315  bool ViableTopFallthrough = hasViableTopFallthrough(Top, LoopBlockSet);
2316 
2317  // If the header has viable fallthrough, check whether the current loop
2318  // bottom is a viable exiting block. If so, bail out as rotating will
2319  // introduce an unnecessary branch.
2320  if (ViableTopFallthrough) {
2321  for (MachineBasicBlock *Succ : Bottom->successors()) {
2322  BlockChain *SuccChain = BlockToChain[Succ];
2323  if (!LoopBlockSet.count(Succ) &&
2324  (!SuccChain || Succ == *SuccChain->begin()))
2325  return;
2326  }
2327 
2328  // Rotate will destroy the top fallthrough, we need to ensure the new exit
2329  // frequency is larger than top fallthrough.
2330  BlockFrequency FallThrough2Top = TopFallThroughFreq(Top, LoopBlockSet);
2331  if (FallThrough2Top >= ExitFreq)
2332  return;
2333  }
2334 
2335  BlockChain::iterator ExitIt = llvm::find(LoopChain, ExitingBB);
2336  if (ExitIt == LoopChain.end())
2337  return;
2338 
2339  // Rotating a loop exit to the bottom when there is a fallthrough to top
2340  // trades the entry fallthrough for an exit fallthrough.
2341  // If there is no bottom->top edge, but the chosen exit block does have
2342  // a fallthrough, we break that fallthrough for nothing in return.
2343 
2344  // Let's consider an example. We have a built chain of basic blocks
2345  // B1, B2, ..., Bn, where Bk is a ExitingBB - chosen exit block.
2346  // By doing a rotation we get
2347  // Bk+1, ..., Bn, B1, ..., Bk
2348  // Break of fallthrough to B1 is compensated by a fallthrough from Bk.
2349  // If we had a fallthrough Bk -> Bk+1 it is broken now.
2350  // It might be compensated by fallthrough Bn -> B1.
2351  // So we have a condition to avoid creation of extra branch by loop rotation.
2352  // All below must be true to avoid loop rotation:
2353  // If there is a fallthrough to top (B1)
2354  // There was fallthrough from chosen exit block (Bk) to next one (Bk+1)
2355  // There is no fallthrough from bottom (Bn) to top (B1).
2356  // Please note that there is no exit fallthrough from Bn because we checked it
2357  // above.
2358  if (ViableTopFallthrough) {
2359  assert(std::next(ExitIt) != LoopChain.end() &&
2360  "Exit should not be last BB");
2361  MachineBasicBlock *NextBlockInChain = *std::next(ExitIt);
2362  if (ExitingBB->isSuccessor(NextBlockInChain))
2363  if (!Bottom->isSuccessor(Top))
2364  return;
2365  }
2366 
2367  LLVM_DEBUG(dbgs() << "Rotating loop to put exit " << getBlockName(ExitingBB)
2368  << " at bottom\n");
2369  std::rotate(LoopChain.begin(), std::next(ExitIt), LoopChain.end());
2370 }
2371 
2372 /// Attempt to rotate a loop based on profile data to reduce branch cost.
2373 ///
2374 /// With profile data, we can determine the cost in terms of missed fall through
2375 /// opportunities when rotating a loop chain and select the best rotation.
2376 /// Basically, there are three kinds of cost to consider for each rotation:
2377 /// 1. The possibly missed fall through edge (if it exists) from BB out of
2378 /// the loop to the loop header.
2379 /// 2. The possibly missed fall through edges (if they exist) from the loop
2380 /// exits to BB out of the loop.
2381 /// 3. The missed fall through edge (if it exists) from the last BB to the
2382 /// first BB in the loop chain.
2383 /// Therefore, the cost for a given rotation is the sum of costs listed above.
2384 /// We select the best rotation with the smallest cost.
2385 void MachineBlockPlacement::rotateLoopWithProfile(
2386  BlockChain &LoopChain, const MachineLoop &L,
2387  const BlockFilterSet &LoopBlockSet) {
2388  auto RotationPos = LoopChain.end();
2389  MachineBasicBlock *ChainHeaderBB = *LoopChain.begin();
2390 
2391  // The entry block should always be the first BB in a function.
2392  if (ChainHeaderBB->isEntryBlock())
2393  return;
2394 
2395  BlockFrequency SmallestRotationCost = BlockFrequency::getMaxFrequency();
2396 
2397  // A utility lambda that scales up a block frequency by dividing it by a
2398  // branch probability which is the reciprocal of the scale.
2399  auto ScaleBlockFrequency = [](BlockFrequency Freq,
2400  unsigned Scale) -> BlockFrequency {
2401  if (Scale == 0)
2402  return 0;
2403  // Use operator / between BlockFrequency and BranchProbability to implement
2404  // saturating multiplication.
2405  return Freq / BranchProbability(1, Scale);
2406  };
2407 
2408  // Compute the cost of the missed fall-through edge to the loop header if the
2409  // chain head is not the loop header. As we only consider natural loops with
2410  // single header, this computation can be done only once.
2411  BlockFrequency HeaderFallThroughCost(0);
2412  for (auto *Pred : ChainHeaderBB->predecessors()) {
2413  BlockChain *PredChain = BlockToChain[Pred];
2414  if (!LoopBlockSet.count(Pred) &&
2415  (!PredChain || Pred == *std::prev(PredChain->end()))) {
2416  auto EdgeFreq = MBFI->getBlockFreq(Pred) *
2417  MBPI->getEdgeProbability(Pred, ChainHeaderBB);
2418  auto FallThruCost = ScaleBlockFrequency(EdgeFreq, MisfetchCost);
2419  // If the predecessor has only an unconditional jump to the header, we
2420  // need to consider the cost of this jump.
2421  if (Pred->succ_size() == 1)
2422  FallThruCost += ScaleBlockFrequency(EdgeFreq, JumpInstCost);
2423  HeaderFallThroughCost = std::max(HeaderFallThroughCost, FallThruCost);
2424  }
2425  }
2426 
2427  // Here we collect all exit blocks in the loop, and for each exit we find out
2428  // its hottest exit edge. For each loop rotation, we define the loop exit cost
2429  // as the sum of frequencies of exit edges we collect here, excluding the exit
2430  // edge from the tail of the loop chain.
2432  for (auto BB : LoopChain) {
2433  auto LargestExitEdgeProb = BranchProbability::getZero();
2434  for (auto *Succ : BB->successors()) {
2435  BlockChain *SuccChain = BlockToChain[Succ];
2436  if (!LoopBlockSet.count(Succ) &&
2437  (!SuccChain || Succ == *SuccChain->begin())) {
2438  auto SuccProb = MBPI->getEdgeProbability(BB, Succ);
2439  LargestExitEdgeProb = std::max(LargestExitEdgeProb, SuccProb);
2440  }
2441  }
2442  if (LargestExitEdgeProb > BranchProbability::getZero()) {
2443  auto ExitFreq = MBFI->getBlockFreq(BB) * LargestExitEdgeProb;
2444  ExitsWithFreq.emplace_back(BB, ExitFreq);
2445  }
2446  }
2447 
2448  // In this loop we iterate every block in the loop chain and calculate the
2449  // cost assuming the block is the head of the loop chain. When the loop ends,
2450  // we should have found the best candidate as the loop chain's head.
2451  for (auto Iter = LoopChain.begin(), TailIter = std::prev(LoopChain.end()),
2452  EndIter = LoopChain.end();
2453  Iter != EndIter; Iter++, TailIter++) {
2454  // TailIter is used to track the tail of the loop chain if the block we are
2455  // checking (pointed by Iter) is the head of the chain.
2456  if (TailIter == LoopChain.end())
2457  TailIter = LoopChain.begin();
2458 
2459  auto TailBB = *TailIter;
2460 
2461  // Calculate the cost by putting this BB to the top.
2462  BlockFrequency Cost = 0;
2463 
2464  // If the current BB is the loop header, we need to take into account the
2465  // cost of the missed fall through edge from outside of the loop to the
2466  // header.
2467  if (Iter != LoopChain.begin())
2468  Cost += HeaderFallThroughCost;
2469 
2470  // Collect the loop exit cost by summing up frequencies of all exit edges
2471  // except the one from the chain tail.
2472  for (auto &ExitWithFreq : ExitsWithFreq)
2473  if (TailBB != ExitWithFreq.first)
2474  Cost += ExitWithFreq.second;
2475 
2476  // The cost of breaking the once fall-through edge from the tail to the top
2477  // of the loop chain. Here we need to consider three cases:
2478  // 1. If the tail node has only one successor, then we will get an
2479  // additional jmp instruction. So the cost here is (MisfetchCost +
2480  // JumpInstCost) * tail node frequency.
2481  // 2. If the tail node has two successors, then we may still get an
2482  // additional jmp instruction if the layout successor after the loop
2483  // chain is not its CFG successor. Note that the more frequently executed
2484  // jmp instruction will be put ahead of the other one. Assume the
2485  // frequency of those two branches are x and y, where x is the frequency
2486  // of the edge to the chain head, then the cost will be
2487  // (x * MisfetechCost + min(x, y) * JumpInstCost) * tail node frequency.
2488  // 3. If the tail node has more than two successors (this rarely happens),
2489  // we won't consider any additional cost.
2490  if (TailBB->isSuccessor(*Iter)) {
2491  auto TailBBFreq = MBFI->getBlockFreq(TailBB);
2492  if (TailBB->succ_size() == 1)
2493  Cost += ScaleBlockFrequency(TailBBFreq.getFrequency(),
2495  else if (TailBB->succ_size() == 2) {
2496  auto TailToHeadProb = MBPI->getEdgeProbability(TailBB, *Iter);
2497  auto TailToHeadFreq = TailBBFreq * TailToHeadProb;
2498  auto ColderEdgeFreq = TailToHeadProb > BranchProbability(1, 2)
2499  ? TailBBFreq * TailToHeadProb.getCompl()
2500  : TailToHeadFreq;
2501  Cost += ScaleBlockFrequency(TailToHeadFreq, MisfetchCost) +
2502  ScaleBlockFrequency(ColderEdgeFreq, JumpInstCost);
2503  }
2504  }
2505 
2506  LLVM_DEBUG(dbgs() << "The cost of loop rotation by making "
2507  << getBlockName(*Iter)
2508  << " to the top: " << Cost.getFrequency() << "\n");
2509 
2510  if (Cost < SmallestRotationCost) {
2511  SmallestRotationCost = Cost;
2512  RotationPos = Iter;
2513  }
2514  }
2515 
2516  if (RotationPos != LoopChain.end()) {
2517  LLVM_DEBUG(dbgs() << "Rotate loop by making " << getBlockName(*RotationPos)
2518  << " to the top\n");
2519  std::rotate(LoopChain.begin(), RotationPos, LoopChain.end());
2520  }
2521 }
2522 
2523 /// Collect blocks in the given loop that are to be placed.
2524 ///
2525 /// When profile data is available, exclude cold blocks from the returned set;
2526 /// otherwise, collect all blocks in the loop.
2528 MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) {
2529  BlockFilterSet LoopBlockSet;
2530 
2531  // Filter cold blocks off from LoopBlockSet when profile data is available.
2532  // Collect the sum of frequencies of incoming edges to the loop header from
2533  // outside. If we treat the loop as a super block, this is the frequency of
2534  // the loop. Then for each block in the loop, we calculate the ratio between
2535  // its frequency and the frequency of the loop block. When it is too small,
2536  // don't add it to the loop chain. If there are outer loops, then this block
2537  // will be merged into the first outer loop chain for which this block is not
2538  // cold anymore. This needs precise profile data and we only do this when
2539  // profile data is available.
2540  if (F->getFunction().hasProfileData() || ForceLoopColdBlock) {
2541  BlockFrequency LoopFreq(0);
2542  for (auto LoopPred : L.getHeader()->predecessors())
2543  if (!L.contains(LoopPred))
2544  LoopFreq += MBFI->getBlockFreq(LoopPred) *
2545  MBPI->getEdgeProbability(LoopPred, L.getHeader());
2546 
2547  for (MachineBasicBlock *LoopBB : L.getBlocks()) {
2548  if (LoopBlockSet.count(LoopBB))
2549  continue;
2550  auto Freq = MBFI->getBlockFreq(LoopBB).getFrequency();
2551  if (Freq == 0 || LoopFreq.getFrequency() / Freq > LoopToColdBlockRatio)
2552  continue;
2553  BlockChain *Chain = BlockToChain[LoopBB];
2554  for (MachineBasicBlock *ChainBB : *Chain)
2555  LoopBlockSet.insert(ChainBB);
2556  }
2557  } else
2558  LoopBlockSet.insert(L.block_begin(), L.block_end());
2559 
2560  return LoopBlockSet;
2561 }
2562 
2563 /// Forms basic block chains from the natural loop structures.
2564 ///
2565 /// These chains are designed to preserve the existing *structure* of the code
2566 /// as much as possible. We can then stitch the chains together in a way which
2567 /// both preserves the topological structure and minimizes taken conditional
2568 /// branches.
2569 void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {
2570  // First recurse through any nested loops, building chains for those inner
2571  // loops.
2572  for (const MachineLoop *InnerLoop : L)
2573  buildLoopChains(*InnerLoop);
2574 
2575  assert(BlockWorkList.empty() &&
2576  "BlockWorkList not empty when starting to build loop chains.");
2577  assert(EHPadWorkList.empty() &&
2578  "EHPadWorkList not empty when starting to build loop chains.");
2579  BlockFilterSet LoopBlockSet = collectLoopBlockSet(L);
2580 
2581  // Check if we have profile data for this function. If yes, we will rotate
2582  // this loop by modeling costs more precisely which requires the profile data
2583  // for better layout.
2584  bool RotateLoopWithProfile =
2586  (PreciseRotationCost && F->getFunction().hasProfileData());
2587 
2588  // First check to see if there is an obviously preferable top block for the
2589  // loop. This will default to the header, but may end up as one of the
2590  // predecessors to the header if there is one which will result in strictly
2591  // fewer branches in the loop body.
2592  MachineBasicBlock *LoopTop = findBestLoopTop(L, LoopBlockSet);
2593 
2594  // If we selected just the header for the loop top, look for a potentially
2595  // profitable exit block in the event that rotating the loop can eliminate
2596  // branches by placing an exit edge at the bottom.
2597  //
2598  // Loops are processed innermost to uttermost, make sure we clear
2599  // PreferredLoopExit before processing a new loop.
2600  PreferredLoopExit = nullptr;
2601  BlockFrequency ExitFreq;
2602  if (!RotateLoopWithProfile && LoopTop == L.getHeader())
2603  PreferredLoopExit = findBestLoopExit(L, LoopBlockSet, ExitFreq);
2604 
2605  BlockChain &LoopChain = *BlockToChain[LoopTop];
2606 
2607  // FIXME: This is a really lame way of walking the chains in the loop: we
2608  // walk the blocks, and use a set to prevent visiting a particular chain
2609  // twice.
2610  SmallPtrSet<BlockChain *, 4> UpdatedPreds;
2611  assert(LoopChain.UnscheduledPredecessors == 0 &&
2612  "LoopChain should not have unscheduled predecessors.");
2613  UpdatedPreds.insert(&LoopChain);
2614 
2615  for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2616  fillWorkLists(LoopBB, UpdatedPreds, &LoopBlockSet);
2617 
2618  buildChain(LoopTop, LoopChain, &LoopBlockSet);
2619 
2620  if (RotateLoopWithProfile)
2621  rotateLoopWithProfile(LoopChain, L, LoopBlockSet);
2622  else
2623  rotateLoop(LoopChain, PreferredLoopExit, ExitFreq, LoopBlockSet);
2624 
2625  LLVM_DEBUG({
2626  // Crash at the end so we get all of the debugging output first.
2627  bool BadLoop = false;
2628  if (LoopChain.UnscheduledPredecessors) {
2629  BadLoop = true;
2630  dbgs() << "Loop chain contains a block without its preds placed!\n"
2631  << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2632  << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n";
2633  }
2634  for (MachineBasicBlock *ChainBB : LoopChain) {
2635  dbgs() << " ... " << getBlockName(ChainBB) << "\n";
2636  if (!LoopBlockSet.remove(ChainBB)) {
2637  // We don't mark the loop as bad here because there are real situations
2638  // where this can occur. For example, with an unanalyzable fallthrough
2639  // from a loop block to a non-loop block or vice versa.
2640  dbgs() << "Loop chain contains a block not contained by the loop!\n"
2641  << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2642  << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2643  << " Bad block: " << getBlockName(ChainBB) << "\n";
2644  }
2645  }
2646 
2647  if (!LoopBlockSet.empty()) {
2648  BadLoop = true;
2649  for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2650  dbgs() << "Loop contains blocks never placed into a chain!\n"
2651  << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2652  << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2653  << " Bad block: " << getBlockName(LoopBB) << "\n";
2654  }
2655  assert(!BadLoop && "Detected problems with the placement of this loop.");
2656  });
2657 
2658  BlockWorkList.clear();
2659  EHPadWorkList.clear();
2660 }
2661 
2662 void MachineBlockPlacement::buildCFGChains() {
2663  // Ensure that every BB in the function has an associated chain to simplify
2664  // the assumptions of the remaining algorithm.
2665  SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
2666  for (MachineFunction::iterator FI = F->begin(), FE = F->end(); FI != FE;
2667  ++FI) {
2668  MachineBasicBlock *BB = &*FI;
2669  BlockChain *Chain =
2670  new (ChainAllocator.Allocate()) BlockChain(BlockToChain, BB);
2671  // Also, merge any blocks which we cannot reason about and must preserve
2672  // the exact fallthrough behavior for.
2673  while (true) {
2674  Cond.clear();
2675  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2676  if (!TII->analyzeBranch(*BB, TBB, FBB, Cond) || !FI->canFallThrough())
2677  break;
2678 
2679  MachineFunction::iterator NextFI = std::next(FI);
2680  MachineBasicBlock *NextBB = &*NextFI;
2681  // Ensure that the layout successor is a viable block, as we know that
2682  // fallthrough is a possibility.
2683  assert(NextFI != FE && "Can't fallthrough past the last block.");
2684  LLVM_DEBUG(dbgs() << "Pre-merging due to unanalyzable fallthrough: "
2685  << getBlockName(BB) << " -> " << getBlockName(NextBB)
2686  << "\n");
2687  Chain->merge(NextBB, nullptr);
2688 #ifndef NDEBUG
2689  BlocksWithUnanalyzableExits.insert(&*BB);
2690 #endif
2691  FI = NextFI;
2692  BB = NextBB;
2693  }
2694  }
2695 
2696  // Build any loop-based chains.
2697  PreferredLoopExit = nullptr;
2698  for (MachineLoop *L : *MLI)
2699  buildLoopChains(*L);
2700 
2701  assert(BlockWorkList.empty() &&
2702  "BlockWorkList should be empty before building final chain.");
2703  assert(EHPadWorkList.empty() &&
2704  "EHPadWorkList should be empty before building final chain.");
2705 
2706  SmallPtrSet<BlockChain *, 4> UpdatedPreds;
2707  for (MachineBasicBlock &MBB : *F)
2708  fillWorkLists(&MBB, UpdatedPreds);
2709 
2710  BlockChain &FunctionChain = *BlockToChain[&F->front()];
2711  buildChain(&F->front(), FunctionChain);
2712 
2713 #ifndef NDEBUG
2714  using FunctionBlockSetType = SmallPtrSet<MachineBasicBlock *, 16>;
2715 #endif
2716  LLVM_DEBUG({
2717  // Crash at the end so we get all of the debugging output first.
2718  bool BadFunc = false;
2719  FunctionBlockSetType FunctionBlockSet;
2720  for (MachineBasicBlock &MBB : *F)
2721  FunctionBlockSet.insert(&MBB);
2722 
2723  for (MachineBasicBlock *ChainBB : FunctionChain)
2724  if (!FunctionBlockSet.erase(ChainBB)) {
2725  BadFunc = true;
2726  dbgs() << "Function chain contains a block not in the function!\n"
2727  << " Bad block: " << getBlockName(ChainBB) << "\n";
2728  }
2729 
2730  if (!FunctionBlockSet.empty()) {
2731  BadFunc = true;
2732  for (MachineBasicBlock *RemainingBB : FunctionBlockSet)
2733  dbgs() << "Function contains blocks never placed into a chain!\n"
2734  << " Bad block: " << getBlockName(RemainingBB) << "\n";
2735  }
2736  assert(!BadFunc && "Detected problems with the block placement.");
2737  });
2738 
2739  // Remember original layout ordering, so we can update terminators after
2740  // reordering to point to the original layout successor.
2741  SmallVector<MachineBasicBlock *, 4> OriginalLayoutSuccessors(
2742  F->getNumBlockIDs());
2743  {
2744  MachineBasicBlock *LastMBB = nullptr;
2745  for (auto &MBB : *F) {
2746  if (LastMBB != nullptr)
2747  OriginalLayoutSuccessors[LastMBB->getNumber()] = &MBB;
2748  LastMBB = &MBB;
2749  }
2750  OriginalLayoutSuccessors[F->back().getNumber()] = nullptr;
2751  }
2752 
2753  // Splice the blocks into place.
2754  MachineFunction::iterator InsertPos = F->begin();
2755  LLVM_DEBUG(dbgs() << "[MBP] Function: " << F->getName() << "\n");
2756  for (MachineBasicBlock *ChainBB : FunctionChain) {
2757  LLVM_DEBUG(dbgs() << (ChainBB == *FunctionChain.begin() ? "Placing chain "
2758  : " ... ")
2759  << getBlockName(ChainBB) << "\n");
2760  if (InsertPos != MachineFunction::iterator(ChainBB))
2761  F->splice(InsertPos, ChainBB);
2762  else
2763  ++InsertPos;
2764 
2765  // Update the terminator of the previous block.
2766  if (ChainBB == *FunctionChain.begin())
2767  continue;
2768  MachineBasicBlock *PrevBB = &*std::prev(MachineFunction::iterator(ChainBB));
2769 
2770  // FIXME: It would be awesome of updateTerminator would just return rather
2771  // than assert when the branch cannot be analyzed in order to remove this
2772  // boiler plate.
2773  Cond.clear();
2774  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2775 
2776 #ifndef NDEBUG
2777  if (!BlocksWithUnanalyzableExits.count(PrevBB)) {
2778  // Given the exact block placement we chose, we may actually not _need_ to
2779  // be able to edit PrevBB's terminator sequence, but not being _able_ to
2780  // do that at this point is a bug.
2781  assert((!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond) ||
2782  !PrevBB->canFallThrough()) &&
2783  "Unexpected block with un-analyzable fallthrough!");
2784  Cond.clear();
2785  TBB = FBB = nullptr;
2786  }
2787 #endif
2788 
2789  // The "PrevBB" is not yet updated to reflect current code layout, so,
2790  // o. it may fall-through to a block without explicit "goto" instruction
2791  // before layout, and no longer fall-through it after layout; or
2792  // o. just opposite.
2793  //
2794  // analyzeBranch() may return erroneous value for FBB when these two
2795  // situations take place. For the first scenario FBB is mistakenly set NULL;
2796  // for the 2nd scenario, the FBB, which is expected to be NULL, is
2797  // mistakenly pointing to "*BI".
2798  // Thus, if the future change needs to use FBB before the layout is set, it
2799  // has to correct FBB first by using the code similar to the following:
2800  //
2801  // if (!Cond.empty() && (!FBB || FBB == ChainBB)) {
2802  // PrevBB->updateTerminator();
2803  // Cond.clear();
2804  // TBB = FBB = nullptr;
2805  // if (TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) {
2806  // // FIXME: This should never take place.
2807  // TBB = FBB = nullptr;
2808  // }
2809  // }
2810  if (!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) {
2811  PrevBB->updateTerminator(OriginalLayoutSuccessors[PrevBB->getNumber()]);
2812  }
2813  }
2814 
2815  // Fixup the last block.
2816  Cond.clear();
2817  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2818  if (!TII->analyzeBranch(F->back(), TBB, FBB, Cond)) {
2819  MachineBasicBlock *PrevBB = &F->back();
2820  PrevBB->updateTerminator(OriginalLayoutSuccessors[PrevBB->getNumber()]);
2821  }
2822 
2823  BlockWorkList.clear();
2824  EHPadWorkList.clear();
2825 }
2826 
2827 void MachineBlockPlacement::optimizeBranches() {
2828  BlockChain &FunctionChain = *BlockToChain[&F->front()];
2829  SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
2830 
2831  // Now that all the basic blocks in the chain have the proper layout,
2832  // make a final call to analyzeBranch with AllowModify set.
2833  // Indeed, the target may be able to optimize the branches in a way we
2834  // cannot because all branches may not be analyzable.
2835  // E.g., the target may be able to remove an unconditional branch to
2836  // a fallthrough when it occurs after predicated terminators.
2837  for (MachineBasicBlock *ChainBB : FunctionChain) {
2838  Cond.clear();
2839  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2840  if (!TII->analyzeBranch(*ChainBB, TBB, FBB, Cond, /*AllowModify*/ true)) {
2841  // If PrevBB has a two-way branch, try to re-order the branches
2842  // such that we branch to the successor with higher probability first.
2843  if (TBB && !Cond.empty() && FBB &&
2844  MBPI->getEdgeProbability(ChainBB, FBB) >
2845  MBPI->getEdgeProbability(ChainBB, TBB) &&
2847  LLVM_DEBUG(dbgs() << "Reverse order of the two branches: "
2848  << getBlockName(ChainBB) << "\n");
2849  LLVM_DEBUG(dbgs() << " Edge probability: "
2850  << MBPI->getEdgeProbability(ChainBB, FBB) << " vs "
2851  << MBPI->getEdgeProbability(ChainBB, TBB) << "\n");
2852  DebugLoc dl; // FIXME: this is nowhere
2853  TII->removeBranch(*ChainBB);
2854  TII->insertBranch(*ChainBB, FBB, TBB, Cond, dl);
2855  }
2856  }
2857  }
2858 }
2859 
2860 void MachineBlockPlacement::alignBlocks() {
2861  // Walk through the backedges of the function now that we have fully laid out
2862  // the basic blocks and align the destination of each backedge. We don't rely
2863  // exclusively on the loop info here so that we can align backedges in
2864  // unnatural CFGs and backedges that were introduced purely because of the
2865  // loop rotations done during this layout pass.
2866  if (F->getFunction().hasMinSize() ||
2867  (F->getFunction().hasOptSize() && !TLI->alignLoopsWithOptSize()))
2868  return;
2869  BlockChain &FunctionChain = *BlockToChain[&F->front()];
2870  if (FunctionChain.begin() == FunctionChain.end())
2871  return; // Empty chain.
2872 
2873  const BranchProbability ColdProb(1, 5); // 20%
2874  BlockFrequency EntryFreq = MBFI->getBlockFreq(&F->front());
2875  BlockFrequency WeightedEntryFreq = EntryFreq * ColdProb;
2876  for (MachineBasicBlock *ChainBB : FunctionChain) {
2877  if (ChainBB == *FunctionChain.begin())
2878  continue;
2879 
2880  // Don't align non-looping basic blocks. These are unlikely to execute
2881  // enough times to matter in practice. Note that we'll still handle
2882  // unnatural CFGs inside of a natural outer loop (the common case) and
2883  // rotated loops.
2884  MachineLoop *L = MLI->getLoopFor(ChainBB);
2885  if (!L)
2886  continue;
2887 
2888  const Align Align = TLI->getPrefLoopAlignment(L);
2889  if (Align == 1)
2890  continue; // Don't care about loop alignment.
2891 
2892  // If the block is cold relative to the function entry don't waste space
2893  // aligning it.
2894  BlockFrequency Freq = MBFI->getBlockFreq(ChainBB);
2895  if (Freq < WeightedEntryFreq)
2896  continue;
2897 
2898  // If the block is cold relative to its loop header, don't align it
2899  // regardless of what edges into the block exist.
2900  MachineBasicBlock *LoopHeader = L->getHeader();
2901  BlockFrequency LoopHeaderFreq = MBFI->getBlockFreq(LoopHeader);
2902  if (Freq < (LoopHeaderFreq * ColdProb))
2903  continue;
2904 
2905  // If the global profiles indicates so, don't align it.
2906  if (llvm::shouldOptimizeForSize(ChainBB, PSI, MBFI.get()) &&
2907  !TLI->alignLoopsWithOptSize())
2908  continue;
2909 
2910  // Check for the existence of a non-layout predecessor which would benefit
2911  // from aligning this block.
2912  MachineBasicBlock *LayoutPred =
2913  &*std::prev(MachineFunction::iterator(ChainBB));
2914 
2915  // Force alignment if all the predecessors are jumps. We already checked
2916  // that the block isn't cold above.
2917  if (!LayoutPred->isSuccessor(ChainBB)) {
2918  ChainBB->setAlignment(Align);
2919  continue;
2920  }
2921 
2922  // Align this block if the layout predecessor's edge into this block is
2923  // cold relative to the block. When this is true, other predecessors make up
2924  // all of the hot entries into the block and thus alignment is likely to be
2925  // important.
2926  BranchProbability LayoutProb =
2927  MBPI->getEdgeProbability(LayoutPred, ChainBB);
2928  BlockFrequency LayoutEdgeFreq = MBFI->getBlockFreq(LayoutPred) * LayoutProb;
2929  if (LayoutEdgeFreq <= (Freq * ColdProb))
2930  ChainBB->setAlignment(Align);
2931  }
2932 }
2933 
2934 /// Tail duplicate \p BB into (some) predecessors if profitable, repeating if
2935 /// it was duplicated into its chain predecessor and removed.
2936 /// \p BB - Basic block that may be duplicated.
2937 ///
2938 /// \p LPred - Chosen layout predecessor of \p BB.
2939 /// Updated to be the chain end if LPred is removed.
2940 /// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
2941 /// \p BlockFilter - Set of blocks that belong to the loop being laid out.
2942 /// Used to identify which blocks to update predecessor
2943 /// counts.
2944 /// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
2945 /// chosen in the given order due to unnatural CFG
2946 /// only needed if \p BB is removed and
2947 /// \p PrevUnplacedBlockIt pointed to \p BB.
2948 /// @return true if \p BB was removed.
2949 bool MachineBlockPlacement::repeatedlyTailDuplicateBlock(
2951  const MachineBasicBlock *LoopHeaderBB,
2952  BlockChain &Chain, BlockFilterSet *BlockFilter,
2953  MachineFunction::iterator &PrevUnplacedBlockIt) {
2954  bool Removed, DuplicatedToLPred;
2955  bool DuplicatedToOriginalLPred;
2956  Removed = maybeTailDuplicateBlock(BB, LPred, Chain, BlockFilter,
2957  PrevUnplacedBlockIt,
2958  DuplicatedToLPred);
2959  if (!Removed)
2960  return false;
2961  DuplicatedToOriginalLPred = DuplicatedToLPred;
2962  // Iteratively try to duplicate again. It can happen that a block that is
2963  // duplicated into is still small enough to be duplicated again.
2964  // No need to call markBlockSuccessors in this case, as the blocks being
2965  // duplicated from here on are already scheduled.
2966  while (DuplicatedToLPred && Removed) {
2967  MachineBasicBlock *DupBB, *DupPred;
2968  // The removal callback causes Chain.end() to be updated when a block is
2969  // removed. On the first pass through the loop, the chain end should be the
2970  // same as it was on function entry. On subsequent passes, because we are
2971  // duplicating the block at the end of the chain, if it is removed the
2972  // chain will have shrunk by one block.
2973  BlockChain::iterator ChainEnd = Chain.end();
2974  DupBB = *(--ChainEnd);
2975  // Now try to duplicate again.
2976  if (ChainEnd == Chain.begin())
2977  break;
2978  DupPred = *std::prev(ChainEnd);
2979  Removed = maybeTailDuplicateBlock(DupBB, DupPred, Chain, BlockFilter,
2980  PrevUnplacedBlockIt,
2981  DuplicatedToLPred);
2982  }
2983  // If BB was duplicated into LPred, it is now scheduled. But because it was
2984  // removed, markChainSuccessors won't be called for its chain. Instead we
2985  // call markBlockSuccessors for LPred to achieve the same effect. This must go
2986  // at the end because repeating the tail duplication can increase the number
2987  // of unscheduled predecessors.
2988  LPred = *std::prev(Chain.end());
2989  if (DuplicatedToOriginalLPred)
2990  markBlockSuccessors(Chain, LPred, LoopHeaderBB, BlockFilter);
2991  return true;
2992 }
2993 
2994 /// Tail duplicate \p BB into (some) predecessors if profitable.
2995 /// \p BB - Basic block that may be duplicated
2996 /// \p LPred - Chosen layout predecessor of \p BB
2997 /// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
2998 /// \p BlockFilter - Set of blocks that belong to the loop being laid out.
2999 /// Used to identify which blocks to update predecessor
3000 /// counts.
3001 /// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
3002 /// chosen in the given order due to unnatural CFG
3003 /// only needed if \p BB is removed and
3004 /// \p PrevUnplacedBlockIt pointed to \p BB.
3005 /// \p DuplicatedToLPred - True if the block was duplicated into LPred.
3006 /// \return - True if the block was duplicated into all preds and removed.
3007 bool MachineBlockPlacement::maybeTailDuplicateBlock(
3009  BlockChain &Chain, BlockFilterSet *BlockFilter,
3010  MachineFunction::iterator &PrevUnplacedBlockIt,
3011  bool &DuplicatedToLPred) {
3012  DuplicatedToLPred = false;
3013  if (!shouldTailDuplicate(BB))
3014  return false;
3015 
3016  LLVM_DEBUG(dbgs() << "Redoing tail duplication for Succ#" << BB->getNumber()
3017  << "\n");
3018 
3019  // This has to be a callback because none of it can be done after
3020  // BB is deleted.
3021  bool Removed = false;
3022  auto RemovalCallback =
3023  [&](MachineBasicBlock *RemBB) {
3024  // Signal to outer function
3025  Removed = true;
3026 
3027  // Conservative default.
3028  bool InWorkList = true;
3029  // Remove from the Chain and Chain Map
3030  if (BlockToChain.count(RemBB)) {
3031  BlockChain *Chain = BlockToChain[RemBB];
3032  InWorkList = Chain->UnscheduledPredecessors == 0;
3033  Chain->remove(RemBB);
3034  BlockToChain.erase(RemBB);
3035  }
3036 
3037  // Handle the unplaced block iterator
3038  if (&(*PrevUnplacedBlockIt) == RemBB) {
3039  PrevUnplacedBlockIt++;
3040  }
3041 
3042  // Handle the Work Lists
3043  if (InWorkList) {
3044  SmallVectorImpl<MachineBasicBlock *> &RemoveList = BlockWorkList;
3045  if (RemBB->isEHPad())
3046  RemoveList = EHPadWorkList;
3047  llvm::erase_value(RemoveList, RemBB);
3048  }
3049 
3050  // Handle the filter set
3051  if (BlockFilter) {
3052  BlockFilter->remove(RemBB);
3053  }
3054 
3055  // Remove the block from loop info.
3056  MLI->removeBlock(RemBB);
3057  if (RemBB == PreferredLoopExit)
3058  PreferredLoopExit = nullptr;
3059 
3060  LLVM_DEBUG(dbgs() << "TailDuplicator deleted block: "
3061  << getBlockName(RemBB) << "\n");
3062  };
3063  auto RemovalCallbackRef =
3064  function_ref<void(MachineBasicBlock*)>(RemovalCallback);
3065 
3066  SmallVector<MachineBasicBlock *, 8> DuplicatedPreds;
3067  bool IsSimple = TailDup.isSimpleBB(BB);
3068  SmallVector<MachineBasicBlock *, 8> CandidatePreds;
3069  SmallVectorImpl<MachineBasicBlock *> *CandidatePtr = nullptr;
3070  if (F->getFunction().hasProfileData()) {
3071  // We can do partial duplication with precise profile information.
3072  findDuplicateCandidates(CandidatePreds, BB, BlockFilter);
3073  if (CandidatePreds.size() == 0)
3074  return false;
3075  if (CandidatePreds.size() < BB->pred_size())
3076  CandidatePtr = &CandidatePreds;
3077  }
3078  TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred, &DuplicatedPreds,
3079  &RemovalCallbackRef, CandidatePtr);
3080 
3081  // Update UnscheduledPredecessors to reflect tail-duplication.
3082  DuplicatedToLPred = false;
3083  for (MachineBasicBlock *Pred : DuplicatedPreds) {
3084  // We're only looking for unscheduled predecessors that match the filter.
3085  BlockChain* PredChain = BlockToChain[Pred];
3086  if (Pred == LPred)
3087  DuplicatedToLPred = true;
3088  if (Pred == LPred || (BlockFilter && !BlockFilter->count(Pred))
3089  || PredChain == &Chain)
3090  continue;
3091  for (MachineBasicBlock *NewSucc : Pred->successors()) {
3092  if (BlockFilter && !BlockFilter->count(NewSucc))
3093  continue;
3094  BlockChain *NewChain = BlockToChain[NewSucc];
3095  if (NewChain != &Chain && NewChain != PredChain)
3096  NewChain->UnscheduledPredecessors++;
3097  }
3098  }
3099  return Removed;
3100 }
3101 
3102 // Count the number of actual machine instructions.
3104  uint64_t InstrCount = 0;
3105  for (MachineInstr &MI : *MBB) {
3106  if (!MI.isPHI() && !MI.isMetaInstruction())
3107  InstrCount += 1;
3108  }
3109  return InstrCount;
3110 }
3111 
3112 // The size cost of duplication is the instruction size of the duplicated block.
3113 // So we should scale the threshold accordingly. But the instruction size is not
3114 // available on all targets, so we use the number of instructions instead.
3115 BlockFrequency MachineBlockPlacement::scaleThreshold(MachineBasicBlock *BB) {
3116  return DupThreshold.getFrequency() * countMBBInstruction(BB);
3117 }
3118 
3119 // Returns true if BB is Pred's best successor.
3120 bool MachineBlockPlacement::isBestSuccessor(MachineBasicBlock *BB,
3121  MachineBasicBlock *Pred,
3122  BlockFilterSet *BlockFilter) {
3123  if (BB == Pred)
3124  return false;
3125  if (BlockFilter && !BlockFilter->count(Pred))
3126  return false;
3127  BlockChain *PredChain = BlockToChain[Pred];
3128  if (PredChain && (Pred != *std::prev(PredChain->end())))
3129  return false;
3130 
3131  // Find the successor with largest probability excluding BB.
3133  for (MachineBasicBlock *Succ : Pred->successors())
3134  if (Succ != BB) {
3135  if (BlockFilter && !BlockFilter->count(Succ))
3136  continue;
3137  BlockChain *SuccChain = BlockToChain[Succ];
3138  if (SuccChain && (Succ != *SuccChain->begin()))
3139  continue;
3140  BranchProbability SuccProb = MBPI->getEdgeProbability(Pred, Succ);
3141  if (SuccProb > BestProb)
3142  BestProb = SuccProb;
3143  }
3144 
3145  BranchProbability BBProb = MBPI->getEdgeProbability(Pred, BB);
3146  if (BBProb <= BestProb)
3147  return false;
3148 
3149  // Compute the number of reduced taken branches if Pred falls through to BB
3150  // instead of another successor. Then compare it with threshold.
3151  BlockFrequency PredFreq = getBlockCountOrFrequency(Pred);
3152  BlockFrequency Gain = PredFreq * (BBProb - BestProb);
3153  return Gain > scaleThreshold(BB);
3154 }
3155 
3156 // Find out the predecessors of BB and BB can be beneficially duplicated into
3157 // them.
3158 void MachineBlockPlacement::findDuplicateCandidates(
3161  BlockFilterSet *BlockFilter) {
3162  MachineBasicBlock *Fallthrough = nullptr;
3163  BranchProbability DefaultBranchProb = BranchProbability::getZero();
3164  BlockFrequency BBDupThreshold(scaleThreshold(BB));
3165  SmallVector<MachineBasicBlock *, 8> Preds(BB->predecessors());
3166  SmallVector<MachineBasicBlock *, 8> Succs(BB->successors());
3167 
3168  // Sort for highest frequency.
3169  auto CmpSucc = [&](MachineBasicBlock *A, MachineBasicBlock *B) {
3170  return MBPI->getEdgeProbability(BB, A) > MBPI->getEdgeProbability(BB, B);
3171  };
3172  auto CmpPred = [&](MachineBasicBlock *A, MachineBasicBlock *B) {
3173  return MBFI->getBlockFreq(A) > MBFI->getBlockFreq(B);
3174  };
3175  llvm::stable_sort(Succs, CmpSucc);
3176  llvm::stable_sort(Preds, CmpPred);
3177 
3178  auto SuccIt = Succs.begin();
3179  if (SuccIt != Succs.end()) {
3180  DefaultBranchProb = MBPI->getEdgeProbability(BB, *SuccIt).getCompl();
3181  }
3182 
3183  // For each predecessors of BB, compute the benefit of duplicating BB,
3184  // if it is larger than the threshold, add it into Candidates.
3185  //
3186  // If we have following control flow.
3187  //
3188  // PB1 PB2 PB3 PB4
3189  // \ | / /\
3190  // \ | / / \
3191  // \ |/ / \
3192  // BB----/ OB
3193  // /\
3194  // / \
3195  // SB1 SB2
3196  //
3197  // And it can be partially duplicated as
3198  //
3199  // PB2+BB
3200  // | PB1 PB3 PB4
3201  // | | / /\
3202  // | | / / \
3203  // | |/ / \
3204  // | BB----/ OB
3205  // |\ /|
3206  // | X |
3207  // |/ \|
3208  // SB2 SB1
3209  //
3210  // The benefit of duplicating into a predecessor is defined as
3211  // Orig_taken_branch - Duplicated_taken_branch
3212  //
3213  // The Orig_taken_branch is computed with the assumption that predecessor
3214  // jumps to BB and the most possible successor is laid out after BB.
3215  //
3216  // The Duplicated_taken_branch is computed with the assumption that BB is
3217  // duplicated into PB, and one successor is layout after it (SB1 for PB1 and
3218  // SB2 for PB2 in our case). If there is no available successor, the combined
3219  // block jumps to all BB's successor, like PB3 in this example.
3220  //
3221  // If a predecessor has multiple successors, so BB can't be duplicated into
3222  // it. But it can beneficially fall through to BB, and duplicate BB into other
3223  // predecessors.
3224  for (MachineBasicBlock *Pred : Preds) {
3225  BlockFrequency PredFreq = getBlockCountOrFrequency(Pred);
3226 
3227  if (!TailDup.canTailDuplicate(BB, Pred)) {
3228  // BB can't be duplicated into Pred, but it is possible to be layout
3229  // below Pred.
3230  if (!Fallthrough && isBestSuccessor(BB, Pred, BlockFilter)) {
3231  Fallthrough = Pred;
3232  if (SuccIt != Succs.end())
3233  SuccIt++;
3234  }
3235  continue;
3236  }
3237 
3238  BlockFrequency OrigCost = PredFreq + PredFreq * DefaultBranchProb;
3239  BlockFrequency DupCost;
3240  if (SuccIt == Succs.end()) {
3241  // Jump to all successors;
3242  if (Succs.size() > 0)
3243  DupCost += PredFreq;
3244  } else {
3245  // Fallthrough to *SuccIt, jump to all other successors;
3246  DupCost += PredFreq;
3247  DupCost -= PredFreq * MBPI->getEdgeProbability(BB, *SuccIt);
3248  }
3249 
3250  assert(OrigCost >= DupCost);
3251  OrigCost -= DupCost;
3252  if (OrigCost > BBDupThreshold) {
3253  Candidates.push_back(Pred);
3254  if (SuccIt != Succs.end())
3255  SuccIt++;
3256  }
3257  }
3258 
3259  // No predecessors can optimally fallthrough to BB.
3260  // So we can change one duplication into fallthrough.
3261  if (!Fallthrough) {
3262  if ((Candidates.size() < Preds.size()) && (Candidates.size() > 0)) {
3263  Candidates[0] = Candidates.back();
3264  Candidates.pop_back();
3265  }
3266  }
3267 }
3268 
3269 void MachineBlockPlacement::initDupThreshold() {
3270  DupThreshold = 0;
3271  if (!F->getFunction().hasProfileData())
3272  return;
3273 
3274  // We prefer to use prifile count.
3275  uint64_t HotThreshold = PSI->getOrCompHotCountThreshold();
3276  if (HotThreshold != UINT64_MAX) {
3277  UseProfileCount = true;
3278  DupThreshold = HotThreshold * TailDupProfilePercentThreshold / 100;
3279  return;
3280  }
3281 
3282  // Profile count is not available, we can use block frequency instead.
3283  BlockFrequency MaxFreq = 0;
3284  for (MachineBasicBlock &MBB : *F) {
3285  BlockFrequency Freq = MBFI->getBlockFreq(&MBB);
3286  if (Freq > MaxFreq)
3287  MaxFreq = Freq;
3288  }
3289 
3290  BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
3291  DupThreshold = MaxFreq * ThresholdProb;
3292  UseProfileCount = false;
3293 }
3294 
3295 bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
3296  if (skipFunction(MF.getFunction()))
3297  return false;
3298 
3299  // Check for single-block functions and skip them.
3300  if (std::next(MF.begin()) == MF.end())
3301  return false;
3302 
3303  F = &MF;
3304  MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
3305  MBFI = std::make_unique<MBFIWrapper>(
3306  getAnalysis<MachineBlockFrequencyInfo>());
3307  MLI = &getAnalysis<MachineLoopInfo>();
3308  TII = MF.getSubtarget().getInstrInfo();
3309  TLI = MF.getSubtarget().getTargetLowering();
3310  MPDT = nullptr;
3311  PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
3312 
3313  initDupThreshold();
3314 
3315  // Initialize PreferredLoopExit to nullptr here since it may never be set if
3316  // there are no MachineLoops.
3317  PreferredLoopExit = nullptr;
3318 
3319  assert(BlockToChain.empty() &&
3320  "BlockToChain map should be empty before starting placement.");
3321  assert(ComputedEdges.empty() &&
3322  "Computed Edge map should be empty before starting placement.");
3323 
3324  unsigned TailDupSize = TailDupPlacementThreshold;
3325  // If only the aggressive threshold is explicitly set, use it.
3326  if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 &&
3327  TailDupPlacementThreshold.getNumOccurrences() == 0)
3329 
3330  TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
3331  // For aggressive optimization, we can adjust some thresholds to be less
3332  // conservative.
3333  if (PassConfig->getOptLevel() >= CodeGenOpt::Aggressive) {
3334  // At O3 we should be more willing to copy blocks for tail duplication. This
3335  // increases size pressure, so we only do it at O3
3336  // Do this unless only the regular threshold is explicitly set.
3337  if (TailDupPlacementThreshold.getNumOccurrences() == 0 ||
3338  TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0)
3340  }
3341 
3342  // If there's no threshold provided through options, query the target
3343  // information for a threshold instead.
3344  if (TailDupPlacementThreshold.getNumOccurrences() == 0 &&
3345  (PassConfig->getOptLevel() < CodeGenOpt::Aggressive ||
3346  TailDupPlacementAggressiveThreshold.getNumOccurrences() == 0))
3347  TailDupSize = TII->getTailDuplicateSize(PassConfig->getOptLevel());
3348 
3349  if (allowTailDupPlacement()) {
3350  MPDT = &getAnalysis<MachinePostDominatorTree>();
3351  bool OptForSize = MF.getFunction().hasOptSize() ||
3352  llvm::shouldOptimizeForSize(&MF, PSI, &MBFI->getMBFI());
3353  if (OptForSize)
3354  TailDupSize = 1;
3355  bool PreRegAlloc = false;
3356  TailDup.initMF(MF, PreRegAlloc, MBPI, MBFI.get(), PSI,
3357  /* LayoutMode */ true, TailDupSize);
3358  precomputeTriangleChains();
3359  }
3360 
3361  buildCFGChains();
3362 
3363  // Changing the layout can create new tail merging opportunities.
3364  // TailMerge can create jump into if branches that make CFG irreducible for
3365  // HW that requires structured CFG.
3366  bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
3367  PassConfig->getEnableTailMerge() &&
3369  // No tail merging opportunities if the block number is less than four.
3370  if (MF.size() > 3 && EnableTailMerge) {
3371  unsigned TailMergeSize = TailDupSize + 1;
3372  BranchFolder BF(/*DefaultEnableTailMerge=*/true, /*CommonHoist=*/false,
3373  *MBFI, *MBPI, PSI, TailMergeSize);
3374 
3375  if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(), MLI,
3376  /*AfterPlacement=*/true)) {
3377  // Redo the layout if tail merging creates/removes/moves blocks.
3378  BlockToChain.clear();
3379  ComputedEdges.clear();
3380  // Must redo the post-dominator tree if blocks were changed.
3381  if (MPDT)
3382  MPDT->runOnMachineFunction(MF);
3383  ChainAllocator.DestroyAll();
3384  buildCFGChains();
3385  }
3386  }
3387 
3388  optimizeBranches();
3389  alignBlocks();
3390 
3391  BlockToChain.clear();
3392  ComputedEdges.clear();
3393  ChainAllocator.DestroyAll();
3394 
3395  if (AlignAllBlock)
3396  // Align all of the blocks in the function to a specific alignment.
3397  for (MachineBasicBlock &MBB : MF)
3398  MBB.setAlignment(Align(1ULL << AlignAllBlock));
3399  else if (AlignAllNonFallThruBlocks) {
3400  // Align all of the blocks that have no fall-through predecessors to a
3401  // specific alignment.
3402  for (auto MBI = std::next(MF.begin()), MBE = MF.end(); MBI != MBE; ++MBI) {
3403  auto LayoutPred = std::prev(MBI);
3404  if (!LayoutPred->isSuccessor(&*MBI))
3405  MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks));
3406  }
3407  }
3409  (ViewBlockFreqFuncName.empty() ||
3410  F->getFunction().getName().equals(ViewBlockFreqFuncName))) {
3411  MBFI->view("MBP." + MF.getName(), false);
3412  }
3413 
3414 
3415  // We always return true as we have no way to track whether the final order
3416  // differs from the original order.
3417  return true;
3418 }
3419 
3420 namespace {
3421 
3422 /// A pass to compute block placement statistics.
3423 ///
3424 /// A separate pass to compute interesting statistics for evaluating block
3425 /// placement. This is separate from the actual placement pass so that they can
3426 /// be computed in the absence of any placement transformations or when using
3427 /// alternative placement strategies.
3428 class MachineBlockPlacementStats : public MachineFunctionPass {
3429  /// A handle to the branch probability pass.
3430  const MachineBranchProbabilityInfo *MBPI;
3431 
3432  /// A handle to the function-wide block frequency pass.
3433  const MachineBlockFrequencyInfo *MBFI;
3434 
3435 public:
3436  static char ID; // Pass identification, replacement for typeid
3437 
3438  MachineBlockPlacementStats() : MachineFunctionPass(ID) {
3440  }
3441 
3442  bool runOnMachineFunction(MachineFunction &F) override;
3443 
3444  void getAnalysisUsage(AnalysisUsage &AU) const override {
3447  AU.setPreservesAll();
3449  }
3450 };
3451 
3452 } // end anonymous namespace
3453 
3455 
3457 
3458 INITIALIZE_PASS_BEGIN(MachineBlockPlacementStats, "block-placement-stats",
3459  "Basic Block Placement Stats", false, false)
3462 INITIALIZE_PASS_END(MachineBlockPlacementStats, "block-placement-stats",
3463  "Basic Block Placement Stats", false, false)
3464 
3465 bool MachineBlockPlacementStats::runOnMachineFunction(MachineFunction &F) {
3466  // Check for single-block functions and skip them.
3467  if (std::next(F.begin()) == F.end())
3468  return false;
3469 
3470  MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
3471  MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
3472 
3473  for (MachineBasicBlock &MBB : F) {
3474  BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
3475  Statistic &NumBranches =
3476  (MBB.succ_size() > 1) ? NumCondBranches : NumUncondBranches;
3477  Statistic &BranchTakenFreq =
3478  (MBB.succ_size() > 1) ? CondBranchTakenFreq : UncondBranchTakenFreq;
3479  for (MachineBasicBlock *Succ : MBB.successors()) {
3480  // Skip if this successor is a fallthrough.
3481  if (MBB.isLayoutSuccessor(Succ))
3482  continue;
3483 
3484  BlockFrequency EdgeFreq =
3485  BlockFreq * MBPI->getEdgeProbability(&MBB, Succ);
3486  ++NumBranches;
3487  BranchTakenFreq += EdgeFreq.getFrequency();
3488  }
3489  }
3490 
3491  return false;
3492 }
llvm::TargetMachine::requiresStructuredCFG
bool requiresStructuredCFG() const
Definition: TargetMachine.h:213
llvm::BranchFolder
Definition: BranchFolding.h:33
i
i
Definition: README.txt:29
block
we get the following basic block
Definition: README_ALTIVEC.txt:95
llvm::MachineBasicBlock::succ_size
unsigned succ_size() const
Definition: MachineBasicBlock.h:344
llvm::MachineBasicBlock::pred_begin
pred_iterator pred_begin()
Definition: MachineBasicBlock.h:316
MI
IRTranslator LLVM IR MI
Definition: IRTranslator.cpp:100
LLVM_DUMP_METHOD
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:499
llvm
Definition: AllocatorList.h:23
llvm::SmallVectorImpl::erase
iterator erase(const_iterator CI)
Definition: SmallVector.h:705
llvm::TargetLoweringBase
This base class for TargetLowering contains the SelectionDAG-independent parts that can be used from ...
Definition: TargetLowering.h:189
llvm::TailDuplicator::initMF
void initMF(MachineFunction &MF, bool PreRegAlloc, const MachineBranchProbabilityInfo *MBPI, MBFIWrapper *MBFI, ProfileSummaryInfo *PSI, bool LayoutMode, unsigned TailDupSize=0)
Prepare to run on a specific machine function.
Definition: TailDuplicator.cpp:81
llvm::MachineLoopInfo::getLoopFor
MachineLoop * getLoopFor(const MachineBasicBlock *BB) const
Return the innermost loop that BB lives in.
Definition: MachineLoopInfo.h:124
llvm::HexagonInstrInfo::removeBranch
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
Remove the branching code at the end of the specific MBB.
Definition: HexagonInstrInfo.cpp:561
llvm::DenseMapBase< DenseMap< KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::lookup
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:197
llvm::BranchProbability::getNumerator
uint32_t getNumerator() const
Definition: BranchProbability.h:65
P
This currently compiles esp xmm0 movsd esp eax eax esp ret We should use not the dag combiner This is because dagcombine2 needs to be able to see through the X86ISD::Wrapper which DAGCombine can t really do The code for turning x load into a single vector load is target independent and should be moved to the dag combiner The code for turning x load into a vector load can only handle a direct load from a global or a direct load from the stack It should be generalized to handle any load from P
Definition: README-SSE.txt:411
Pass.h
llvm::LoopBase::contains
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
Definition: LoopInfo.h:122
llvm::TargetSubtargetInfo::getInstrInfo
virtual const TargetInstrInfo * getInstrInfo() const
Definition: TargetSubtargetInfo.h:92
llvm::HexagonInstrInfo::analyzeBranch
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const override
Analyze the branching code at the end of MBB, returning true if it cannot be understood (e....
Definition: HexagonInstrInfo.cpp:391
llvm::raw_string_ostream
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:614
getAdjustedProbability
static BranchProbability getAdjustedProbability(BranchProbability OrigProb, BranchProbability AdjustedSumProb)
The helper function returns the branch probability that is adjusted or normalized over the new total ...
Definition: MachineBlockPlacement.cpp:718
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1168
Statistic.h
llvm::MachineFunction::end
iterator end()
Definition: MachineFunction.h:742
llvm::ProfileSummaryInfo::getOrCompHotCountThreshold
uint64_t getOrCompHotCountThreshold() const
Returns HotCountThreshold if set.
Definition: ProfileSummaryInfo.cpp:318
llvm::printMBBReference
Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
Definition: MachineBasicBlock.cpp:118
Allocator.h
llvm::erase_if
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition: STLExtras.h:1651
llvm::MachineBasicBlock::isEntryBlock
bool isEntryBlock() const
Returns true if this is the entry block of the function.
Definition: MachineBasicBlock.cpp:289
llvm::MachineFunctionPass
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
Definition: MachineFunctionPass.h:30
MachineSizeOpts.h
llvm::initializeMachineBlockPlacementPass
void initializeMachineBlockPlacementPass(PassRegistry &)
MachineBasicBlock.h
llvm::TargetSubtargetInfo::getRegisterInfo
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
Definition: TargetSubtargetInfo.h:124
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:143
llvm::DenseMapBase< DenseMap< KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::erase
bool erase(const KeyT &Val)
Definition: DenseMap.h:302
AlignAllBlock
static cl::opt< unsigned > AlignAllBlock("align-all-blocks", cl::desc("Force the alignment of all blocks in the function in log2 format " "(e.g 4 means align on 16B boundaries)."), cl::init(0), cl::Hidden)
llvm::DenseMapBase< DenseMap< KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::begin
iterator begin()
Definition: DenseMap.h:74
InstrCount
static unsigned InstrCount
Definition: DFAPacketizer.cpp:53
llvm::SpecificBumpPtrAllocator
A BumpPtrAllocator that allows only elements of a specific type to be allocated.
Definition: Allocator.h:376
llvm::BlockFrequency::getMaxFrequency
static uint64_t getMaxFrequency()
Returns the maximum possible frequency, the saturation value.
Definition: BlockFrequency.h:31
BlockFrequency.h
DenseMap.h
llvm::TargetLoweringBase::getPrefLoopAlignment
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
Definition: TargetLowering.h:1741
llvm::BranchProbability::getZero
static BranchProbability getZero()
Definition: BranchProbability.h:49
llvm::reverse
auto reverse(ContainerTy &&C, std::enable_if_t< has_rbegin< ContainerTy >::value > *=nullptr)
Definition: STLExtras.h:329
TargetInstrInfo.h
llvm::sys::path::end
const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:233
llvm::sys::path::begin
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:224
llvm::TailDuplicator::shouldTailDuplicate
bool shouldTailDuplicate(bool IsSimple, MachineBasicBlock &TailBB)
Determine if it is profitable to duplicate this block.
Definition: TailDuplicator.cpp:551
getLayoutSuccessorProbThreshold
static BranchProbability getLayoutSuccessorProbThreshold(const MachineBasicBlock *BB)
Definition: MachineBlockPlacement.cpp:1347
llvm::DenseMapBase< DenseMap< KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::count
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:145
llvm::SmallPtrSet
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:449
llvm::dump
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Definition: SparseBitVector.h:876
STLExtras.h
llvm::MachineBasicBlock::back
MachineInstr & back()
Definition: MachineBasicBlock.h:248
llvm::TargetLoweringBase::alignLoopsWithOptSize
virtual bool alignLoopsWithOptSize() const
Should loops be aligned even when the function is marked OptSize (but not MinSize).
Definition: TargetLowering.h:1747
llvm::LoopBase::getNumBlocks
unsigned getNumBlocks() const
Get the number of blocks in this loop in constant time.
Definition: LoopInfo.h:185
llvm::MachineBlockFrequencyInfo::getBlockFreq
BlockFrequency getBlockFreq(const MachineBasicBlock *MBB) const
getblockFreq - Return block frequency.
Definition: MachineBlockFrequencyInfo.cpp:230
llvm::MachineFunctionPass::getAnalysisUsage
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
Definition: MachineFunctionPass.cpp:102
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:122
F
#define F(x, y, z)
Definition: MD5.cpp:56
llvm::MachineLoopInfo
Definition: MachineLoopInfo.h:90
llvm::MachineBasicBlock::succ_rbegin
succ_reverse_iterator succ_rbegin()
Definition: MachineBasicBlock.h:336
llvm::LoopBase::block_end
block_iterator block_end() const
Definition: LoopInfo.h:177
llvm::MachineBasicBlock::dump
void dump() const
Definition: MachineBasicBlock.cpp:294
llvm::TailDuplicator::isSimpleBB
static bool isSimpleBB(MachineBasicBlock *TailBB)
True if this BB has only one unconditional jump.
Definition: TailDuplicator.cpp:681
llvm::sys::path::append
void append(SmallVectorImpl< char > &path, const Twine &a, const Twine &b="", const Twine &c="", const Twine &d="")
Append to path.
Definition: Path.cpp:454
TailDupPlacementThreshold
static cl::opt< unsigned > TailDupPlacementThreshold("tail-dup-placement-threshold", cl::desc("Instruction cutoff for tail duplication during layout. " "Tail merging during layout is forced to have a threshold " "that won't conflict."), cl::init(2), cl::Hidden)
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
CommandLine.h
llvm::MachineBasicBlock::pred_size
unsigned pred_size() const
Definition: MachineBasicBlock.h:328
TargetLowering.h
llvm::shouldOptimizeForSize
bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
Definition: MachineSizeOpts.cpp:183
BlockFrequencyInfoImpl.h
llvm::TargetInstrInfo
TargetInstrInfo - Interface to description of machine instruction set.
Definition: TargetInstrInfo.h:97
MachineLoopInfo.h
TargetMachine.h
llvm::PassRegistry::getPassRegistry
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Definition: PassRegistry.cpp:31
llvm::MutableArrayRef
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:305
llvm::MachineBlockFrequencyInfo
MachineBlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate machine basic b...
Definition: MachineBlockFrequencyInfo.h:33
llvm::MachineBranchProbabilityInfo
Definition: MachineBranchProbabilityInfo.h:24
E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
llvm::ARM_PROC::A
@ A
Definition: ARMBaseInfo.h:34
llvm::MachineBasicBlock::isSuccessor
bool isSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB is a successor of this block.
Definition: MachineBasicBlock.cpp:933
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
llvm::ms_demangle::QualifierMangleMode::Result
@ Result
llvm::TargetPassConfig::getEnableTailMerge
bool getEnableTailMerge() const
Definition: TargetPassConfig.h:176
UINT64_MAX
#define UINT64_MAX
Definition: DataTypes.h:77
false
Definition: StackSlotColoring.cpp:142
TII
const HexagonInstrInfo * TII
Definition: HexagonCopyToCombine.cpp:129
merge
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
Definition: LoopDeletion.cpp:42
llvm::MachineFunction::size
unsigned size() const
Definition: MachineFunction.h:750
B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
llvm::LoopBase::getBlocks
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:171
llvm::StaticLikelyProb
cl::opt< unsigned > StaticLikelyProb
llvm::STATISTIC
STATISTIC(NumFunctions, "Total number of functions")
AlignAllNonFallThruBlocks
static cl::opt< unsigned > AlignAllNonFallThruBlocks("align-all-nofallthru-blocks", cl::desc("Force the alignment of all blocks that have no fall-through " "predecessors (i.e. don't add nops that are executed). In log2 " "format (e.g 4 means align on 16B boundaries)."), cl::init(0), cl::Hidden)
llvm::HexagonInstrInfo::insertBranch
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
Insert branch code into the end of the specified MachineBasicBlock.
Definition: HexagonInstrInfo.cpp:584
llvm::raw_ostream::flush
void flush()
Definition: raw_ostream.h:183
llvm::MachineFunction::begin
iterator begin()
Definition: MachineFunction.h:740
llvm::ProfileLikelyProb
cl::opt< unsigned > ProfileLikelyProb
DebugLoc.h
SmallPtrSet.h
JumpInstCost
static cl::opt< unsigned > JumpInstCost("jump-inst-cost", cl::desc("Cost of jump instructions."), cl::init(1), cl::Hidden)
BranchFolding.h
TailDupPlacement
static cl::opt< bool > TailDupPlacement("tail-dup-placement", cl::desc("Perform tail duplication during placement. " "Creates more fallthrough opportunites in " "outline branches."), cl::init(true), cl::Hidden)
Align
uint64_t Align
Definition: ELFObjHandler.cpp:83
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
INITIALIZE_PASS_BEGIN
INITIALIZE_PASS_BEGIN(MachineBlockPlacement, DEBUG_TYPE, "Branch Probability Basic Block Placement", false, false) INITIALIZE_PASS_END(MachineBlockPlacement
llvm::erase_value
void erase_value(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:1659
BranchProbability.h
llvm::MachineBasicBlock
Definition: MachineBasicBlock.h:95
INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:58
llvm::ProfileSummaryInfo
Analysis providing profile information.
Definition: ProfileSummaryInfo.h:39
BranchFoldPlacement
static cl::opt< bool > BranchFoldPlacement("branch-fold-placement", cl::desc("Perform branch folding during placement. " "Reduces code size."), cl::init(true), cl::Hidden)
llvm::MachineBasicBlock::setAlignment
void setAlignment(Align A)
Set alignment of the basic block.
Definition: MachineBasicBlock.h:431
llvm::LoopBase::block_begin
block_iterator block_begin() const
Definition: LoopInfo.h:176
llvm::function_ref
An efficient, type-erasing, non-owning reference to a callable.
Definition: STLExtras.h:168
llvm::TargetPassConfig
Target-Independent Code Generator Pass Configuration Options.
Definition: TargetPassConfig.h:84
llvm::MachineFunction::getSubtarget
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Definition: MachineFunction.h:558
llvm::cl::opt
Definition: CommandLine.h:1422
TriangleChainCount
static cl::opt< unsigned > TriangleChainCount("triangle-chain-count", cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the " "triangle tail duplication heuristic to kick in. 0 to disable."), cl::init(2), cl::Hidden)
TailDupProfilePercentThreshold
static cl::opt< unsigned > TailDupProfilePercentThreshold("tail-dup-profile-percent-threshold", cl::desc("If profile count information is used in tail duplication cost " "model, the gained fall through number from tail duplication " "should be at least this percent of hot count."), cl::init(50), cl::Hidden)
llvm::MachineLoop
Definition: MachineLoopInfo.h:45
llvm::BlockFrequency
Definition: BlockFrequency.h:24
TailMergeSize
static cl::opt< unsigned > TailMergeSize("tail-merge-size", cl::desc("Min number of instructions to consider tail merging"), cl::init(3), cl::Hidden)
llvm::DenseMapBase< DenseMap< KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::clear
void clear()
Definition: DenseMap.h:111
llvm::count
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1565
llvm::MachineInstr
Representation of each machine instruction.
Definition: MachineInstr.h:64
ProfileSummaryInfo.h
llvm::ViewBlockLayoutWithBFI
cl::opt< GVDAGType > ViewBlockLayoutWithBFI("view-block-layout-with-bfi", cl::Hidden, cl::desc("Pop up a window to show a dag displaying MBP layout and associated " "block frequencies of the CFG."), cl::values(clEnumValN(GVDT_None, "none", "do not display graphs."), clEnumValN(GVDT_Fraction, "fraction", "display a graph using the " "fractional block frequency representation."), clEnumValN(GVDT_Integer, "integer", "display a graph using the raw " "integer fractional block frequency representation."), clEnumValN(GVDT_Count, "count", "display a graph using the real " "profile count if available.")))
Definition: MachineBlockPlacement.cpp:203
llvm::find
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1502
ExitBlockBias
static cl::opt< unsigned > ExitBlockBias("block-placement-exit-block-bias", cl::desc("Block frequency percentage a loop exit block needs " "over the original exit to be considered the new exit."), cl::init(0), cl::Hidden)
INITIALIZE_PASS_DEPENDENCY
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
move
compiles ldr LCPI1_0 ldr ldr mov lsr tst moveq r1 ldr LCPI1_1 and r0 bx lr It would be better to do something like to fold the shift into the conditional move
Definition: README.txt:546
llvm::DenseMap
Definition: DenseMap.h:714
llvm::TailDuplicator::tailDuplicateAndUpdate
bool tailDuplicateAndUpdate(bool IsSimple, MachineBasicBlock *MBB, MachineBasicBlock *ForcedLayoutPred, SmallVectorImpl< MachineBasicBlock * > *DuplicatedPreds=nullptr, function_ref< void(MachineBasicBlock *)> *RemovalCallback=nullptr, SmallVectorImpl< MachineBasicBlock * > *CandidatePtr=nullptr)
Tail duplicate a single basic block into its predecessors, and then clean up.
Definition: TailDuplicator.cpp:158
I
#define I(x, y, z)
Definition: MD5.cpp:59
llvm::BlockFrequency::getFrequency
uint64_t getFrequency() const
Returns the frequency as a fixpoint number scaled by the entry frequency.
Definition: BlockFrequency.h:35
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
greaterWithBias
static bool greaterWithBias(BlockFrequency A, BlockFrequency B, uint64_t EntryFreq)
Compare 2 BlockFrequency's with a small penalty for A.
Definition: MachineBlockPlacement.cpp:766
llvm::ProfileSummaryInfoWrapperPass
An analysis pass based on legacy pass manager to deliver ProfileSummaryInfo.
Definition: ProfileSummaryInfo.h:185
ArrayRef.h
TargetPassConfig.h
MachineFunctionPass.h
llvm::DenseMapBase< DenseMap< KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::find
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:150
llvm::SmallVectorImpl::const_iterator
typename SuperClass::const_iterator const_iterator
Definition: SmallVector.h:563
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::MachineBasicBlock::canFallThrough
bool canFallThrough()
Return true if the block can implicitly transfer control to the block after it by falling off the end...
Definition: MachineBasicBlock.cpp:986
LoopToColdBlockRatio
static cl::opt< unsigned > LoopToColdBlockRatio("loop-to-cold-block-ratio", cl::desc("Outline loop blocks from loop chain if (frequency of loop) / " "(frequency of block) is greater than this ratio"), cl::init(5), cl::Hidden)
MachineBranchProbabilityInfo.h
llvm::MachineBasicBlock::succ_begin
succ_iterator succ_begin()
Definition: MachineBasicBlock.h:332
std::swap
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:840
hasSameSuccessors
static bool hasSameSuccessors(MachineBasicBlock &BB, SmallPtrSetImpl< const MachineBasicBlock * > &Successors)
Check if BB has exactly the successors in Successors.
Definition: MachineBlockPlacement.cpp:733
MachineModuleInfo.h
MachinePostDominators.h
llvm::MachineBasicBlock::predecessors
iterator_range< pred_iterator > predecessors()
Definition: MachineBasicBlock.h:349
llvm::MachineBranchProbabilityInfo::getEdgeProbability
BranchProbability getEdgeProbability(const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const
Definition: MachineBranchProbabilityInfo.cpp:58
llvm::CodeGenOpt::Aggressive
@ Aggressive
Definition: CodeGen.h:56
llvm::MachineBlockPlacementStatsID
char & MachineBlockPlacementStatsID
MachineBlockPlacementStats - This pass collects statistics about the basic block placement using bran...
Definition: MachineBlockPlacement.cpp:3456
MisfetchCost
static cl::opt< unsigned > MisfetchCost("misfetch-cost", cl::desc("Cost that models the probabilistic risk of an instruction " "misfetch due to a jump comparing to falling through, whose cost " "is zero."), cl::init(1), cl::Hidden)
llvm::SmallPtrSetImpl::count
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:382
llvm::MachineFunction
Definition: MachineFunction.h:230
llvm::BranchProbability::getOne
static BranchProbability getOne()
Definition: BranchProbability.h:50
llvm::sys::fs::remove
std::error_code remove(const Twine &path, bool IgnoreNonExisting=true)
Remove path.
llvm::MachineBasicBlock::getNumber
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFuncti...
Definition: MachineBasicBlock.h:969
llvm::min
Expected< ExpressionValue > min(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:357
Cond
SmallVector< MachineOperand, 4 > Cond
Definition: BasicBlockSections.cpp:167
llvm::MachineBasicBlock::successors
iterator_range< succ_iterator > successors()
Definition: MachineBasicBlock.h:355
llvm::MachineBasicBlock::isEHPad
bool isEHPad() const
Returns true if the block is a landing pad.
Definition: MachineBasicBlock.h:435
if
if(llvm_vc STREQUAL "") set(fake_version_inc "$
Definition: CMakeLists.txt:14
uint32_t
llvm::BranchProbability
Definition: BranchProbability.h:30
Compiler.h
TargetSubtargetInfo.h
llvm::MachineBlockPlacementID
char & MachineBlockPlacementID
MachineBlockPlacement - This pass places basic blocks based on branch probabilities.
Definition: MachineBlockPlacement.cpp:590
PreciseRotationCost
static cl::opt< bool > PreciseRotationCost("precise-rotation-cost", cl::desc("Model the cost of loop rotation more " "precisely by using profile data."), cl::init(false), cl::Hidden)
llvm::TrackingStatistic
Definition: Statistic.h:49
for
this could be done in SelectionDAGISel along with other special for
Definition: README.txt:104
llvm::DenseMapBase< DenseMap< KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::insert
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:207
llvm::MachinePostDominatorTree
MachinePostDominatorTree - an analysis pass wrapper for DominatorTree used to compute the post-domina...
Definition: MachinePostDominators.h:27
llvm::SpecificBumpPtrAllocator::Allocate
T * Allocate(size_t num=1)
Allocate space for an array of objects without constructing them.
Definition: Allocator.h:426
llvm::SmallPtrSetImplBase::size
size_type size() const
Definition: SmallPtrSet.h:92
llvm::Function::hasOptSize
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:716
llvm::DenseMapBase< DenseMap< KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::empty
LLVM_NODISCARD bool empty() const
Definition: DenseMap.h:97
MBB
MachineBasicBlock & MBB
Definition: AArch64SLSHardening.cpp:74
llvm::MachinePostDominatorTree::dominates
bool dominates(const MachineDomTreeNode *A, const MachineDomTreeNode *B) const
Definition: MachinePostDominators.h:54
getBlockName
Branch Probability Basic Block static false std::string getBlockName(const MachineBasicBlock *BB)
Helper to print the name of a MBB.
Definition: MachineBlockPlacement.cpp:606
stats
Definition: MCAssembler.cpp:52
llvm::stable_sort
void stable_sort(R &&Range)
Definition: STLExtras.h:1617
llvm::MachineFunction::getFunction
Function & getFunction()
Return the LLVM function that this machine code represents.
Definition: MachineFunction.h:524
Stats
block placement Basic Block Placement Stats
Definition: MachineBlockPlacement.cpp:3463
llvm::AnalysisUsage::setPreservesAll
void setPreservesAll()
Set by analyses that do not transform their input at all.
Definition: PassAnalysisSupport.h:130
ForcePreciseRotationCost
static cl::opt< bool > ForcePreciseRotationCost("force-precise-rotation-cost", cl::desc("Force the use of precise cost " "loop rotation strategy."), cl::init(false), cl::Hidden)
llvm::MachineFunction::getTarget
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Definition: MachineFunction.h:554
rotate
The same transformation can work with an even modulo with the addition of a rotate
Definition: README.txt:680
llvm::DenseMapBase< DenseMap< KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::end
iterator end()
Definition: DenseMap.h:83
llvm::X86::FirstMacroFusionInstKind::Cmp
@ Cmp
llvm::ilist_iterator
Iterator for intrusive lists based on ilist_node.
Definition: ilist_iterator.h:57
llvm::MCID::Branch
@ Branch
Definition: MCInstrDesc.h:156
Function.h
llvm::LoopBase::getHeader
BlockT * getHeader() const
Definition: LoopInfo.h:104
llvm::initializeMachineBlockPlacementStatsPass
void initializeMachineBlockPlacementStatsPass(PassRegistry &)
llvm::SmallVectorImpl::clear
void clear()
Definition: SmallVector.h:585
llvm::MachineBasicBlock::isLayoutSuccessor
bool isLayoutSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB will be emitted immediately after this block, such that if this bloc...
Definition: MachineBasicBlock.cpp:937
ForceLoopColdBlock
static cl::opt< bool > ForceLoopColdBlock("force-loop-cold-block", cl::desc("Force outlining cold blocks from loops."), cl::init(false), cl::Hidden)
llvm::BranchProbability::getCompl
BranchProbability getCompl() const
Definition: BranchProbability.h:69
llvm::MachinePostDominatorTree::runOnMachineFunction
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
Definition: MachinePostDominators.cpp:40
CodeGen.h
llvm::TailDuplicator::canTailDuplicate
bool canTailDuplicate(MachineBasicBlock *TailBB, MachineBasicBlock *PredBB)
Returns true if TailBB can successfully be duplicated into PredBB.
Definition: TailDuplicator.cpp:796
llvm::HexagonInstrInfo::reverseBranchCondition
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
Reverses the branch condition of the specified condition list, returning false on success and true if...
Definition: HexagonInstrInfo.cpp:1547
llvm::TargetSubtargetInfo::getTargetLowering
virtual const TargetLowering * getTargetLowering() const
Definition: TargetSubtargetInfo.h:96
Placement
Branch Probability Basic Block Placement
Definition: MachineBlockPlacement.cpp:600
SmallVector.h
llvm::TailDuplicator
Utility class to perform tail duplication.
Definition: TailDuplicator.h:39
llvm::GVDT_None
@ GVDT_None
Definition: BlockFrequencyInfoImpl.h:1479
DEBUG_TYPE
#define DEBUG_TYPE
Definition: MachineBlockPlacement.cpp:76
llvm::SmallVectorImpl::iterator
typename SuperClass::iterator iterator
Definition: SmallVector.h:562
llvm::SmallPtrSetImplBase::empty
LLVM_NODISCARD bool empty() const
Definition: SmallPtrSet.h:91
llvm::DenseMapBase< DenseMap< KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::try_emplace
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&... Args)
Definition: DenseMap.h:222
TailDuplicator.h
llvm::max
Align max(MaybeAlign Lhs, Align Rhs)
Definition: Alignment.h:340
TailDupPlacementPenalty
static cl::opt< unsigned > TailDupPlacementPenalty("tail-dup-placement-penalty", cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. " "Copying can increase fallthrough, but it also increases icache " "pressure. This parameter controls the penalty to account for that. " "Percent as integer."), cl::init(2), cl::Hidden)
llvm::SmallVectorImpl
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: APFloat.h:43
llvm::SmallPtrSetImpl
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:343
llvm::SmallSetVector
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:307
BB
Common register allocation spilling lr str ldr sxth r3 ldr mla r4 can lr mov lr str ldr sxth r3 mla r4 and then merge mul and lr str ldr sxth r3 mla r4 It also increase the likelihood the store may become dead bb27 Successors according to LLVM BB
Definition: README.txt:39
llvm::ViewBlockFreqFuncName
cl::opt< std::string > ViewBlockFreqFuncName("view-bfi-func-name", cl::Hidden, cl::desc("The option to specify " "the name of the function " "whose CFG will be displayed."))
Definition: MachineBlockFrequencyInfo.cpp:66
llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition: PassAnalysisSupport.h:75
llvm::DebugLoc
A debug info location.
Definition: DebugLoc.h:33
llvm::cl::desc
Definition: CommandLine.h:414
llvm::MachineBasicBlock::updateTerminator
void updateTerminator(MachineBasicBlock *PreviousLayoutSuccessor)
Update the terminator instructions in block to account for changes to block layout which may have bee...
Definition: MachineBasicBlock.cpp:640
raw_ostream.h
TailDupPlacementAggressiveThreshold
static cl::opt< unsigned > TailDupPlacementAggressiveThreshold("tail-dup-placement-aggressive-threshold", cl::desc("Instruction cutoff for aggressive tail duplication during " "layout. Used at -O3. Tail merging during layout is forced to " "have a threshold that won't conflict."), cl::init(4), cl::Hidden)
MachineFunction.h
InitializePasses.h
MachineBlockFrequencyInfo.h
Debug.h
llvm::TargetPassConfig::getOptLevel
CodeGenOpt::Level getOptLevel() const
Definition: TargetPassConfig.cpp:588
SetVector.h
llvm::SpecificBumpPtrAllocator::DestroyAll
void DestroyAll()
Call the destructor of each allocated object and deallocate all but the current slab and reset the cu...
Definition: Allocator.h:397
countMBBInstruction
static uint64_t countMBBInstruction(MachineBasicBlock *MBB)
Definition: MachineBlockPlacement.cpp:3103
llvm::SmallVectorImpl::emplace_back
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:908
llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:364
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:38