LLVM  15.0.0git
MachineBlockPlacement.cpp
Go to the documentation of this file.
1 //===- MachineBlockPlacement.cpp - Basic Block Code Layout optimization ---===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements basic block placement transformations using the CFG
10 // structure and branch probability estimates.
11 //
12 // The pass strives to preserve the structure of the CFG (that is, retain
13 // a topological ordering of basic blocks) in the absence of a *strong* signal
14 // to the contrary from probabilities. However, within the CFG structure, it
15 // attempts to choose an ordering which favors placing more likely sequences of
16 // blocks adjacent to each other.
17 //
18 // The algorithm works from the inner-most loop within a function outward, and
19 // at each stage walks through the basic blocks, trying to coalesce them into
20 // sequential chains where allowed by the CFG (or demanded by heavy
21 // probabilities). Finally, it walks the blocks in topological order, and the
22 // first time it reaches a chain of basic blocks, it schedules them in the
23 // function in-order.
24 //
25 //===----------------------------------------------------------------------===//
26 
27 #include "BranchFolding.h"
28 #include "llvm/ADT/ArrayRef.h"
29 #include "llvm/ADT/DenseMap.h"
30 #include "llvm/ADT/STLExtras.h"
31 #include "llvm/ADT/SetVector.h"
32 #include "llvm/ADT/SmallPtrSet.h"
33 #include "llvm/ADT/SmallVector.h"
34 #include "llvm/ADT/Statistic.h"
51 #include "llvm/IR/DebugLoc.h"
52 #include "llvm/IR/Function.h"
53 #include "llvm/InitializePasses.h"
54 #include "llvm/Pass.h"
55 #include "llvm/Support/Allocator.h"
58 #include "llvm/Support/CodeGen.h"
60 #include "llvm/Support/Compiler.h"
61 #include "llvm/Support/Debug.h"
65 #include <algorithm>
66 #include <cassert>
67 #include <cstdint>
68 #include <iterator>
69 #include <memory>
70 #include <string>
71 #include <tuple>
72 #include <utility>
73 #include <vector>
74 
75 using namespace llvm;
76 
77 #define DEBUG_TYPE "block-placement"
78 
79 STATISTIC(NumCondBranches, "Number of conditional branches");
80 STATISTIC(NumUncondBranches, "Number of unconditional branches");
81 STATISTIC(CondBranchTakenFreq,
82  "Potential frequency of taking conditional branches");
83 STATISTIC(UncondBranchTakenFreq,
84  "Potential frequency of taking unconditional branches");
85 
87  "align-all-blocks",
88  cl::desc("Force the alignment of all blocks in the function in log2 format "
89  "(e.g 4 means align on 16B boundaries)."),
90  cl::init(0), cl::Hidden);
91 
93  "align-all-nofallthru-blocks",
94  cl::desc("Force the alignment of all blocks that have no fall-through "
95  "predecessors (i.e. don't add nops that are executed). In log2 "
96  "format (e.g 4 means align on 16B boundaries)."),
97  cl::init(0), cl::Hidden);
98 
100  "max-bytes-for-alignment",
101  cl::desc("Forces the maximum bytes allowed to be emitted when padding for "
102  "alignment"),
103  cl::init(0), cl::Hidden);
104 
105 // FIXME: Find a good default for this flag and remove the flag.
107  "block-placement-exit-block-bias",
108  cl::desc("Block frequency percentage a loop exit block needs "
109  "over the original exit to be considered the new exit."),
110  cl::init(0), cl::Hidden);
111 
112 // Definition:
113 // - Outlining: placement of a basic block outside the chain or hot path.
114 
116  "loop-to-cold-block-ratio",
117  cl::desc("Outline loop blocks from loop chain if (frequency of loop) / "
118  "(frequency of block) is greater than this ratio"),
119  cl::init(5), cl::Hidden);
120 
122  "force-loop-cold-block",
123  cl::desc("Force outlining cold blocks from loops."),
124  cl::init(false), cl::Hidden);
125 
126 static cl::opt<bool>
127  PreciseRotationCost("precise-rotation-cost",
128  cl::desc("Model the cost of loop rotation more "
129  "precisely by using profile data."),
130  cl::init(false), cl::Hidden);
131 
132 static cl::opt<bool>
133  ForcePreciseRotationCost("force-precise-rotation-cost",
134  cl::desc("Force the use of precise cost "
135  "loop rotation strategy."),
136  cl::init(false), cl::Hidden);
137 
139  "misfetch-cost",
140  cl::desc("Cost that models the probabilistic risk of an instruction "
141  "misfetch due to a jump comparing to falling through, whose cost "
142  "is zero."),
143  cl::init(1), cl::Hidden);
144 
145 static cl::opt<unsigned> JumpInstCost("jump-inst-cost",
146  cl::desc("Cost of jump instructions."),
147  cl::init(1), cl::Hidden);
148 static cl::opt<bool>
149 TailDupPlacement("tail-dup-placement",
150  cl::desc("Perform tail duplication during placement. "
151  "Creates more fallthrough opportunites in "
152  "outline branches."),
153  cl::init(true), cl::Hidden);
154 
155 static cl::opt<bool>
156 BranchFoldPlacement("branch-fold-placement",
157  cl::desc("Perform branch folding during placement. "
158  "Reduces code size."),
159  cl::init(true), cl::Hidden);
160 
161 // Heuristic for tail duplication.
163  "tail-dup-placement-threshold",
164  cl::desc("Instruction cutoff for tail duplication during layout. "
165  "Tail merging during layout is forced to have a threshold "
166  "that won't conflict."), cl::init(2),
167  cl::Hidden);
168 
169 // Heuristic for aggressive tail duplication.
171  "tail-dup-placement-aggressive-threshold",
172  cl::desc("Instruction cutoff for aggressive tail duplication during "
173  "layout. Used at -O3. Tail merging during layout is forced to "
174  "have a threshold that won't conflict."), cl::init(4),
175  cl::Hidden);
176 
177 // Heuristic for tail duplication.
179  "tail-dup-placement-penalty",
180  cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. "
181  "Copying can increase fallthrough, but it also increases icache "
182  "pressure. This parameter controls the penalty to account for that. "
183  "Percent as integer."),
184  cl::init(2),
185  cl::Hidden);
186 
187 // Heuristic for tail duplication if profile count is used in cost model.
189  "tail-dup-profile-percent-threshold",
190  cl::desc("If profile count information is used in tail duplication cost "
191  "model, the gained fall through number from tail duplication "
192  "should be at least this percent of hot count."),
193  cl::init(50), cl::Hidden);
194 
195 // Heuristic for triangle chains.
197  "triangle-chain-count",
198  cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the "
199  "triangle tail duplication heuristic to kick in. 0 to disable."),
200  cl::init(2),
201  cl::Hidden);
202 
205 
206 namespace llvm {
209 
210 // Internal option used to control BFI display only after MBP pass.
211 // Defined in CodeGen/MachineBlockFrequencyInfo.cpp:
212 // -view-block-layout-with-bfi=
214 
215 // Command line option to specify the name of the function for CFG dump
216 // Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name=
218 } // namespace llvm
219 
220 namespace {
221 
222 class BlockChain;
223 
224 /// Type for our function-wide basic block -> block chain mapping.
225 using BlockToChainMapType = DenseMap<const MachineBasicBlock *, BlockChain *>;
226 
227 /// A chain of blocks which will be laid out contiguously.
228 ///
229 /// This is the datastructure representing a chain of consecutive blocks that
230 /// are profitable to layout together in order to maximize fallthrough
231 /// probabilities and code locality. We also can use a block chain to represent
232 /// a sequence of basic blocks which have some external (correctness)
233 /// requirement for sequential layout.
234 ///
235 /// Chains can be built around a single basic block and can be merged to grow
236 /// them. They participate in a block-to-chain mapping, which is updated
237 /// automatically as chains are merged together.
238 class BlockChain {
239  /// The sequence of blocks belonging to this chain.
240  ///
241  /// This is the sequence of blocks for a particular chain. These will be laid
242  /// out in-order within the function.
244 
245  /// A handle to the function-wide basic block to block chain mapping.
246  ///
247  /// This is retained in each block chain to simplify the computation of child
248  /// block chains for SCC-formation and iteration. We store the edges to child
249  /// basic blocks, and map them back to their associated chains using this
250  /// structure.
251  BlockToChainMapType &BlockToChain;
252 
253 public:
254  /// Construct a new BlockChain.
255  ///
256  /// This builds a new block chain representing a single basic block in the
257  /// function. It also registers itself as the chain that block participates
258  /// in with the BlockToChain mapping.
259  BlockChain(BlockToChainMapType &BlockToChain, MachineBasicBlock *BB)
260  : Blocks(1, BB), BlockToChain(BlockToChain) {
261  assert(BB && "Cannot create a chain with a null basic block");
262  BlockToChain[BB] = this;
263  }
264 
265  /// Iterator over blocks within the chain.
268 
269  /// Beginning of blocks within the chain.
270  iterator begin() { return Blocks.begin(); }
271  const_iterator begin() const { return Blocks.begin(); }
272 
273  /// End of blocks within the chain.
274  iterator end() { return Blocks.end(); }
275  const_iterator end() const { return Blocks.end(); }
276 
277  bool remove(MachineBasicBlock* BB) {
278  for(iterator i = begin(); i != end(); ++i) {
279  if (*i == BB) {
280  Blocks.erase(i);
281  return true;
282  }
283  }
284  return false;
285  }
286 
287  /// Merge a block chain into this one.
288  ///
289  /// This routine merges a block chain into this one. It takes care of forming
290  /// a contiguous sequence of basic blocks, updating the edge list, and
291  /// updating the block -> chain mapping. It does not free or tear down the
292  /// old chain, but the old chain's block list is no longer valid.
293  void merge(MachineBasicBlock *BB, BlockChain *Chain) {
294  assert(BB && "Can't merge a null block.");
295  assert(!Blocks.empty() && "Can't merge into an empty chain.");
296 
297  // Fast path in case we don't have a chain already.
298  if (!Chain) {
299  assert(!BlockToChain[BB] &&
300  "Passed chain is null, but BB has entry in BlockToChain.");
301  Blocks.push_back(BB);
302  BlockToChain[BB] = this;
303  return;
304  }
305 
306  assert(BB == *Chain->begin() && "Passed BB is not head of Chain.");
307  assert(Chain->begin() != Chain->end());
308 
309  // Update the incoming blocks to point to this chain, and add them to the
310  // chain structure.
311  for (MachineBasicBlock *ChainBB : *Chain) {
312  Blocks.push_back(ChainBB);
313  assert(BlockToChain[ChainBB] == Chain && "Incoming blocks not in chain.");
314  BlockToChain[ChainBB] = this;
315  }
316  }
317 
318 #ifndef NDEBUG
319  /// Dump the blocks in this chain.
320  LLVM_DUMP_METHOD void dump() {
321  for (MachineBasicBlock *MBB : *this)
322  MBB->dump();
323  }
324 #endif // NDEBUG
325 
326  /// Count of predecessors of any block within the chain which have not
327  /// yet been scheduled. In general, we will delay scheduling this chain
328  /// until those predecessors are scheduled (or we find a sufficiently good
329  /// reason to override this heuristic.) Note that when forming loop chains,
330  /// blocks outside the loop are ignored and treated as if they were already
331  /// scheduled.
332  ///
333  /// Note: This field is reinitialized multiple times - once for each loop,
334  /// and then once for the function as a whole.
335  unsigned UnscheduledPredecessors = 0;
336 };
337 
338 class MachineBlockPlacement : public MachineFunctionPass {
339  /// A type for a block filter set.
340  using BlockFilterSet = SmallSetVector<const MachineBasicBlock *, 16>;
341 
342  /// Pair struct containing basic block and taildup profitability
343  struct BlockAndTailDupResult {
345  bool ShouldTailDup;
346  };
347 
348  /// Triple struct containing edge weight and the edge.
349  struct WeightedEdge {
350  BlockFrequency Weight;
351  MachineBasicBlock *Src;
352  MachineBasicBlock *Dest;
353  };
354 
355  /// work lists of blocks that are ready to be laid out
358 
359  /// Edges that have already been computed as optimal.
361 
362  /// Machine Function
364 
365  /// A handle to the branch probability pass.
366  const MachineBranchProbabilityInfo *MBPI;
367 
368  /// A handle to the function-wide block frequency pass.
369  std::unique_ptr<MBFIWrapper> MBFI;
370 
371  /// A handle to the loop info.
372  MachineLoopInfo *MLI;
373 
374  /// Preferred loop exit.
375  /// Member variable for convenience. It may be removed by duplication deep
376  /// in the call stack.
377  MachineBasicBlock *PreferredLoopExit;
378 
379  /// A handle to the target's instruction info.
380  const TargetInstrInfo *TII;
381 
382  /// A handle to the target's lowering info.
383  const TargetLoweringBase *TLI;
384 
385  /// A handle to the post dominator tree.
387 
388  ProfileSummaryInfo *PSI;
389 
390  /// Duplicator used to duplicate tails during placement.
391  ///
392  /// Placement decisions can open up new tail duplication opportunities, but
393  /// since tail duplication affects placement decisions of later blocks, it
394  /// must be done inline.
395  TailDuplicator TailDup;
396 
397  /// Partial tail duplication threshold.
398  BlockFrequency DupThreshold;
399 
400  /// True: use block profile count to compute tail duplication cost.
401  /// False: use block frequency to compute tail duplication cost.
402  bool UseProfileCount;
403 
404  /// Allocator and owner of BlockChain structures.
405  ///
406  /// We build BlockChains lazily while processing the loop structure of
407  /// a function. To reduce malloc traffic, we allocate them using this
408  /// slab-like allocator, and destroy them after the pass completes. An
409  /// important guarantee is that this allocator produces stable pointers to
410  /// the chains.
412 
413  /// Function wide BasicBlock to BlockChain mapping.
414  ///
415  /// This mapping allows efficiently moving from any given basic block to the
416  /// BlockChain it participates in, if any. We use it to, among other things,
417  /// allow implicitly defining edges between chains as the existing edges
418  /// between basic blocks.
420 
421 #ifndef NDEBUG
422  /// The set of basic blocks that have terminators that cannot be fully
423  /// analyzed. These basic blocks cannot be re-ordered safely by
424  /// MachineBlockPlacement, and we must preserve physical layout of these
425  /// blocks and their successors through the pass.
426  SmallPtrSet<MachineBasicBlock *, 4> BlocksWithUnanalyzableExits;
427 #endif
428 
429  /// Get block profile count or frequency according to UseProfileCount.
430  /// The return value is used to model tail duplication cost.
431  BlockFrequency getBlockCountOrFrequency(const MachineBasicBlock *BB) {
432  if (UseProfileCount) {
433  auto Count = MBFI->getBlockProfileCount(BB);
434  if (Count)
435  return *Count;
436  else
437  return 0;
438  } else
439  return MBFI->getBlockFreq(BB);
440  }
441 
442  /// Scale the DupThreshold according to basic block size.
443  BlockFrequency scaleThreshold(MachineBasicBlock *BB);
444  void initDupThreshold();
445 
446  /// Decrease the UnscheduledPredecessors count for all blocks in chain, and
447  /// if the count goes to 0, add them to the appropriate work list.
448  void markChainSuccessors(
449  const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
450  const BlockFilterSet *BlockFilter = nullptr);
451 
452  /// Decrease the UnscheduledPredecessors count for a single block, and
453  /// if the count goes to 0, add them to the appropriate work list.
454  void markBlockSuccessors(
455  const BlockChain &Chain, const MachineBasicBlock *BB,
456  const MachineBasicBlock *LoopHeaderBB,
457  const BlockFilterSet *BlockFilter = nullptr);
458 
460  collectViableSuccessors(
461  const MachineBasicBlock *BB, const BlockChain &Chain,
462  const BlockFilterSet *BlockFilter,
464  bool isBestSuccessor(MachineBasicBlock *BB, MachineBasicBlock *Pred,
465  BlockFilterSet *BlockFilter);
466  void findDuplicateCandidates(SmallVectorImpl<MachineBasicBlock *> &Candidates,
468  BlockFilterSet *BlockFilter);
469  bool repeatedlyTailDuplicateBlock(
471  const MachineBasicBlock *LoopHeaderBB,
472  BlockChain &Chain, BlockFilterSet *BlockFilter,
473  MachineFunction::iterator &PrevUnplacedBlockIt);
474  bool maybeTailDuplicateBlock(
476  BlockChain &Chain, BlockFilterSet *BlockFilter,
477  MachineFunction::iterator &PrevUnplacedBlockIt,
478  bool &DuplicatedToLPred);
479  bool hasBetterLayoutPredecessor(
480  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
481  const BlockChain &SuccChain, BranchProbability SuccProb,
482  BranchProbability RealSuccProb, const BlockChain &Chain,
483  const BlockFilterSet *BlockFilter);
484  BlockAndTailDupResult selectBestSuccessor(
485  const MachineBasicBlock *BB, const BlockChain &Chain,
486  const BlockFilterSet *BlockFilter);
487  MachineBasicBlock *selectBestCandidateBlock(
488  const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList);
489  MachineBasicBlock *getFirstUnplacedBlock(
490  const BlockChain &PlacedChain,
491  MachineFunction::iterator &PrevUnplacedBlockIt,
492  const BlockFilterSet *BlockFilter);
493 
494  /// Add a basic block to the work list if it is appropriate.
495  ///
496  /// If the optional parameter BlockFilter is provided, only MBB
497  /// present in the set will be added to the worklist. If nullptr
498  /// is provided, no filtering occurs.
499  void fillWorkLists(const MachineBasicBlock *MBB,
500  SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
501  const BlockFilterSet *BlockFilter);
502 
503  void buildChain(const MachineBasicBlock *BB, BlockChain &Chain,
504  BlockFilterSet *BlockFilter = nullptr);
505  bool canMoveBottomBlockToTop(const MachineBasicBlock *BottomBlock,
506  const MachineBasicBlock *OldTop);
507  bool hasViableTopFallthrough(const MachineBasicBlock *Top,
508  const BlockFilterSet &LoopBlockSet);
509  BlockFrequency TopFallThroughFreq(const MachineBasicBlock *Top,
510  const BlockFilterSet &LoopBlockSet);
511  BlockFrequency FallThroughGains(const MachineBasicBlock *NewTop,
512  const MachineBasicBlock *OldTop,
513  const MachineBasicBlock *ExitBB,
514  const BlockFilterSet &LoopBlockSet);
515  MachineBasicBlock *findBestLoopTopHelper(MachineBasicBlock *OldTop,
516  const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
517  MachineBasicBlock *findBestLoopTop(
518  const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
519  MachineBasicBlock *findBestLoopExit(
520  const MachineLoop &L, const BlockFilterSet &LoopBlockSet,
521  BlockFrequency &ExitFreq);
522  BlockFilterSet collectLoopBlockSet(const MachineLoop &L);
523  void buildLoopChains(const MachineLoop &L);
524  void rotateLoop(
525  BlockChain &LoopChain, const MachineBasicBlock *ExitingBB,
526  BlockFrequency ExitFreq, const BlockFilterSet &LoopBlockSet);
527  void rotateLoopWithProfile(
528  BlockChain &LoopChain, const MachineLoop &L,
529  const BlockFilterSet &LoopBlockSet);
530  void buildCFGChains();
531  void optimizeBranches();
532  void alignBlocks();
533  /// Returns true if a block should be tail-duplicated to increase fallthrough
534  /// opportunities.
535  bool shouldTailDuplicate(MachineBasicBlock *BB);
536  /// Check the edge frequencies to see if tail duplication will increase
537  /// fallthroughs.
538  bool isProfitableToTailDup(
539  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
540  BranchProbability QProb,
541  const BlockChain &Chain, const BlockFilterSet *BlockFilter);
542 
543  /// Check for a trellis layout.
544  bool isTrellis(const MachineBasicBlock *BB,
545  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
546  const BlockChain &Chain, const BlockFilterSet *BlockFilter);
547 
548  /// Get the best successor given a trellis layout.
549  BlockAndTailDupResult getBestTrellisSuccessor(
550  const MachineBasicBlock *BB,
551  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
552  BranchProbability AdjustedSumProb, const BlockChain &Chain,
553  const BlockFilterSet *BlockFilter);
554 
555  /// Get the best pair of non-conflicting edges.
556  static std::pair<WeightedEdge, WeightedEdge> getBestNonConflictingEdges(
557  const MachineBasicBlock *BB,
559 
560  /// Returns true if a block can tail duplicate into all unplaced
561  /// predecessors. Filters based on loop.
562  bool canTailDuplicateUnplacedPreds(
563  const MachineBasicBlock *BB, MachineBasicBlock *Succ,
564  const BlockChain &Chain, const BlockFilterSet *BlockFilter);
565 
566  /// Find chains of triangles to tail-duplicate where a global analysis works,
567  /// but a local analysis would not find them.
568  void precomputeTriangleChains();
569 
570  /// Apply a post-processing step optimizing block placement.
571  void applyExtTsp();
572 
573  /// Modify the existing block placement in the function and adjust all jumps.
574  void assignBlockOrder(const std::vector<const MachineBasicBlock *> &NewOrder);
575 
576  /// Create a single CFG chain from the current block order.
577  void createCFGChainExtTsp();
578 
579 public:
580  static char ID; // Pass identification, replacement for typeid
581 
582  MachineBlockPlacement() : MachineFunctionPass(ID) {
584  }
585 
586  bool runOnMachineFunction(MachineFunction &F) override;
587 
588  bool allowTailDupPlacement() const {
589  assert(F);
590  return TailDupPlacement && !F->getTarget().requiresStructuredCFG();
591  }
592 
593  void getAnalysisUsage(AnalysisUsage &AU) const override {
596  if (TailDupPlacement)
602  }
603 };
604 
605 } // end anonymous namespace
606 
608 
610 
611 INITIALIZE_PASS_BEGIN(MachineBlockPlacement, DEBUG_TYPE,
612  "Branch Probability Basic Block Placement", false, false)
618 INITIALIZE_PASS_END(MachineBlockPlacement, DEBUG_TYPE,
619  "Branch Probability Basic Block Placement", false, false)
620 
621 #ifndef NDEBUG
622 /// Helper to print the name of a MBB.
623 ///
624 /// Only used by debug logging.
625 static std::string getBlockName(const MachineBasicBlock *BB) {
626  std::string Result;
627  raw_string_ostream OS(Result);
628  OS << printMBBReference(*BB);
629  OS << " ('" << BB->getName() << "')";
630  OS.flush();
631  return Result;
632 }
633 #endif
634 
635 /// Mark a chain's successors as having one fewer preds.
636 ///
637 /// When a chain is being merged into the "placed" chain, this routine will
638 /// quickly walk the successors of each block in the chain and mark them as
639 /// having one fewer active predecessor. It also adds any successors of this
640 /// chain which reach the zero-predecessor state to the appropriate worklist.
641 void MachineBlockPlacement::markChainSuccessors(
642  const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
643  const BlockFilterSet *BlockFilter) {
644  // Walk all the blocks in this chain, marking their successors as having
645  // a predecessor placed.
646  for (MachineBasicBlock *MBB : Chain) {
647  markBlockSuccessors(Chain, MBB, LoopHeaderBB, BlockFilter);
648  }
649 }
650 
651 /// Mark a single block's successors as having one fewer preds.
652 ///
653 /// Under normal circumstances, this is only called by markChainSuccessors,
654 /// but if a block that was to be placed is completely tail-duplicated away,
655 /// and was duplicated into the chain end, we need to redo markBlockSuccessors
656 /// for just that block.
657 void MachineBlockPlacement::markBlockSuccessors(
658  const BlockChain &Chain, const MachineBasicBlock *MBB,
659  const MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter) {
660  // Add any successors for which this is the only un-placed in-loop
661  // predecessor to the worklist as a viable candidate for CFG-neutral
662  // placement. No subsequent placement of this block will violate the CFG
663  // shape, so we get to use heuristics to choose a favorable placement.
664  for (MachineBasicBlock *Succ : MBB->successors()) {
665  if (BlockFilter && !BlockFilter->count(Succ))
666  continue;
667  BlockChain &SuccChain = *BlockToChain[Succ];
668  // Disregard edges within a fixed chain, or edges to the loop header.
669  if (&Chain == &SuccChain || Succ == LoopHeaderBB)
670  continue;
671 
672  // This is a cross-chain edge that is within the loop, so decrement the
673  // loop predecessor count of the destination chain.
674  if (SuccChain.UnscheduledPredecessors == 0 ||
675  --SuccChain.UnscheduledPredecessors > 0)
676  continue;
677 
678  auto *NewBB = *SuccChain.begin();
679  if (NewBB->isEHPad())
680  EHPadWorkList.push_back(NewBB);
681  else
682  BlockWorkList.push_back(NewBB);
683  }
684 }
685 
686 /// This helper function collects the set of successors of block
687 /// \p BB that are allowed to be its layout successors, and return
688 /// the total branch probability of edges from \p BB to those
689 /// blocks.
690 BranchProbability MachineBlockPlacement::collectViableSuccessors(
691  const MachineBasicBlock *BB, const BlockChain &Chain,
692  const BlockFilterSet *BlockFilter,
694  // Adjust edge probabilities by excluding edges pointing to blocks that is
695  // either not in BlockFilter or is already in the current chain. Consider the
696  // following CFG:
697  //
698  // --->A
699  // | / \
700  // | B C
701  // | \ / \
702  // ----D E
703  //
704  // Assume A->C is very hot (>90%), and C->D has a 50% probability, then after
705  // A->C is chosen as a fall-through, D won't be selected as a successor of C
706  // due to CFG constraint (the probability of C->D is not greater than
707  // HotProb to break topo-order). If we exclude E that is not in BlockFilter
708  // when calculating the probability of C->D, D will be selected and we
709  // will get A C D B as the layout of this loop.
710  auto AdjustedSumProb = BranchProbability::getOne();
711  for (MachineBasicBlock *Succ : BB->successors()) {
712  bool SkipSucc = false;
713  if (Succ->isEHPad() || (BlockFilter && !BlockFilter->count(Succ))) {
714  SkipSucc = true;
715  } else {
716  BlockChain *SuccChain = BlockToChain[Succ];
717  if (SuccChain == &Chain) {
718  SkipSucc = true;
719  } else if (Succ != *SuccChain->begin()) {
720  LLVM_DEBUG(dbgs() << " " << getBlockName(Succ)
721  << " -> Mid chain!\n");
722  continue;
723  }
724  }
725  if (SkipSucc)
726  AdjustedSumProb -= MBPI->getEdgeProbability(BB, Succ);
727  else
728  Successors.push_back(Succ);
729  }
730 
731  return AdjustedSumProb;
732 }
733 
734 /// The helper function returns the branch probability that is adjusted
735 /// or normalized over the new total \p AdjustedSumProb.
736 static BranchProbability
738  BranchProbability AdjustedSumProb) {
739  BranchProbability SuccProb;
740  uint32_t SuccProbN = OrigProb.getNumerator();
741  uint32_t SuccProbD = AdjustedSumProb.getNumerator();
742  if (SuccProbN >= SuccProbD)
743  SuccProb = BranchProbability::getOne();
744  else
745  SuccProb = BranchProbability(SuccProbN, SuccProbD);
746 
747  return SuccProb;
748 }
749 
750 /// Check if \p BB has exactly the successors in \p Successors.
751 static bool
754  if (BB.succ_size() != Successors.size())
755  return false;
756  // We don't want to count self-loops
757  if (Successors.count(&BB))
758  return false;
759  for (MachineBasicBlock *Succ : BB.successors())
760  if (!Successors.count(Succ))
761  return false;
762  return true;
763 }
764 
765 /// Check if a block should be tail duplicated to increase fallthrough
766 /// opportunities.
767 /// \p BB Block to check.
768 bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
769  // Blocks with single successors don't create additional fallthrough
770  // opportunities. Don't duplicate them. TODO: When conditional exits are
771  // analyzable, allow them to be duplicated.
772  bool IsSimple = TailDup.isSimpleBB(BB);
773 
774  if (BB->succ_size() == 1)
775  return false;
776  return TailDup.shouldTailDuplicate(IsSimple, *BB);
777 }
778 
779 /// Compare 2 BlockFrequency's with a small penalty for \p A.
780 /// In order to be conservative, we apply a X% penalty to account for
781 /// increased icache pressure and static heuristics. For small frequencies
782 /// we use only the numerators to improve accuracy. For simplicity, we assume the
783 /// penalty is less than 100%
784 /// TODO(iteratee): Use 64-bit fixed point edge frequencies everywhere.
786  uint64_t EntryFreq) {
787  BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
788  BlockFrequency Gain = A - B;
789  return (Gain / ThresholdProb).getFrequency() >= EntryFreq;
790 }
791 
792 /// Check the edge frequencies to see if tail duplication will increase
793 /// fallthroughs. It only makes sense to call this function when
794 /// \p Succ would not be chosen otherwise. Tail duplication of \p Succ is
795 /// always locally profitable if we would have picked \p Succ without
796 /// considering duplication.
797 bool MachineBlockPlacement::isProfitableToTailDup(
798  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
799  BranchProbability QProb,
800  const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
801  // We need to do a probability calculation to make sure this is profitable.
802  // First: does succ have a successor that post-dominates? This affects the
803  // calculation. The 2 relevant cases are:
804  // BB BB
805  // | \Qout | \Qout
806  // P| C |P C
807  // = C' = C'
808  // | /Qin | /Qin
809  // | / | /
810  // Succ Succ
811  // / \ | \ V
812  // U/ =V |U \
813  // / \ = D
814  // D E | /
815  // | /
816  // |/
817  // PDom
818  // '=' : Branch taken for that CFG edge
819  // In the second case, Placing Succ while duplicating it into C prevents the
820  // fallthrough of Succ into either D or PDom, because they now have C as an
821  // unplaced predecessor
822 
823  // Start by figuring out which case we fall into
824  MachineBasicBlock *PDom = nullptr;
826  // Only scan the relevant successors
827  auto AdjustedSuccSumProb =
828  collectViableSuccessors(Succ, Chain, BlockFilter, SuccSuccs);
829  BranchProbability PProb = MBPI->getEdgeProbability(BB, Succ);
830  auto BBFreq = MBFI->getBlockFreq(BB);
831  auto SuccFreq = MBFI->getBlockFreq(Succ);
832  BlockFrequency P = BBFreq * PProb;
833  BlockFrequency Qout = BBFreq * QProb;
834  uint64_t EntryFreq = MBFI->getEntryFreq();
835  // If there are no more successors, it is profitable to copy, as it strictly
836  // increases fallthrough.
837  if (SuccSuccs.size() == 0)
838  return greaterWithBias(P, Qout, EntryFreq);
839 
840  auto BestSuccSucc = BranchProbability::getZero();
841  // Find the PDom or the best Succ if no PDom exists.
842  for (MachineBasicBlock *SuccSucc : SuccSuccs) {
843  auto Prob = MBPI->getEdgeProbability(Succ, SuccSucc);
844  if (Prob > BestSuccSucc)
845  BestSuccSucc = Prob;
846  if (PDom == nullptr)
847  if (MPDT->dominates(SuccSucc, Succ)) {
848  PDom = SuccSucc;
849  break;
850  }
851  }
852  // For the comparisons, we need to know Succ's best incoming edge that isn't
853  // from BB.
854  auto SuccBestPred = BlockFrequency(0);
855  for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
856  if (SuccPred == Succ || SuccPred == BB
857  || BlockToChain[SuccPred] == &Chain
858  || (BlockFilter && !BlockFilter->count(SuccPred)))
859  continue;
860  auto Freq = MBFI->getBlockFreq(SuccPred)
861  * MBPI->getEdgeProbability(SuccPred, Succ);
862  if (Freq > SuccBestPred)
863  SuccBestPred = Freq;
864  }
865  // Qin is Succ's best unplaced incoming edge that isn't BB
866  BlockFrequency Qin = SuccBestPred;
867  // If it doesn't have a post-dominating successor, here is the calculation:
868  // BB BB
869  // | \Qout | \
870  // P| C | =
871  // = C' | C
872  // | /Qin | |
873  // | / | C' (+Succ)
874  // Succ Succ /|
875  // / \ | \/ |
876  // U/ =V | == |
877  // / \ | / \|
878  // D E D E
879  // '=' : Branch taken for that CFG edge
880  // Cost in the first case is: P + V
881  // For this calculation, we always assume P > Qout. If Qout > P
882  // The result of this function will be ignored at the caller.
883  // Let F = SuccFreq - Qin
884  // Cost in the second case is: Qout + min(Qin, F) * U + max(Qin, F) * V
885 
886  if (PDom == nullptr || !Succ->isSuccessor(PDom)) {
887  BranchProbability UProb = BestSuccSucc;
888  BranchProbability VProb = AdjustedSuccSumProb - UProb;
889  BlockFrequency F = SuccFreq - Qin;
890  BlockFrequency V = SuccFreq * VProb;
891  BlockFrequency QinU = std::min(Qin, F) * UProb;
892  BlockFrequency BaseCost = P + V;
893  BlockFrequency DupCost = Qout + QinU + std::max(Qin, F) * VProb;
894  return greaterWithBias(BaseCost, DupCost, EntryFreq);
895  }
896  BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom);
897  BranchProbability VProb = AdjustedSuccSumProb - UProb;
898  BlockFrequency U = SuccFreq * UProb;
899  BlockFrequency V = SuccFreq * VProb;
900  BlockFrequency F = SuccFreq - Qin;
901  // If there is a post-dominating successor, here is the calculation:
902  // BB BB BB BB
903  // | \Qout | \ | \Qout | \
904  // |P C | = |P C | =
905  // = C' |P C = C' |P C
906  // | /Qin | | | /Qin | |
907  // | / | C' (+Succ) | / | C' (+Succ)
908  // Succ Succ /| Succ Succ /|
909  // | \ V | \/ | | \ V | \/ |
910  // |U \ |U /\ =? |U = |U /\ |
911  // = D = = =?| | D | = =|
912  // | / |/ D | / |/ D
913  // | / | / | = | /
914  // |/ | / |/ | =
915  // Dom Dom Dom Dom
916  // '=' : Branch taken for that CFG edge
917  // The cost for taken branches in the first case is P + U
918  // Let F = SuccFreq - Qin
919  // The cost in the second case (assuming independence), given the layout:
920  // BB, Succ, (C+Succ), D, Dom or the layout:
921  // BB, Succ, D, Dom, (C+Succ)
922  // is Qout + max(F, Qin) * U + min(F, Qin)
923  // compare P + U vs Qout + P * U + Qin.
924  //
925  // The 3rd and 4th cases cover when Dom would be chosen to follow Succ.
926  //
927  // For the 3rd case, the cost is P + 2 * V
928  // For the 4th case, the cost is Qout + min(Qin, F) * U + max(Qin, F) * V + V
929  // We choose 4 over 3 when (P + V) > Qout + min(Qin, F) * U + max(Qin, F) * V
930  if (UProb > AdjustedSuccSumProb / 2 &&
931  !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom], UProb, UProb,
932  Chain, BlockFilter))
933  // Cases 3 & 4
934  return greaterWithBias(
935  (P + V), (Qout + std::max(Qin, F) * VProb + std::min(Qin, F) * UProb),
936  EntryFreq);
937  // Cases 1 & 2
938  return greaterWithBias((P + U),
939  (Qout + std::min(Qin, F) * AdjustedSuccSumProb +
940  std::max(Qin, F) * UProb),
941  EntryFreq);
942 }
943 
944 /// Check for a trellis layout. \p BB is the upper part of a trellis if its
945 /// successors form the lower part of a trellis. A successor set S forms the
946 /// lower part of a trellis if all of the predecessors of S are either in S or
947 /// have all of S as successors. We ignore trellises where BB doesn't have 2
948 /// successors because for fewer than 2, it's trivial, and for 3 or greater they
949 /// are very uncommon and complex to compute optimally. Allowing edges within S
950 /// is not strictly a trellis, but the same algorithm works, so we allow it.
951 bool MachineBlockPlacement::isTrellis(
952  const MachineBasicBlock *BB,
953  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
954  const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
955  // Technically BB could form a trellis with branching factor higher than 2.
956  // But that's extremely uncommon.
957  if (BB->succ_size() != 2 || ViableSuccs.size() != 2)
958  return false;
959 
960  SmallPtrSet<const MachineBasicBlock *, 2> Successors(BB->succ_begin(),
961  BB->succ_end());
962  // To avoid reviewing the same predecessors twice.
964 
965  for (MachineBasicBlock *Succ : ViableSuccs) {
966  int PredCount = 0;
967  for (auto SuccPred : Succ->predecessors()) {
968  // Allow triangle successors, but don't count them.
969  if (Successors.count(SuccPred)) {
970  // Make sure that it is actually a triangle.
971  for (MachineBasicBlock *CheckSucc : SuccPred->successors())
972  if (!Successors.count(CheckSucc))
973  return false;
974  continue;
975  }
976  const BlockChain *PredChain = BlockToChain[SuccPred];
977  if (SuccPred == BB || (BlockFilter && !BlockFilter->count(SuccPred)) ||
978  PredChain == &Chain || PredChain == BlockToChain[Succ])
979  continue;
980  ++PredCount;
981  // Perform the successor check only once.
982  if (!SeenPreds.insert(SuccPred).second)
983  continue;
984  if (!hasSameSuccessors(*SuccPred, Successors))
985  return false;
986  }
987  // If one of the successors has only BB as a predecessor, it is not a
988  // trellis.
989  if (PredCount < 1)
990  return false;
991  }
992  return true;
993 }
994 
995 /// Pick the highest total weight pair of edges that can both be laid out.
996 /// The edges in \p Edges[0] are assumed to have a different destination than
997 /// the edges in \p Edges[1]. Simple counting shows that the best pair is either
998 /// the individual highest weight edges to the 2 different destinations, or in
999 /// case of a conflict, one of them should be replaced with a 2nd best edge.
1000 std::pair<MachineBlockPlacement::WeightedEdge,
1001  MachineBlockPlacement::WeightedEdge>
1002 MachineBlockPlacement::getBestNonConflictingEdges(
1003  const MachineBasicBlock *BB,
1005  Edges) {
1006  // Sort the edges, and then for each successor, find the best incoming
1007  // predecessor. If the best incoming predecessors aren't the same,
1008  // then that is clearly the best layout. If there is a conflict, one of the
1009  // successors will have to fallthrough from the second best predecessor. We
1010  // compare which combination is better overall.
1011 
1012  // Sort for highest frequency.
1013  auto Cmp = [](WeightedEdge A, WeightedEdge B) { return A.Weight > B.Weight; };
1014 
1015  llvm::stable_sort(Edges[0], Cmp);
1016  llvm::stable_sort(Edges[1], Cmp);
1017  auto BestA = Edges[0].begin();
1018  auto BestB = Edges[1].begin();
1019  // Arrange for the correct answer to be in BestA and BestB
1020  // If the 2 best edges don't conflict, the answer is already there.
1021  if (BestA->Src == BestB->Src) {
1022  // Compare the total fallthrough of (Best + Second Best) for both pairs
1023  auto SecondBestA = std::next(BestA);
1024  auto SecondBestB = std::next(BestB);
1025  BlockFrequency BestAScore = BestA->Weight + SecondBestB->Weight;
1026  BlockFrequency BestBScore = BestB->Weight + SecondBestA->Weight;
1027  if (BestAScore < BestBScore)
1028  BestA = SecondBestA;
1029  else
1030  BestB = SecondBestB;
1031  }
1032  // Arrange for the BB edge to be in BestA if it exists.
1033  if (BestB->Src == BB)
1034  std::swap(BestA, BestB);
1035  return std::make_pair(*BestA, *BestB);
1036 }
1037 
1038 /// Get the best successor from \p BB based on \p BB being part of a trellis.
1039 /// We only handle trellises with 2 successors, so the algorithm is
1040 /// straightforward: Find the best pair of edges that don't conflict. We find
1041 /// the best incoming edge for each successor in the trellis. If those conflict,
1042 /// we consider which of them should be replaced with the second best.
1043 /// Upon return the two best edges will be in \p BestEdges. If one of the edges
1044 /// comes from \p BB, it will be in \p BestEdges[0]
1045 MachineBlockPlacement::BlockAndTailDupResult
1046 MachineBlockPlacement::getBestTrellisSuccessor(
1047  const MachineBasicBlock *BB,
1048  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
1049  BranchProbability AdjustedSumProb, const BlockChain &Chain,
1050  const BlockFilterSet *BlockFilter) {
1051 
1052  BlockAndTailDupResult Result = {nullptr, false};
1053  SmallPtrSet<const MachineBasicBlock *, 4> Successors(BB->succ_begin(),
1054  BB->succ_end());
1055 
1056  // We assume size 2 because it's common. For general n, we would have to do
1057  // the Hungarian algorithm, but it's not worth the complexity because more
1058  // than 2 successors is fairly uncommon, and a trellis even more so.
1059  if (Successors.size() != 2 || ViableSuccs.size() != 2)
1060  return Result;
1061 
1062  // Collect the edge frequencies of all edges that form the trellis.
1064  int SuccIndex = 0;
1065  for (auto Succ : ViableSuccs) {
1066  for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
1067  // Skip any placed predecessors that are not BB
1068  if (SuccPred != BB)
1069  if ((BlockFilter && !BlockFilter->count(SuccPred)) ||
1070  BlockToChain[SuccPred] == &Chain ||
1071  BlockToChain[SuccPred] == BlockToChain[Succ])
1072  continue;
1073  BlockFrequency EdgeFreq = MBFI->getBlockFreq(SuccPred) *
1074  MBPI->getEdgeProbability(SuccPred, Succ);
1075  Edges[SuccIndex].push_back({EdgeFreq, SuccPred, Succ});
1076  }
1077  ++SuccIndex;
1078  }
1079 
1080  // Pick the best combination of 2 edges from all the edges in the trellis.
1081  WeightedEdge BestA, BestB;
1082  std::tie(BestA, BestB) = getBestNonConflictingEdges(BB, Edges);
1083 
1084  if (BestA.Src != BB) {
1085  // If we have a trellis, and BB doesn't have the best fallthrough edges,
1086  // we shouldn't choose any successor. We've already looked and there's a
1087  // better fallthrough edge for all the successors.
1088  LLVM_DEBUG(dbgs() << "Trellis, but not one of the chosen edges.\n");
1089  return Result;
1090  }
1091 
1092  // Did we pick the triangle edge? If tail-duplication is profitable, do
1093  // that instead. Otherwise merge the triangle edge now while we know it is
1094  // optimal.
1095  if (BestA.Dest == BestB.Src) {
1096  // The edges are BB->Succ1->Succ2, and we're looking to see if BB->Succ2
1097  // would be better.
1098  MachineBasicBlock *Succ1 = BestA.Dest;
1099  MachineBasicBlock *Succ2 = BestB.Dest;
1100  // Check to see if tail-duplication would be profitable.
1101  if (allowTailDupPlacement() && shouldTailDuplicate(Succ2) &&
1102  canTailDuplicateUnplacedPreds(BB, Succ2, Chain, BlockFilter) &&
1103  isProfitableToTailDup(BB, Succ2, MBPI->getEdgeProbability(BB, Succ1),
1104  Chain, BlockFilter)) {
1106  MBPI->getEdgeProbability(BB, Succ2), AdjustedSumProb);
1107  dbgs() << " Selected: " << getBlockName(Succ2)
1108  << ", probability: " << Succ2Prob
1109  << " (Tail Duplicate)\n");
1110  Result.BB = Succ2;
1111  Result.ShouldTailDup = true;
1112  return Result;
1113  }
1114  }
1115  // We have already computed the optimal edge for the other side of the
1116  // trellis.
1117  ComputedEdges[BestB.Src] = { BestB.Dest, false };
1118 
1119  auto TrellisSucc = BestA.Dest;
1121  MBPI->getEdgeProbability(BB, TrellisSucc), AdjustedSumProb);
1122  dbgs() << " Selected: " << getBlockName(TrellisSucc)
1123  << ", probability: " << SuccProb << " (Trellis)\n");
1124  Result.BB = TrellisSucc;
1125  return Result;
1126 }
1127 
1128 /// When the option allowTailDupPlacement() is on, this method checks if the
1129 /// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated
1130 /// into all of its unplaced, unfiltered predecessors, that are not BB.
1131 bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
1132  const MachineBasicBlock *BB, MachineBasicBlock *Succ,
1133  const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
1134  if (!shouldTailDuplicate(Succ))
1135  return false;
1136 
1137  // The result of canTailDuplicate.
1138  bool Duplicate = true;
1139  // Number of possible duplication.
1140  unsigned int NumDup = 0;
1141 
1142  // For CFG checking.
1143  SmallPtrSet<const MachineBasicBlock *, 4> Successors(BB->succ_begin(),
1144  BB->succ_end());
1145  for (MachineBasicBlock *Pred : Succ->predecessors()) {
1146  // Make sure all unplaced and unfiltered predecessors can be
1147  // tail-duplicated into.
1148  // Skip any blocks that are already placed or not in this loop.
1149  if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred))
1150  || BlockToChain[Pred] == &Chain)
1151  continue;
1152  if (!TailDup.canTailDuplicate(Succ, Pred)) {
1153  if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors))
1154  // This will result in a trellis after tail duplication, so we don't
1155  // need to copy Succ into this predecessor. In the presence
1156  // of a trellis tail duplication can continue to be profitable.
1157  // For example:
1158  // A A
1159  // |\ |\
1160  // | \ | \
1161  // | C | C+BB
1162  // | / | |
1163  // |/ | |
1164  // BB => BB |
1165  // |\ |\/|
1166  // | \ |/\|
1167  // | D | D
1168  // | / | /
1169  // |/ |/
1170  // Succ Succ
1171  //
1172  // After BB was duplicated into C, the layout looks like the one on the
1173  // right. BB and C now have the same successors. When considering
1174  // whether Succ can be duplicated into all its unplaced predecessors, we
1175  // ignore C.
1176  // We can do this because C already has a profitable fallthrough, namely
1177  // D. TODO(iteratee): ignore sufficiently cold predecessors for
1178  // duplication and for this test.
1179  //
1180  // This allows trellises to be laid out in 2 separate chains
1181  // (A,B,Succ,...) and later (C,D,...) This is a reasonable heuristic
1182  // because it allows the creation of 2 fallthrough paths with links
1183  // between them, and we correctly identify the best layout for these
1184  // CFGs. We want to extend trellises that the user created in addition
1185  // to trellises created by tail-duplication, so we just look for the
1186  // CFG.
1187  continue;
1188  Duplicate = false;
1189  continue;
1190  }
1191  NumDup++;
1192  }
1193 
1194  // No possible duplication in current filter set.
1195  if (NumDup == 0)
1196  return false;
1197 
1198  // If profile information is available, findDuplicateCandidates can do more
1199  // precise benefit analysis.
1200  if (F->getFunction().hasProfileData())
1201  return true;
1202 
1203  // This is mainly for function exit BB.
1204  // The integrated tail duplication is really designed for increasing
1205  // fallthrough from predecessors from Succ to its successors. We may need
1206  // other machanism to handle different cases.
1207  if (Succ->succ_empty())
1208  return true;
1209 
1210  // Plus the already placed predecessor.
1211  NumDup++;
1212 
1213  // If the duplication candidate has more unplaced predecessors than
1214  // successors, the extra duplication can't bring more fallthrough.
1215  //
1216  // Pred1 Pred2 Pred3
1217  // \ | /
1218  // \ | /
1219  // \ | /
1220  // Dup
1221  // / \
1222  // / \
1223  // Succ1 Succ2
1224  //
1225  // In this example Dup has 2 successors and 3 predecessors, duplication of Dup
1226  // can increase the fallthrough from Pred1 to Succ1 and from Pred2 to Succ2,
1227  // but the duplication into Pred3 can't increase fallthrough.
1228  //
1229  // A small number of extra duplication may not hurt too much. We need a better
1230  // heuristic to handle it.
1231  if ((NumDup > Succ->succ_size()) || !Duplicate)
1232  return false;
1233 
1234  return true;
1235 }
1236 
1237 /// Find chains of triangles where we believe it would be profitable to
1238 /// tail-duplicate them all, but a local analysis would not find them.
1239 /// There are 3 ways this can be profitable:
1240 /// 1) The post-dominators marked 50% are actually taken 55% (This shrinks with
1241 /// longer chains)
1242 /// 2) The chains are statically correlated. Branch probabilities have a very
1243 /// U-shaped distribution.
1244 /// [http://nrs.harvard.edu/urn-3:HUL.InstRepos:24015805]
1245 /// If the branches in a chain are likely to be from the same side of the
1246 /// distribution as their predecessor, but are independent at runtime, this
1247 /// transformation is profitable. (Because the cost of being wrong is a small
1248 /// fixed cost, unlike the standard triangle layout where the cost of being
1249 /// wrong scales with the # of triangles.)
1250 /// 3) The chains are dynamically correlated. If the probability that a previous
1251 /// branch was taken positively influences whether the next branch will be
1252 /// taken
1253 /// We believe that 2 and 3 are common enough to justify the small margin in 1.
1254 void MachineBlockPlacement::precomputeTriangleChains() {
1255  struct TriangleChain {
1256  std::vector<MachineBasicBlock *> Edges;
1257 
1258  TriangleChain(MachineBasicBlock *src, MachineBasicBlock *dst)
1259  : Edges({src, dst}) {}
1260 
1261  void append(MachineBasicBlock *dst) {
1262  assert(getKey()->isSuccessor(dst) &&
1263  "Attempting to append a block that is not a successor.");
1264  Edges.push_back(dst);
1265  }
1266 
1267  unsigned count() const { return Edges.size() - 1; }
1268 
1269  MachineBasicBlock *getKey() const {
1270  return Edges.back();
1271  }
1272  };
1273 
1274  if (TriangleChainCount == 0)
1275  return;
1276 
1277  LLVM_DEBUG(dbgs() << "Pre-computing triangle chains.\n");
1278  // Map from last block to the chain that contains it. This allows us to extend
1279  // chains as we find new triangles.
1281  for (MachineBasicBlock &BB : *F) {
1282  // If BB doesn't have 2 successors, it doesn't start a triangle.
1283  if (BB.succ_size() != 2)
1284  continue;
1285  MachineBasicBlock *PDom = nullptr;
1286  for (MachineBasicBlock *Succ : BB.successors()) {
1287  if (!MPDT->dominates(Succ, &BB))
1288  continue;
1289  PDom = Succ;
1290  break;
1291  }
1292  // If BB doesn't have a post-dominating successor, it doesn't form a
1293  // triangle.
1294  if (PDom == nullptr)
1295  continue;
1296  // If PDom has a hint that it is low probability, skip this triangle.
1297  if (MBPI->getEdgeProbability(&BB, PDom) < BranchProbability(50, 100))
1298  continue;
1299  // If PDom isn't eligible for duplication, this isn't the kind of triangle
1300  // we're looking for.
1301  if (!shouldTailDuplicate(PDom))
1302  continue;
1303  bool CanTailDuplicate = true;
1304  // If PDom can't tail-duplicate into it's non-BB predecessors, then this
1305  // isn't the kind of triangle we're looking for.
1306  for (MachineBasicBlock* Pred : PDom->predecessors()) {
1307  if (Pred == &BB)
1308  continue;
1309  if (!TailDup.canTailDuplicate(PDom, Pred)) {
1310  CanTailDuplicate = false;
1311  break;
1312  }
1313  }
1314  // If we can't tail-duplicate PDom to its predecessors, then skip this
1315  // triangle.
1316  if (!CanTailDuplicate)
1317  continue;
1318 
1319  // Now we have an interesting triangle. Insert it if it's not part of an
1320  // existing chain.
1321  // Note: This cannot be replaced with a call insert() or emplace() because
1322  // the find key is BB, but the insert/emplace key is PDom.
1323  auto Found = TriangleChainMap.find(&BB);
1324  // If it is, remove the chain from the map, grow it, and put it back in the
1325  // map with the end as the new key.
1326  if (Found != TriangleChainMap.end()) {
1327  TriangleChain Chain = std::move(Found->second);
1328  TriangleChainMap.erase(Found);
1329  Chain.append(PDom);
1330  TriangleChainMap.insert(std::make_pair(Chain.getKey(), std::move(Chain)));
1331  } else {
1332  auto InsertResult = TriangleChainMap.try_emplace(PDom, &BB, PDom);
1333  assert(InsertResult.second && "Block seen twice.");
1334  (void)InsertResult;
1335  }
1336  }
1337 
1338  // Iterating over a DenseMap is safe here, because the only thing in the body
1339  // of the loop is inserting into another DenseMap (ComputedEdges).
1340  // ComputedEdges is never iterated, so this doesn't lead to non-determinism.
1341  for (auto &ChainPair : TriangleChainMap) {
1342  TriangleChain &Chain = ChainPair.second;
1343  // Benchmarking has shown that due to branch correlation duplicating 2 or
1344  // more triangles is profitable, despite the calculations assuming
1345  // independence.
1346  if (Chain.count() < TriangleChainCount)
1347  continue;
1348  MachineBasicBlock *dst = Chain.Edges.back();
1349  Chain.Edges.pop_back();
1350  for (MachineBasicBlock *src : reverse(Chain.Edges)) {
1351  LLVM_DEBUG(dbgs() << "Marking edge: " << getBlockName(src) << "->"
1352  << getBlockName(dst)
1353  << " as pre-computed based on triangles.\n");
1354 
1355  auto InsertResult = ComputedEdges.insert({src, {dst, true}});
1356  assert(InsertResult.second && "Block seen twice.");
1357  (void)InsertResult;
1358 
1359  dst = src;
1360  }
1361  }
1362 }
1363 
1364 // When profile is not present, return the StaticLikelyProb.
1365 // When profile is available, we need to handle the triangle-shape CFG.
1367  const MachineBasicBlock *BB) {
1368  if (!BB->getParent()->getFunction().hasProfileData())
1369  return BranchProbability(StaticLikelyProb, 100);
1370  if (BB->succ_size() == 2) {
1371  const MachineBasicBlock *Succ1 = *BB->succ_begin();
1372  const MachineBasicBlock *Succ2 = *(BB->succ_begin() + 1);
1373  if (Succ1->isSuccessor(Succ2) || Succ2->isSuccessor(Succ1)) {
1374  /* See case 1 below for the cost analysis. For BB->Succ to
1375  * be taken with smaller cost, the following needs to hold:
1376  * Prob(BB->Succ) > 2 * Prob(BB->Pred)
1377  * So the threshold T in the calculation below
1378  * (1-T) * Prob(BB->Succ) > T * Prob(BB->Pred)
1379  * So T / (1 - T) = 2, Yielding T = 2/3
1380  * Also adding user specified branch bias, we have
1381  * T = (2/3)*(ProfileLikelyProb/50)
1382  * = (2*ProfileLikelyProb)/150)
1383  */
1384  return BranchProbability(2 * ProfileLikelyProb, 150);
1385  }
1386  }
1387  return BranchProbability(ProfileLikelyProb, 100);
1388 }
1389 
1390 /// Checks to see if the layout candidate block \p Succ has a better layout
1391 /// predecessor than \c BB. If yes, returns true.
1392 /// \p SuccProb: The probability adjusted for only remaining blocks.
1393 /// Only used for logging
1394 /// \p RealSuccProb: The un-adjusted probability.
1395 /// \p Chain: The chain that BB belongs to and Succ is being considered for.
1396 /// \p BlockFilter: if non-null, the set of blocks that make up the loop being
1397 /// considered
1398 bool MachineBlockPlacement::hasBetterLayoutPredecessor(
1399  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
1400  const BlockChain &SuccChain, BranchProbability SuccProb,
1401  BranchProbability RealSuccProb, const BlockChain &Chain,
1402  const BlockFilterSet *BlockFilter) {
1403 
1404  // There isn't a better layout when there are no unscheduled predecessors.
1405  if (SuccChain.UnscheduledPredecessors == 0)
1406  return false;
1407 
1408  // There are two basic scenarios here:
1409  // -------------------------------------
1410  // Case 1: triangular shape CFG (if-then):
1411  // BB
1412  // | \
1413  // | \
1414  // | Pred
1415  // | /
1416  // Succ
1417  // In this case, we are evaluating whether to select edge -> Succ, e.g.
1418  // set Succ as the layout successor of BB. Picking Succ as BB's
1419  // successor breaks the CFG constraints (FIXME: define these constraints).
1420  // With this layout, Pred BB
1421  // is forced to be outlined, so the overall cost will be cost of the
1422  // branch taken from BB to Pred, plus the cost of back taken branch
1423  // from Pred to Succ, as well as the additional cost associated
1424  // with the needed unconditional jump instruction from Pred To Succ.
1425 
1426  // The cost of the topological order layout is the taken branch cost
1427  // from BB to Succ, so to make BB->Succ a viable candidate, the following
1428  // must hold:
1429  // 2 * freq(BB->Pred) * taken_branch_cost + unconditional_jump_cost
1430  // < freq(BB->Succ) * taken_branch_cost.
1431  // Ignoring unconditional jump cost, we get
1432  // freq(BB->Succ) > 2 * freq(BB->Pred), i.e.,
1433  // prob(BB->Succ) > 2 * prob(BB->Pred)
1434  //
1435  // When real profile data is available, we can precisely compute the
1436  // probability threshold that is needed for edge BB->Succ to be considered.
1437  // Without profile data, the heuristic requires the branch bias to be
1438  // a lot larger to make sure the signal is very strong (e.g. 80% default).
1439  // -----------------------------------------------------------------
1440  // Case 2: diamond like CFG (if-then-else):
1441  // S
1442  // / \
1443  // | \
1444  // BB Pred
1445  // \ /
1446  // Succ
1447  // ..
1448  //
1449  // The current block is BB and edge BB->Succ is now being evaluated.
1450  // Note that edge S->BB was previously already selected because
1451  // prob(S->BB) > prob(S->Pred).
1452  // At this point, 2 blocks can be placed after BB: Pred or Succ. If we
1453  // choose Pred, we will have a topological ordering as shown on the left
1454  // in the picture below. If we choose Succ, we have the solution as shown
1455  // on the right:
1456  //
1457  // topo-order:
1458  //
1459  // S----- ---S
1460  // | | | |
1461  // ---BB | | BB
1462  // | | | |
1463  // | Pred-- | Succ--
1464  // | | | |
1465  // ---Succ ---Pred--
1466  //
1467  // cost = freq(S->Pred) + freq(BB->Succ) cost = 2 * freq (S->Pred)
1468  // = freq(S->Pred) + freq(S->BB)
1469  //
1470  // If we have profile data (i.e, branch probabilities can be trusted), the
1471  // cost (number of taken branches) with layout S->BB->Succ->Pred is 2 *
1472  // freq(S->Pred) while the cost of topo order is freq(S->Pred) + freq(S->BB).
1473  // We know Prob(S->BB) > Prob(S->Pred), so freq(S->BB) > freq(S->Pred), which
1474  // means the cost of topological order is greater.
1475  // When profile data is not available, however, we need to be more
1476  // conservative. If the branch prediction is wrong, breaking the topo-order
1477  // will actually yield a layout with large cost. For this reason, we need
1478  // strong biased branch at block S with Prob(S->BB) in order to select
1479  // BB->Succ. This is equivalent to looking the CFG backward with backward
1480  // edge: Prob(Succ->BB) needs to >= HotProb in order to be selected (without
1481  // profile data).
1482  // --------------------------------------------------------------------------
1483  // Case 3: forked diamond
1484  // S
1485  // / \
1486  // / \
1487  // BB Pred
1488  // | \ / |
1489  // | \ / |
1490  // | X |
1491  // | / \ |
1492  // | / \ |
1493  // S1 S2
1494  //
1495  // The current block is BB and edge BB->S1 is now being evaluated.
1496  // As above S->BB was already selected because
1497  // prob(S->BB) > prob(S->Pred). Assume that prob(BB->S1) >= prob(BB->S2).
1498  //
1499  // topo-order:
1500  //
1501  // S-------| ---S
1502  // | | | |
1503  // ---BB | | BB
1504  // | | | |
1505  // | Pred----| | S1----
1506  // | | | |
1507  // --(S1 or S2) ---Pred--
1508  // |
1509  // S2
1510  //
1511  // topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2)
1512  // + min(freq(Pred->S1), freq(Pred->S2))
1513  // Non-topo-order cost:
1514  // non-topo-cost = 2 * freq(S->Pred) + freq(BB->S2).
1515  // To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2))
1516  // is 0. Then the non topo layout is better when
1517  // freq(S->Pred) < freq(BB->S1).
1518  // This is exactly what is checked below.
1519  // Note there are other shapes that apply (Pred may not be a single block,
1520  // but they all fit this general pattern.)
1522 
1523  // Make sure that a hot successor doesn't have a globally more
1524  // important predecessor.
1525  BlockFrequency CandidateEdgeFreq = MBFI->getBlockFreq(BB) * RealSuccProb;
1526  bool BadCFGConflict = false;
1527 
1528  for (MachineBasicBlock *Pred : Succ->predecessors()) {
1529  BlockChain *PredChain = BlockToChain[Pred];
1530  if (Pred == Succ || PredChain == &SuccChain ||
1531  (BlockFilter && !BlockFilter->count(Pred)) ||
1532  PredChain == &Chain || Pred != *std::prev(PredChain->end()) ||
1533  // This check is redundant except for look ahead. This function is
1534  // called for lookahead by isProfitableToTailDup when BB hasn't been
1535  // placed yet.
1536  (Pred == BB))
1537  continue;
1538  // Do backward checking.
1539  // For all cases above, we need a backward checking to filter out edges that
1540  // are not 'strongly' biased.
1541  // BB Pred
1542  // \ /
1543  // Succ
1544  // We select edge BB->Succ if
1545  // freq(BB->Succ) > freq(Succ) * HotProb
1546  // i.e. freq(BB->Succ) > freq(BB->Succ) * HotProb + freq(Pred->Succ) *
1547  // HotProb
1548  // i.e. freq((BB->Succ) * (1 - HotProb) > freq(Pred->Succ) * HotProb
1549  // Case 1 is covered too, because the first equation reduces to:
1550  // prob(BB->Succ) > HotProb. (freq(Succ) = freq(BB) for a triangle)
1551  BlockFrequency PredEdgeFreq =
1552  MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Succ);
1553  if (PredEdgeFreq * HotProb >= CandidateEdgeFreq * HotProb.getCompl()) {
1554  BadCFGConflict = true;
1555  break;
1556  }
1557  }
1558 
1559  if (BadCFGConflict) {
1560  LLVM_DEBUG(dbgs() << " Not a candidate: " << getBlockName(Succ) << " -> "
1561  << SuccProb << " (prob) (non-cold CFG conflict)\n");
1562  return true;
1563  }
1564 
1565  return false;
1566 }
1567 
1568 /// Select the best successor for a block.
1569 ///
1570 /// This looks across all successors of a particular block and attempts to
1571 /// select the "best" one to be the layout successor. It only considers direct
1572 /// successors which also pass the block filter. It will attempt to avoid
1573 /// breaking CFG structure, but cave and break such structures in the case of
1574 /// very hot successor edges.
1575 ///
1576 /// \returns The best successor block found, or null if none are viable, along
1577 /// with a boolean indicating if tail duplication is necessary.
1578 MachineBlockPlacement::BlockAndTailDupResult
1579 MachineBlockPlacement::selectBestSuccessor(
1580  const MachineBasicBlock *BB, const BlockChain &Chain,
1581  const BlockFilterSet *BlockFilter) {
1582  const BranchProbability HotProb(StaticLikelyProb, 100);
1583 
1584  BlockAndTailDupResult BestSucc = { nullptr, false };
1585  auto BestProb = BranchProbability::getZero();
1586 
1588  auto AdjustedSumProb =
1589  collectViableSuccessors(BB, Chain, BlockFilter, Successors);
1590 
1591  LLVM_DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB)
1592  << "\n");
1593 
1594  // if we already precomputed the best successor for BB, return that if still
1595  // applicable.
1596  auto FoundEdge = ComputedEdges.find(BB);
1597  if (FoundEdge != ComputedEdges.end()) {
1598  MachineBasicBlock *Succ = FoundEdge->second.BB;
1599  ComputedEdges.erase(FoundEdge);
1600  BlockChain *SuccChain = BlockToChain[Succ];
1601  if (BB->isSuccessor(Succ) && (!BlockFilter || BlockFilter->count(Succ)) &&
1602  SuccChain != &Chain && Succ == *SuccChain->begin())
1603  return FoundEdge->second;
1604  }
1605 
1606  // if BB is part of a trellis, Use the trellis to determine the optimal
1607  // fallthrough edges
1608  if (isTrellis(BB, Successors, Chain, BlockFilter))
1609  return getBestTrellisSuccessor(BB, Successors, AdjustedSumProb, Chain,
1610  BlockFilter);
1611 
1612  // For blocks with CFG violations, we may be able to lay them out anyway with
1613  // tail-duplication. We keep this vector so we can perform the probability
1614  // calculations the minimum number of times.
1616  DupCandidates;
1617  for (MachineBasicBlock *Succ : Successors) {
1618  auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);
1619  BranchProbability SuccProb =
1620  getAdjustedProbability(RealSuccProb, AdjustedSumProb);
1621 
1622  BlockChain &SuccChain = *BlockToChain[Succ];
1623  // Skip the edge \c BB->Succ if block \c Succ has a better layout
1624  // predecessor that yields lower global cost.
1625  if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb,
1626  Chain, BlockFilter)) {
1627  // If tail duplication would make Succ profitable, place it.
1628  if (allowTailDupPlacement() && shouldTailDuplicate(Succ))
1629  DupCandidates.emplace_back(SuccProb, Succ);
1630  continue;
1631  }
1632 
1633  LLVM_DEBUG(
1634  dbgs() << " Candidate: " << getBlockName(Succ)
1635  << ", probability: " << SuccProb
1636  << (SuccChain.UnscheduledPredecessors != 0 ? " (CFG break)" : "")
1637  << "\n");
1638 
1639  if (BestSucc.BB && BestProb >= SuccProb) {
1640  LLVM_DEBUG(dbgs() << " Not the best candidate, continuing\n");
1641  continue;
1642  }
1643 
1644  LLVM_DEBUG(dbgs() << " Setting it as best candidate\n");
1645  BestSucc.BB = Succ;
1646  BestProb = SuccProb;
1647  }
1648  // Handle the tail duplication candidates in order of decreasing probability.
1649  // Stop at the first one that is profitable. Also stop if they are less
1650  // profitable than BestSucc. Position is important because we preserve it and
1651  // prefer first best match. Here we aren't comparing in order, so we capture
1652  // the position instead.
1653  llvm::stable_sort(DupCandidates,
1654  [](std::tuple<BranchProbability, MachineBasicBlock *> L,
1655  std::tuple<BranchProbability, MachineBasicBlock *> R) {
1656  return std::get<0>(L) > std::get<0>(R);
1657  });
1658  for (auto &Tup : DupCandidates) {
1659  BranchProbability DupProb;
1660  MachineBasicBlock *Succ;
1661  std::tie(DupProb, Succ) = Tup;
1662  if (DupProb < BestProb)
1663  break;
1664  if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter)
1665  && (isProfitableToTailDup(BB, Succ, BestProb, Chain, BlockFilter))) {
1666  LLVM_DEBUG(dbgs() << " Candidate: " << getBlockName(Succ)
1667  << ", probability: " << DupProb
1668  << " (Tail Duplicate)\n");
1669  BestSucc.BB = Succ;
1670  BestSucc.ShouldTailDup = true;
1671  break;
1672  }
1673  }
1674 
1675  if (BestSucc.BB)
1676  LLVM_DEBUG(dbgs() << " Selected: " << getBlockName(BestSucc.BB) << "\n");
1677 
1678  return BestSucc;
1679 }
1680 
1681 /// Select the best block from a worklist.
1682 ///
1683 /// This looks through the provided worklist as a list of candidate basic
1684 /// blocks and select the most profitable one to place. The definition of
1685 /// profitable only really makes sense in the context of a loop. This returns
1686 /// the most frequently visited block in the worklist, which in the case of
1687 /// a loop, is the one most desirable to be physically close to the rest of the
1688 /// loop body in order to improve i-cache behavior.
1689 ///
1690 /// \returns The best block found, or null if none are viable.
1691 MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
1692  const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) {
1693  // Once we need to walk the worklist looking for a candidate, cleanup the
1694  // worklist of already placed entries.
1695  // FIXME: If this shows up on profiles, it could be folded (at the cost of
1696  // some code complexity) into the loop below.
1697  llvm::erase_if(WorkList, [&](MachineBasicBlock *BB) {
1698  return BlockToChain.lookup(BB) == &Chain;
1699  });
1700 
1701  if (WorkList.empty())
1702  return nullptr;
1703 
1704  bool IsEHPad = WorkList[0]->isEHPad();
1705 
1706  MachineBasicBlock *BestBlock = nullptr;
1707  BlockFrequency BestFreq;
1708  for (MachineBasicBlock *MBB : WorkList) {
1709  assert(MBB->isEHPad() == IsEHPad &&
1710  "EHPad mismatch between block and work list.");
1711 
1712  BlockChain &SuccChain = *BlockToChain[MBB];
1713  if (&SuccChain == &Chain)
1714  continue;
1715 
1716  assert(SuccChain.UnscheduledPredecessors == 0 &&
1717  "Found CFG-violating block");
1718 
1719  BlockFrequency CandidateFreq = MBFI->getBlockFreq(MBB);
1720  LLVM_DEBUG(dbgs() << " " << getBlockName(MBB) << " -> ";
1721  MBFI->printBlockFreq(dbgs(), CandidateFreq) << " (freq)\n");
1722 
1723  // For ehpad, we layout the least probable first as to avoid jumping back
1724  // from least probable landingpads to more probable ones.
1725  //
1726  // FIXME: Using probability is probably (!) not the best way to achieve
1727  // this. We should probably have a more principled approach to layout
1728  // cleanup code.
1729  //
1730  // The goal is to get:
1731  //
1732  // +--------------------------+
1733  // | V
1734  // InnerLp -> InnerCleanup OuterLp -> OuterCleanup -> Resume
1735  //
1736  // Rather than:
1737  //
1738  // +-------------------------------------+
1739  // V |
1740  // OuterLp -> OuterCleanup -> Resume InnerLp -> InnerCleanup
1741  if (BestBlock && (IsEHPad ^ (BestFreq >= CandidateFreq)))
1742  continue;
1743 
1744  BestBlock = MBB;
1745  BestFreq = CandidateFreq;
1746  }
1747 
1748  return BestBlock;
1749 }
1750 
1751 /// Retrieve the first unplaced basic block.
1752 ///
1753 /// This routine is called when we are unable to use the CFG to walk through
1754 /// all of the basic blocks and form a chain due to unnatural loops in the CFG.
1755 /// We walk through the function's blocks in order, starting from the
1756 /// LastUnplacedBlockIt. We update this iterator on each call to avoid
1757 /// re-scanning the entire sequence on repeated calls to this routine.
1758 MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
1759  const BlockChain &PlacedChain,
1760  MachineFunction::iterator &PrevUnplacedBlockIt,
1761  const BlockFilterSet *BlockFilter) {
1762  for (MachineFunction::iterator I = PrevUnplacedBlockIt, E = F->end(); I != E;
1763  ++I) {
1764  if (BlockFilter && !BlockFilter->count(&*I))
1765  continue;
1766  if (BlockToChain[&*I] != &PlacedChain) {
1767  PrevUnplacedBlockIt = I;
1768  // Now select the head of the chain to which the unplaced block belongs
1769  // as the block to place. This will force the entire chain to be placed,
1770  // and satisfies the requirements of merging chains.
1771  return *BlockToChain[&*I]->begin();
1772  }
1773  }
1774  return nullptr;
1775 }
1776 
1777 void MachineBlockPlacement::fillWorkLists(
1778  const MachineBasicBlock *MBB,
1779  SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
1780  const BlockFilterSet *BlockFilter = nullptr) {
1781  BlockChain &Chain = *BlockToChain[MBB];
1782  if (!UpdatedPreds.insert(&Chain).second)
1783  return;
1784 
1785  assert(
1786  Chain.UnscheduledPredecessors == 0 &&
1787  "Attempting to place block with unscheduled predecessors in worklist.");
1788  for (MachineBasicBlock *ChainBB : Chain) {
1789  assert(BlockToChain[ChainBB] == &Chain &&
1790  "Block in chain doesn't match BlockToChain map.");
1791  for (MachineBasicBlock *Pred : ChainBB->predecessors()) {
1792  if (BlockFilter && !BlockFilter->count(Pred))
1793  continue;
1794  if (BlockToChain[Pred] == &Chain)
1795  continue;
1796  ++Chain.UnscheduledPredecessors;
1797  }
1798  }
1799 
1800  if (Chain.UnscheduledPredecessors != 0)
1801  return;
1802 
1803  MachineBasicBlock *BB = *Chain.begin();
1804  if (BB->isEHPad())
1805  EHPadWorkList.push_back(BB);
1806  else
1807  BlockWorkList.push_back(BB);
1808 }
1809 
1810 void MachineBlockPlacement::buildChain(
1811  const MachineBasicBlock *HeadBB, BlockChain &Chain,
1812  BlockFilterSet *BlockFilter) {
1813  assert(HeadBB && "BB must not be null.\n");
1814  assert(BlockToChain[HeadBB] == &Chain && "BlockToChainMap mis-match.\n");
1815  MachineFunction::iterator PrevUnplacedBlockIt = F->begin();
1816 
1817  const MachineBasicBlock *LoopHeaderBB = HeadBB;
1818  markChainSuccessors(Chain, LoopHeaderBB, BlockFilter);
1819  MachineBasicBlock *BB = *std::prev(Chain.end());
1820  while (true) {
1821  assert(BB && "null block found at end of chain in loop.");
1822  assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match in loop.");
1823  assert(*std::prev(Chain.end()) == BB && "BB Not found at end of chain.");
1824 
1825 
1826  // Look for the best viable successor if there is one to place immediately
1827  // after this block.
1828  auto Result = selectBestSuccessor(BB, Chain, BlockFilter);
1829  MachineBasicBlock* BestSucc = Result.BB;
1830  bool ShouldTailDup = Result.ShouldTailDup;
1831  if (allowTailDupPlacement())
1832  ShouldTailDup |= (BestSucc && canTailDuplicateUnplacedPreds(BB, BestSucc,
1833  Chain,
1834  BlockFilter));
1835 
1836  // If an immediate successor isn't available, look for the best viable
1837  // block among those we've identified as not violating the loop's CFG at
1838  // this point. This won't be a fallthrough, but it will increase locality.
1839  if (!BestSucc)
1840  BestSucc = selectBestCandidateBlock(Chain, BlockWorkList);
1841  if (!BestSucc)
1842  BestSucc = selectBestCandidateBlock(Chain, EHPadWorkList);
1843 
1844  if (!BestSucc) {
1845  BestSucc = getFirstUnplacedBlock(Chain, PrevUnplacedBlockIt, BlockFilter);
1846  if (!BestSucc)
1847  break;
1848 
1849  LLVM_DEBUG(dbgs() << "Unnatural loop CFG detected, forcibly merging the "
1850  "layout successor until the CFG reduces\n");
1851  }
1852 
1853  // Placement may have changed tail duplication opportunities.
1854  // Check for that now.
1855  if (allowTailDupPlacement() && BestSucc && ShouldTailDup) {
1856  repeatedlyTailDuplicateBlock(BestSucc, BB, LoopHeaderBB, Chain,
1857  BlockFilter, PrevUnplacedBlockIt);
1858  // If the chosen successor was duplicated into BB, don't bother laying
1859  // it out, just go round the loop again with BB as the chain end.
1860  if (!BB->isSuccessor(BestSucc))
1861  continue;
1862  }
1863 
1864  // Place this block, updating the datastructures to reflect its placement.
1865  BlockChain &SuccChain = *BlockToChain[BestSucc];
1866  // Zero out UnscheduledPredecessors for the successor we're about to merge in case
1867  // we selected a successor that didn't fit naturally into the CFG.
1868  SuccChain.UnscheduledPredecessors = 0;
1869  LLVM_DEBUG(dbgs() << "Merging from " << getBlockName(BB) << " to "
1870  << getBlockName(BestSucc) << "\n");
1871  markChainSuccessors(SuccChain, LoopHeaderBB, BlockFilter);
1872  Chain.merge(BestSucc, &SuccChain);
1873  BB = *std::prev(Chain.end());
1874  }
1875 
1876  LLVM_DEBUG(dbgs() << "Finished forming chain for header block "
1877  << getBlockName(*Chain.begin()) << "\n");
1878 }
1879 
1880 // If bottom of block BB has only one successor OldTop, in most cases it is
1881 // profitable to move it before OldTop, except the following case:
1882 //
1883 // -->OldTop<-
1884 // | . |
1885 // | . |
1886 // | . |
1887 // ---Pred |
1888 // | |
1889 // BB-----
1890 //
1891 // If BB is moved before OldTop, Pred needs a taken branch to BB, and it can't
1892 // layout the other successor below it, so it can't reduce taken branch.
1893 // In this case we keep its original layout.
1894 bool
1895 MachineBlockPlacement::canMoveBottomBlockToTop(
1896  const MachineBasicBlock *BottomBlock,
1897  const MachineBasicBlock *OldTop) {
1898  if (BottomBlock->pred_size() != 1)
1899  return true;
1900  MachineBasicBlock *Pred = *BottomBlock->pred_begin();
1901  if (Pred->succ_size() != 2)
1902  return true;
1903 
1904  MachineBasicBlock *OtherBB = *Pred->succ_begin();
1905  if (OtherBB == BottomBlock)
1906  OtherBB = *Pred->succ_rbegin();
1907  if (OtherBB == OldTop)
1908  return false;
1909 
1910  return true;
1911 }
1912 
1913 // Find out the possible fall through frequence to the top of a loop.
1915 MachineBlockPlacement::TopFallThroughFreq(
1916  const MachineBasicBlock *Top,
1917  const BlockFilterSet &LoopBlockSet) {
1918  BlockFrequency MaxFreq = 0;
1919  for (MachineBasicBlock *Pred : Top->predecessors()) {
1920  BlockChain *PredChain = BlockToChain[Pred];
1921  if (!LoopBlockSet.count(Pred) &&
1922  (!PredChain || Pred == *std::prev(PredChain->end()))) {
1923  // Found a Pred block can be placed before Top.
1924  // Check if Top is the best successor of Pred.
1925  auto TopProb = MBPI->getEdgeProbability(Pred, Top);
1926  bool TopOK = true;
1927  for (MachineBasicBlock *Succ : Pred->successors()) {
1928  auto SuccProb = MBPI->getEdgeProbability(Pred, Succ);
1929  BlockChain *SuccChain = BlockToChain[Succ];
1930  // Check if Succ can be placed after Pred.
1931  // Succ should not be in any chain, or it is the head of some chain.
1932  if (!LoopBlockSet.count(Succ) && (SuccProb > TopProb) &&
1933  (!SuccChain || Succ == *SuccChain->begin())) {
1934  TopOK = false;
1935  break;
1936  }
1937  }
1938  if (TopOK) {
1939  BlockFrequency EdgeFreq = MBFI->getBlockFreq(Pred) *
1940  MBPI->getEdgeProbability(Pred, Top);
1941  if (EdgeFreq > MaxFreq)
1942  MaxFreq = EdgeFreq;
1943  }
1944  }
1945  }
1946  return MaxFreq;
1947 }
1948 
1949 // Compute the fall through gains when move NewTop before OldTop.
1950 //
1951 // In following diagram, edges marked as "-" are reduced fallthrough, edges
1952 // marked as "+" are increased fallthrough, this function computes
1953 //
1954 // SUM(increased fallthrough) - SUM(decreased fallthrough)
1955 //
1956 // |
1957 // | -
1958 // V
1959 // --->OldTop
1960 // | .
1961 // | .
1962 // +| . +
1963 // | Pred --->
1964 // | |-
1965 // | V
1966 // --- NewTop <---
1967 // |-
1968 // V
1969 //
1971 MachineBlockPlacement::FallThroughGains(
1972  const MachineBasicBlock *NewTop,
1973  const MachineBasicBlock *OldTop,
1974  const MachineBasicBlock *ExitBB,
1975  const BlockFilterSet &LoopBlockSet) {
1976  BlockFrequency FallThrough2Top = TopFallThroughFreq(OldTop, LoopBlockSet);
1977  BlockFrequency FallThrough2Exit = 0;
1978  if (ExitBB)
1979  FallThrough2Exit = MBFI->getBlockFreq(NewTop) *
1980  MBPI->getEdgeProbability(NewTop, ExitBB);
1981  BlockFrequency BackEdgeFreq = MBFI->getBlockFreq(NewTop) *
1982  MBPI->getEdgeProbability(NewTop, OldTop);
1983 
1984  // Find the best Pred of NewTop.
1985  MachineBasicBlock *BestPred = nullptr;
1986  BlockFrequency FallThroughFromPred = 0;
1987  for (MachineBasicBlock *Pred : NewTop->predecessors()) {
1988  if (!LoopBlockSet.count(Pred))
1989  continue;
1990  BlockChain *PredChain = BlockToChain[Pred];
1991  if (!PredChain || Pred == *std::prev(PredChain->end())) {
1992  BlockFrequency EdgeFreq = MBFI->getBlockFreq(Pred) *
1993  MBPI->getEdgeProbability(Pred, NewTop);
1994  if (EdgeFreq > FallThroughFromPred) {
1995  FallThroughFromPred = EdgeFreq;
1996  BestPred = Pred;
1997  }
1998  }
1999  }
2000 
2001  // If NewTop is not placed after Pred, another successor can be placed
2002  // after Pred.
2003  BlockFrequency NewFreq = 0;
2004  if (BestPred) {
2005  for (MachineBasicBlock *Succ : BestPred->successors()) {
2006  if ((Succ == NewTop) || (Succ == BestPred) || !LoopBlockSet.count(Succ))
2007  continue;
2008  if (ComputedEdges.find(Succ) != ComputedEdges.end())
2009  continue;
2010  BlockChain *SuccChain = BlockToChain[Succ];
2011  if ((SuccChain && (Succ != *SuccChain->begin())) ||
2012  (SuccChain == BlockToChain[BestPred]))
2013  continue;
2014  BlockFrequency EdgeFreq = MBFI->getBlockFreq(BestPred) *
2015  MBPI->getEdgeProbability(BestPred, Succ);
2016  if (EdgeFreq > NewFreq)
2017  NewFreq = EdgeFreq;
2018  }
2019  BlockFrequency OrigEdgeFreq = MBFI->getBlockFreq(BestPred) *
2020  MBPI->getEdgeProbability(BestPred, NewTop);
2021  if (NewFreq > OrigEdgeFreq) {
2022  // If NewTop is not the best successor of Pred, then Pred doesn't
2023  // fallthrough to NewTop. So there is no FallThroughFromPred and
2024  // NewFreq.
2025  NewFreq = 0;
2026  FallThroughFromPred = 0;
2027  }
2028  }
2029 
2030  BlockFrequency Result = 0;
2031  BlockFrequency Gains = BackEdgeFreq + NewFreq;
2032  BlockFrequency Lost = FallThrough2Top + FallThrough2Exit +
2033  FallThroughFromPred;
2034  if (Gains > Lost)
2035  Result = Gains - Lost;
2036  return Result;
2037 }
2038 
2039 /// Helper function of findBestLoopTop. Find the best loop top block
2040 /// from predecessors of old top.
2041 ///
2042 /// Look for a block which is strictly better than the old top for laying
2043 /// out before the old top of the loop. This looks for only two patterns:
2044 ///
2045 /// 1. a block has only one successor, the old loop top
2046 ///
2047 /// Because such a block will always result in an unconditional jump,
2048 /// rotating it in front of the old top is always profitable.
2049 ///
2050 /// 2. a block has two successors, one is old top, another is exit
2051 /// and it has more than one predecessors
2052 ///
2053 /// If it is below one of its predecessors P, only P can fall through to
2054 /// it, all other predecessors need a jump to it, and another conditional
2055 /// jump to loop header. If it is moved before loop header, all its
2056 /// predecessors jump to it, then fall through to loop header. So all its
2057 /// predecessors except P can reduce one taken branch.
2058 /// At the same time, move it before old top increases the taken branch
2059 /// to loop exit block, so the reduced taken branch will be compared with
2060 /// the increased taken branch to the loop exit block.
2062 MachineBlockPlacement::findBestLoopTopHelper(
2063  MachineBasicBlock *OldTop,
2064  const MachineLoop &L,
2065  const BlockFilterSet &LoopBlockSet) {
2066  // Check that the header hasn't been fused with a preheader block due to
2067  // crazy branches. If it has, we need to start with the header at the top to
2068  // prevent pulling the preheader into the loop body.
2069  BlockChain &HeaderChain = *BlockToChain[OldTop];
2070  if (!LoopBlockSet.count(*HeaderChain.begin()))
2071  return OldTop;
2072  if (OldTop != *HeaderChain.begin())
2073  return OldTop;
2074 
2075  LLVM_DEBUG(dbgs() << "Finding best loop top for: " << getBlockName(OldTop)
2076  << "\n");
2077 
2078  BlockFrequency BestGains = 0;
2079  MachineBasicBlock *BestPred = nullptr;
2080  for (MachineBasicBlock *Pred : OldTop->predecessors()) {
2081  if (!LoopBlockSet.count(Pred))
2082  continue;
2083  if (Pred == L.getHeader())
2084  continue;
2085  LLVM_DEBUG(dbgs() << " old top pred: " << getBlockName(Pred) << ", has "
2086  << Pred->succ_size() << " successors, ";
2087  MBFI->printBlockFreq(dbgs(), Pred) << " freq\n");
2088  if (Pred->succ_size() > 2)
2089  continue;
2090 
2091  MachineBasicBlock *OtherBB = nullptr;
2092  if (Pred->succ_size() == 2) {
2093  OtherBB = *Pred->succ_begin();
2094  if (OtherBB == OldTop)
2095  OtherBB = *Pred->succ_rbegin();
2096  }
2097 
2098  if (!canMoveBottomBlockToTop(Pred, OldTop))
2099  continue;
2100 
2101  BlockFrequency Gains = FallThroughGains(Pred, OldTop, OtherBB,
2102  LoopBlockSet);
2103  if ((Gains > 0) && (Gains > BestGains ||
2104  ((Gains == BestGains) && Pred->isLayoutSuccessor(OldTop)))) {
2105  BestPred = Pred;
2106  BestGains = Gains;
2107  }
2108  }
2109 
2110  // If no direct predecessor is fine, just use the loop header.
2111  if (!BestPred) {
2112  LLVM_DEBUG(dbgs() << " final top unchanged\n");
2113  return OldTop;
2114  }
2115 
2116  // Walk backwards through any straight line of predecessors.
2117  while (BestPred->pred_size() == 1 &&
2118  (*BestPred->pred_begin())->succ_size() == 1 &&
2119  *BestPred->pred_begin() != L.getHeader())
2120  BestPred = *BestPred->pred_begin();
2121 
2122  LLVM_DEBUG(dbgs() << " final top: " << getBlockName(BestPred) << "\n");
2123  return BestPred;
2124 }
2125 
2126 /// Find the best loop top block for layout.
2127 ///
2128 /// This function iteratively calls findBestLoopTopHelper, until no new better
2129 /// BB can be found.
2131 MachineBlockPlacement::findBestLoopTop(const MachineLoop &L,
2132  const BlockFilterSet &LoopBlockSet) {
2133  // Placing the latch block before the header may introduce an extra branch
2134  // that skips this block the first time the loop is executed, which we want
2135  // to avoid when optimising for size.
2136  // FIXME: in theory there is a case that does not introduce a new branch,
2137  // i.e. when the layout predecessor does not fallthrough to the loop header.
2138  // In practice this never happens though: there always seems to be a preheader
2139  // that can fallthrough and that is also placed before the header.
2140  bool OptForSize = F->getFunction().hasOptSize() ||
2141  llvm::shouldOptimizeForSize(L.getHeader(), PSI, MBFI.get());
2142  if (OptForSize)
2143  return L.getHeader();
2144 
2145  MachineBasicBlock *OldTop = nullptr;
2146  MachineBasicBlock *NewTop = L.getHeader();
2147  while (NewTop != OldTop) {
2148  OldTop = NewTop;
2149  NewTop = findBestLoopTopHelper(OldTop, L, LoopBlockSet);
2150  if (NewTop != OldTop)
2151  ComputedEdges[NewTop] = { OldTop, false };
2152  }
2153  return NewTop;
2154 }
2155 
2156 /// Find the best loop exiting block for layout.
2157 ///
2158 /// This routine implements the logic to analyze the loop looking for the best
2159 /// block to layout at the top of the loop. Typically this is done to maximize
2160 /// fallthrough opportunities.
2162 MachineBlockPlacement::findBestLoopExit(const MachineLoop &L,
2163  const BlockFilterSet &LoopBlockSet,
2164  BlockFrequency &ExitFreq) {
2165  // We don't want to layout the loop linearly in all cases. If the loop header
2166  // is just a normal basic block in the loop, we want to look for what block
2167  // within the loop is the best one to layout at the top. However, if the loop
2168  // header has be pre-merged into a chain due to predecessors not having
2169  // analyzable branches, *and* the predecessor it is merged with is *not* part
2170  // of the loop, rotating the header into the middle of the loop will create
2171  // a non-contiguous range of blocks which is Very Bad. So start with the
2172  // header and only rotate if safe.
2173  BlockChain &HeaderChain = *BlockToChain[L.getHeader()];
2174  if (!LoopBlockSet.count(*HeaderChain.begin()))
2175  return nullptr;
2176 
2177  BlockFrequency BestExitEdgeFreq;
2178  unsigned BestExitLoopDepth = 0;
2179  MachineBasicBlock *ExitingBB = nullptr;
2180  // If there are exits to outer loops, loop rotation can severely limit
2181  // fallthrough opportunities unless it selects such an exit. Keep a set of
2182  // blocks where rotating to exit with that block will reach an outer loop.
2183  SmallPtrSet<MachineBasicBlock *, 4> BlocksExitingToOuterLoop;
2184 
2185  LLVM_DEBUG(dbgs() << "Finding best loop exit for: "
2186  << getBlockName(L.getHeader()) << "\n");
2187  for (MachineBasicBlock *MBB : L.getBlocks()) {
2188  BlockChain &Chain = *BlockToChain[MBB];
2189  // Ensure that this block is at the end of a chain; otherwise it could be
2190  // mid-way through an inner loop or a successor of an unanalyzable branch.
2191  if (MBB != *std::prev(Chain.end()))
2192  continue;
2193 
2194  // Now walk the successors. We need to establish whether this has a viable
2195  // exiting successor and whether it has a viable non-exiting successor.
2196  // We store the old exiting state and restore it if a viable looping
2197  // successor isn't found.
2198  MachineBasicBlock *OldExitingBB = ExitingBB;
2199  BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq;
2200  bool HasLoopingSucc = false;
2201  for (MachineBasicBlock *Succ : MBB->successors()) {
2202  if (Succ->isEHPad())
2203  continue;
2204  if (Succ == MBB)
2205  continue;
2206  BlockChain &SuccChain = *BlockToChain[Succ];
2207  // Don't split chains, either this chain or the successor's chain.
2208  if (&Chain == &SuccChain) {
2209  LLVM_DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
2210  << getBlockName(Succ) << " (chain conflict)\n");
2211  continue;
2212  }
2213 
2214  auto SuccProb = MBPI->getEdgeProbability(MBB, Succ);
2215  if (LoopBlockSet.count(Succ)) {
2216  LLVM_DEBUG(dbgs() << " looping: " << getBlockName(MBB) << " -> "
2217  << getBlockName(Succ) << " (" << SuccProb << ")\n");
2218  HasLoopingSucc = true;
2219  continue;
2220  }
2221 
2222  unsigned SuccLoopDepth = 0;
2223  if (MachineLoop *ExitLoop = MLI->getLoopFor(Succ)) {
2224  SuccLoopDepth = ExitLoop->getLoopDepth();
2225  if (ExitLoop->contains(&L))
2226  BlocksExitingToOuterLoop.insert(MBB);
2227  }
2228 
2229  BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(MBB) * SuccProb;
2230  LLVM_DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
2231  << getBlockName(Succ) << " [L:" << SuccLoopDepth
2232  << "] (";
2233  MBFI->printBlockFreq(dbgs(), ExitEdgeFreq) << ")\n");
2234  // Note that we bias this toward an existing layout successor to retain
2235  // incoming order in the absence of better information. The exit must have
2236  // a frequency higher than the current exit before we consider breaking
2237  // the layout.
2238  BranchProbability Bias(100 - ExitBlockBias, 100);
2239  if (!ExitingBB || SuccLoopDepth > BestExitLoopDepth ||
2240  ExitEdgeFreq > BestExitEdgeFreq ||
2241  (MBB->isLayoutSuccessor(Succ) &&
2242  !(ExitEdgeFreq < BestExitEdgeFreq * Bias))) {
2243  BestExitEdgeFreq = ExitEdgeFreq;
2244  ExitingBB = MBB;
2245  }
2246  }
2247 
2248  if (!HasLoopingSucc) {
2249  // Restore the old exiting state, no viable looping successor was found.
2250  ExitingBB = OldExitingBB;
2251  BestExitEdgeFreq = OldBestExitEdgeFreq;
2252  }
2253  }
2254  // Without a candidate exiting block or with only a single block in the
2255  // loop, just use the loop header to layout the loop.
2256  if (!ExitingBB) {
2257  LLVM_DEBUG(
2258  dbgs() << " No other candidate exit blocks, using loop header\n");
2259  return nullptr;
2260  }
2261  if (L.getNumBlocks() == 1) {
2262  LLVM_DEBUG(dbgs() << " Loop has 1 block, using loop header as exit\n");
2263  return nullptr;
2264  }
2265 
2266  // Also, if we have exit blocks which lead to outer loops but didn't select
2267  // one of them as the exiting block we are rotating toward, disable loop
2268  // rotation altogether.
2269  if (!BlocksExitingToOuterLoop.empty() &&
2270  !BlocksExitingToOuterLoop.count(ExitingBB))
2271  return nullptr;
2272 
2273  LLVM_DEBUG(dbgs() << " Best exiting block: " << getBlockName(ExitingBB)
2274  << "\n");
2275  ExitFreq = BestExitEdgeFreq;
2276  return ExitingBB;
2277 }
2278 
2279 /// Check if there is a fallthrough to loop header Top.
2280 ///
2281 /// 1. Look for a Pred that can be layout before Top.
2282 /// 2. Check if Top is the most possible successor of Pred.
2283 bool
2284 MachineBlockPlacement::hasViableTopFallthrough(
2285  const MachineBasicBlock *Top,
2286  const BlockFilterSet &LoopBlockSet) {
2287  for (MachineBasicBlock *Pred : Top->predecessors()) {
2288  BlockChain *PredChain = BlockToChain[Pred];
2289  if (!LoopBlockSet.count(Pred) &&
2290  (!PredChain || Pred == *std::prev(PredChain->end()))) {
2291  // Found a Pred block can be placed before Top.
2292  // Check if Top is the best successor of Pred.
2293  auto TopProb = MBPI->getEdgeProbability(Pred, Top);
2294  bool TopOK = true;
2295  for (MachineBasicBlock *Succ : Pred->successors()) {
2296  auto SuccProb = MBPI->getEdgeProbability(Pred, Succ);
2297  BlockChain *SuccChain = BlockToChain[Succ];
2298  // Check if Succ can be placed after Pred.
2299  // Succ should not be in any chain, or it is the head of some chain.
2300  if ((!SuccChain || Succ == *SuccChain->begin()) && SuccProb > TopProb) {
2301  TopOK = false;
2302  break;
2303  }
2304  }
2305  if (TopOK)
2306  return true;
2307  }
2308  }
2309  return false;
2310 }
2311 
2312 /// Attempt to rotate an exiting block to the bottom of the loop.
2313 ///
2314 /// Once we have built a chain, try to rotate it to line up the hot exit block
2315 /// with fallthrough out of the loop if doing so doesn't introduce unnecessary
2316 /// branches. For example, if the loop has fallthrough into its header and out
2317 /// of its bottom already, don't rotate it.
2318 void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
2319  const MachineBasicBlock *ExitingBB,
2320  BlockFrequency ExitFreq,
2321  const BlockFilterSet &LoopBlockSet) {
2322  if (!ExitingBB)
2323  return;
2324 
2325  MachineBasicBlock *Top = *LoopChain.begin();
2326  MachineBasicBlock *Bottom = *std::prev(LoopChain.end());
2327 
2328  // If ExitingBB is already the last one in a chain then nothing to do.
2329  if (Bottom == ExitingBB)
2330  return;
2331 
2332  // The entry block should always be the first BB in a function.
2333  if (Top->isEntryBlock())
2334  return;
2335 
2336  bool ViableTopFallthrough = hasViableTopFallthrough(Top, LoopBlockSet);
2337 
2338  // If the header has viable fallthrough, check whether the current loop
2339  // bottom is a viable exiting block. If so, bail out as rotating will
2340  // introduce an unnecessary branch.
2341  if (ViableTopFallthrough) {
2342  for (MachineBasicBlock *Succ : Bottom->successors()) {
2343  BlockChain *SuccChain = BlockToChain[Succ];
2344  if (!LoopBlockSet.count(Succ) &&
2345  (!SuccChain || Succ == *SuccChain->begin()))
2346  return;
2347  }
2348 
2349  // Rotate will destroy the top fallthrough, we need to ensure the new exit
2350  // frequency is larger than top fallthrough.
2351  BlockFrequency FallThrough2Top = TopFallThroughFreq(Top, LoopBlockSet);
2352  if (FallThrough2Top >= ExitFreq)
2353  return;
2354  }
2355 
2356  BlockChain::iterator ExitIt = llvm::find(LoopChain, ExitingBB);
2357  if (ExitIt == LoopChain.end())
2358  return;
2359 
2360  // Rotating a loop exit to the bottom when there is a fallthrough to top
2361  // trades the entry fallthrough for an exit fallthrough.
2362  // If there is no bottom->top edge, but the chosen exit block does have
2363  // a fallthrough, we break that fallthrough for nothing in return.
2364 
2365  // Let's consider an example. We have a built chain of basic blocks
2366  // B1, B2, ..., Bn, where Bk is a ExitingBB - chosen exit block.
2367  // By doing a rotation we get
2368  // Bk+1, ..., Bn, B1, ..., Bk
2369  // Break of fallthrough to B1 is compensated by a fallthrough from Bk.
2370  // If we had a fallthrough Bk -> Bk+1 it is broken now.
2371  // It might be compensated by fallthrough Bn -> B1.
2372  // So we have a condition to avoid creation of extra branch by loop rotation.
2373  // All below must be true to avoid loop rotation:
2374  // If there is a fallthrough to top (B1)
2375  // There was fallthrough from chosen exit block (Bk) to next one (Bk+1)
2376  // There is no fallthrough from bottom (Bn) to top (B1).
2377  // Please note that there is no exit fallthrough from Bn because we checked it
2378  // above.
2379  if (ViableTopFallthrough) {
2380  assert(std::next(ExitIt) != LoopChain.end() &&
2381  "Exit should not be last BB");
2382  MachineBasicBlock *NextBlockInChain = *std::next(ExitIt);
2383  if (ExitingBB->isSuccessor(NextBlockInChain))
2384  if (!Bottom->isSuccessor(Top))
2385  return;
2386  }
2387 
2388  LLVM_DEBUG(dbgs() << "Rotating loop to put exit " << getBlockName(ExitingBB)
2389  << " at bottom\n");
2390  std::rotate(LoopChain.begin(), std::next(ExitIt), LoopChain.end());
2391 }
2392 
2393 /// Attempt to rotate a loop based on profile data to reduce branch cost.
2394 ///
2395 /// With profile data, we can determine the cost in terms of missed fall through
2396 /// opportunities when rotating a loop chain and select the best rotation.
2397 /// Basically, there are three kinds of cost to consider for each rotation:
2398 /// 1. The possibly missed fall through edge (if it exists) from BB out of
2399 /// the loop to the loop header.
2400 /// 2. The possibly missed fall through edges (if they exist) from the loop
2401 /// exits to BB out of the loop.
2402 /// 3. The missed fall through edge (if it exists) from the last BB to the
2403 /// first BB in the loop chain.
2404 /// Therefore, the cost for a given rotation is the sum of costs listed above.
2405 /// We select the best rotation with the smallest cost.
2406 void MachineBlockPlacement::rotateLoopWithProfile(
2407  BlockChain &LoopChain, const MachineLoop &L,
2408  const BlockFilterSet &LoopBlockSet) {
2409  auto RotationPos = LoopChain.end();
2410  MachineBasicBlock *ChainHeaderBB = *LoopChain.begin();
2411 
2412  // The entry block should always be the first BB in a function.
2413  if (ChainHeaderBB->isEntryBlock())
2414  return;
2415 
2416  BlockFrequency SmallestRotationCost = BlockFrequency::getMaxFrequency();
2417 
2418  // A utility lambda that scales up a block frequency by dividing it by a
2419  // branch probability which is the reciprocal of the scale.
2420  auto ScaleBlockFrequency = [](BlockFrequency Freq,
2421  unsigned Scale) -> BlockFrequency {
2422  if (Scale == 0)
2423  return 0;
2424  // Use operator / between BlockFrequency and BranchProbability to implement
2425  // saturating multiplication.
2426  return Freq / BranchProbability(1, Scale);
2427  };
2428 
2429  // Compute the cost of the missed fall-through edge to the loop header if the
2430  // chain head is not the loop header. As we only consider natural loops with
2431  // single header, this computation can be done only once.
2432  BlockFrequency HeaderFallThroughCost(0);
2433  for (auto *Pred : ChainHeaderBB->predecessors()) {
2434  BlockChain *PredChain = BlockToChain[Pred];
2435  if (!LoopBlockSet.count(Pred) &&
2436  (!PredChain || Pred == *std::prev(PredChain->end()))) {
2437  auto EdgeFreq = MBFI->getBlockFreq(Pred) *
2438  MBPI->getEdgeProbability(Pred, ChainHeaderBB);
2439  auto FallThruCost = ScaleBlockFrequency(EdgeFreq, MisfetchCost);
2440  // If the predecessor has only an unconditional jump to the header, we
2441  // need to consider the cost of this jump.
2442  if (Pred->succ_size() == 1)
2443  FallThruCost += ScaleBlockFrequency(EdgeFreq, JumpInstCost);
2444  HeaderFallThroughCost = std::max(HeaderFallThroughCost, FallThruCost);
2445  }
2446  }
2447 
2448  // Here we collect all exit blocks in the loop, and for each exit we find out
2449  // its hottest exit edge. For each loop rotation, we define the loop exit cost
2450  // as the sum of frequencies of exit edges we collect here, excluding the exit
2451  // edge from the tail of the loop chain.
2453  for (auto BB : LoopChain) {
2454  auto LargestExitEdgeProb = BranchProbability::getZero();
2455  for (auto *Succ : BB->successors()) {
2456  BlockChain *SuccChain = BlockToChain[Succ];
2457  if (!LoopBlockSet.count(Succ) &&
2458  (!SuccChain || Succ == *SuccChain->begin())) {
2459  auto SuccProb = MBPI->getEdgeProbability(BB, Succ);
2460  LargestExitEdgeProb = std::max(LargestExitEdgeProb, SuccProb);
2461  }
2462  }
2463  if (LargestExitEdgeProb > BranchProbability::getZero()) {
2464  auto ExitFreq = MBFI->getBlockFreq(BB) * LargestExitEdgeProb;
2465  ExitsWithFreq.emplace_back(BB, ExitFreq);
2466  }
2467  }
2468 
2469  // In this loop we iterate every block in the loop chain and calculate the
2470  // cost assuming the block is the head of the loop chain. When the loop ends,
2471  // we should have found the best candidate as the loop chain's head.
2472  for (auto Iter = LoopChain.begin(), TailIter = std::prev(LoopChain.end()),
2473  EndIter = LoopChain.end();
2474  Iter != EndIter; Iter++, TailIter++) {
2475  // TailIter is used to track the tail of the loop chain if the block we are
2476  // checking (pointed by Iter) is the head of the chain.
2477  if (TailIter == LoopChain.end())
2478  TailIter = LoopChain.begin();
2479 
2480  auto TailBB = *TailIter;
2481 
2482  // Calculate the cost by putting this BB to the top.
2483  BlockFrequency Cost = 0;
2484 
2485  // If the current BB is the loop header, we need to take into account the
2486  // cost of the missed fall through edge from outside of the loop to the
2487  // header.
2488  if (Iter != LoopChain.begin())
2489  Cost += HeaderFallThroughCost;
2490 
2491  // Collect the loop exit cost by summing up frequencies of all exit edges
2492  // except the one from the chain tail.
2493  for (auto &ExitWithFreq : ExitsWithFreq)
2494  if (TailBB != ExitWithFreq.first)
2495  Cost += ExitWithFreq.second;
2496 
2497  // The cost of breaking the once fall-through edge from the tail to the top
2498  // of the loop chain. Here we need to consider three cases:
2499  // 1. If the tail node has only one successor, then we will get an
2500  // additional jmp instruction. So the cost here is (MisfetchCost +
2501  // JumpInstCost) * tail node frequency.
2502  // 2. If the tail node has two successors, then we may still get an
2503  // additional jmp instruction if the layout successor after the loop
2504  // chain is not its CFG successor. Note that the more frequently executed
2505  // jmp instruction will be put ahead of the other one. Assume the
2506  // frequency of those two branches are x and y, where x is the frequency
2507  // of the edge to the chain head, then the cost will be
2508  // (x * MisfetechCost + min(x, y) * JumpInstCost) * tail node frequency.
2509  // 3. If the tail node has more than two successors (this rarely happens),
2510  // we won't consider any additional cost.
2511  if (TailBB->isSuccessor(*Iter)) {
2512  auto TailBBFreq = MBFI->getBlockFreq(TailBB);
2513  if (TailBB->succ_size() == 1)
2514  Cost += ScaleBlockFrequency(TailBBFreq.getFrequency(),
2516  else if (TailBB->succ_size() == 2) {
2517  auto TailToHeadProb = MBPI->getEdgeProbability(TailBB, *Iter);
2518  auto TailToHeadFreq = TailBBFreq * TailToHeadProb;
2519  auto ColderEdgeFreq = TailToHeadProb > BranchProbability(1, 2)
2520  ? TailBBFreq * TailToHeadProb.getCompl()
2521  : TailToHeadFreq;
2522  Cost += ScaleBlockFrequency(TailToHeadFreq, MisfetchCost) +
2523  ScaleBlockFrequency(ColderEdgeFreq, JumpInstCost);
2524  }
2525  }
2526 
2527  LLVM_DEBUG(dbgs() << "The cost of loop rotation by making "
2528  << getBlockName(*Iter)
2529  << " to the top: " << Cost.getFrequency() << "\n");
2530 
2531  if (Cost < SmallestRotationCost) {
2532  SmallestRotationCost = Cost;
2533  RotationPos = Iter;
2534  }
2535  }
2536 
2537  if (RotationPos != LoopChain.end()) {
2538  LLVM_DEBUG(dbgs() << "Rotate loop by making " << getBlockName(*RotationPos)
2539  << " to the top\n");
2540  std::rotate(LoopChain.begin(), RotationPos, LoopChain.end());
2541  }
2542 }
2543 
2544 /// Collect blocks in the given loop that are to be placed.
2545 ///
2546 /// When profile data is available, exclude cold blocks from the returned set;
2547 /// otherwise, collect all blocks in the loop.
2549 MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) {
2550  BlockFilterSet LoopBlockSet;
2551 
2552  // Filter cold blocks off from LoopBlockSet when profile data is available.
2553  // Collect the sum of frequencies of incoming edges to the loop header from
2554  // outside. If we treat the loop as a super block, this is the frequency of
2555  // the loop. Then for each block in the loop, we calculate the ratio between
2556  // its frequency and the frequency of the loop block. When it is too small,
2557  // don't add it to the loop chain. If there are outer loops, then this block
2558  // will be merged into the first outer loop chain for which this block is not
2559  // cold anymore. This needs precise profile data and we only do this when
2560  // profile data is available.
2561  if (F->getFunction().hasProfileData() || ForceLoopColdBlock) {
2562  BlockFrequency LoopFreq(0);
2563  for (auto LoopPred : L.getHeader()->predecessors())
2564  if (!L.contains(LoopPred))
2565  LoopFreq += MBFI->getBlockFreq(LoopPred) *
2566  MBPI->getEdgeProbability(LoopPred, L.getHeader());
2567 
2568  for (MachineBasicBlock *LoopBB : L.getBlocks()) {
2569  if (LoopBlockSet.count(LoopBB))
2570  continue;
2571  auto Freq = MBFI->getBlockFreq(LoopBB).getFrequency();
2572  if (Freq == 0 || LoopFreq.getFrequency() / Freq > LoopToColdBlockRatio)
2573  continue;
2574  BlockChain *Chain = BlockToChain[LoopBB];
2575  for (MachineBasicBlock *ChainBB : *Chain)
2576  LoopBlockSet.insert(ChainBB);
2577  }
2578  } else
2579  LoopBlockSet.insert(L.block_begin(), L.block_end());
2580 
2581  return LoopBlockSet;
2582 }
2583 
2584 /// Forms basic block chains from the natural loop structures.
2585 ///
2586 /// These chains are designed to preserve the existing *structure* of the code
2587 /// as much as possible. We can then stitch the chains together in a way which
2588 /// both preserves the topological structure and minimizes taken conditional
2589 /// branches.
2590 void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {
2591  // First recurse through any nested loops, building chains for those inner
2592  // loops.
2593  for (const MachineLoop *InnerLoop : L)
2594  buildLoopChains(*InnerLoop);
2595 
2596  assert(BlockWorkList.empty() &&
2597  "BlockWorkList not empty when starting to build loop chains.");
2598  assert(EHPadWorkList.empty() &&
2599  "EHPadWorkList not empty when starting to build loop chains.");
2600  BlockFilterSet LoopBlockSet = collectLoopBlockSet(L);
2601 
2602  // Check if we have profile data for this function. If yes, we will rotate
2603  // this loop by modeling costs more precisely which requires the profile data
2604  // for better layout.
2605  bool RotateLoopWithProfile =
2607  (PreciseRotationCost && F->getFunction().hasProfileData());
2608 
2609  // First check to see if there is an obviously preferable top block for the
2610  // loop. This will default to the header, but may end up as one of the
2611  // predecessors to the header if there is one which will result in strictly
2612  // fewer branches in the loop body.
2613  MachineBasicBlock *LoopTop = findBestLoopTop(L, LoopBlockSet);
2614 
2615  // If we selected just the header for the loop top, look for a potentially
2616  // profitable exit block in the event that rotating the loop can eliminate
2617  // branches by placing an exit edge at the bottom.
2618  //
2619  // Loops are processed innermost to uttermost, make sure we clear
2620  // PreferredLoopExit before processing a new loop.
2621  PreferredLoopExit = nullptr;
2622  BlockFrequency ExitFreq;
2623  if (!RotateLoopWithProfile && LoopTop == L.getHeader())
2624  PreferredLoopExit = findBestLoopExit(L, LoopBlockSet, ExitFreq);
2625 
2626  BlockChain &LoopChain = *BlockToChain[LoopTop];
2627 
2628  // FIXME: This is a really lame way of walking the chains in the loop: we
2629  // walk the blocks, and use a set to prevent visiting a particular chain
2630  // twice.
2631  SmallPtrSet<BlockChain *, 4> UpdatedPreds;
2632  assert(LoopChain.UnscheduledPredecessors == 0 &&
2633  "LoopChain should not have unscheduled predecessors.");
2634  UpdatedPreds.insert(&LoopChain);
2635 
2636  for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2637  fillWorkLists(LoopBB, UpdatedPreds, &LoopBlockSet);
2638 
2639  buildChain(LoopTop, LoopChain, &LoopBlockSet);
2640 
2641  if (RotateLoopWithProfile)
2642  rotateLoopWithProfile(LoopChain, L, LoopBlockSet);
2643  else
2644  rotateLoop(LoopChain, PreferredLoopExit, ExitFreq, LoopBlockSet);
2645 
2646  LLVM_DEBUG({
2647  // Crash at the end so we get all of the debugging output first.
2648  bool BadLoop = false;
2649  if (LoopChain.UnscheduledPredecessors) {
2650  BadLoop = true;
2651  dbgs() << "Loop chain contains a block without its preds placed!\n"
2652  << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2653  << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n";
2654  }
2655  for (MachineBasicBlock *ChainBB : LoopChain) {
2656  dbgs() << " ... " << getBlockName(ChainBB) << "\n";
2657  if (!LoopBlockSet.remove(ChainBB)) {
2658  // We don't mark the loop as bad here because there are real situations
2659  // where this can occur. For example, with an unanalyzable fallthrough
2660  // from a loop block to a non-loop block or vice versa.
2661  dbgs() << "Loop chain contains a block not contained by the loop!\n"
2662  << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2663  << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2664  << " Bad block: " << getBlockName(ChainBB) << "\n";
2665  }
2666  }
2667 
2668  if (!LoopBlockSet.empty()) {
2669  BadLoop = true;
2670  for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2671  dbgs() << "Loop contains blocks never placed into a chain!\n"
2672  << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2673  << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2674  << " Bad block: " << getBlockName(LoopBB) << "\n";
2675  }
2676  assert(!BadLoop && "Detected problems with the placement of this loop.");
2677  });
2678 
2679  BlockWorkList.clear();
2680  EHPadWorkList.clear();
2681 }
2682 
2683 void MachineBlockPlacement::buildCFGChains() {
2684  // Ensure that every BB in the function has an associated chain to simplify
2685  // the assumptions of the remaining algorithm.
2686  SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
2687  for (MachineFunction::iterator FI = F->begin(), FE = F->end(); FI != FE;
2688  ++FI) {
2689  MachineBasicBlock *BB = &*FI;
2690  BlockChain *Chain =
2691  new (ChainAllocator.Allocate()) BlockChain(BlockToChain, BB);
2692  // Also, merge any blocks which we cannot reason about and must preserve
2693  // the exact fallthrough behavior for.
2694  while (true) {
2695  Cond.clear();
2696  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2697  if (!TII->analyzeBranch(*BB, TBB, FBB, Cond) || !FI->canFallThrough())
2698  break;
2699 
2700  MachineFunction::iterator NextFI = std::next(FI);
2701  MachineBasicBlock *NextBB = &*NextFI;
2702  // Ensure that the layout successor is a viable block, as we know that
2703  // fallthrough is a possibility.
2704  assert(NextFI != FE && "Can't fallthrough past the last block.");
2705  LLVM_DEBUG(dbgs() << "Pre-merging due to unanalyzable fallthrough: "
2706  << getBlockName(BB) << " -> " << getBlockName(NextBB)
2707  << "\n");
2708  Chain->merge(NextBB, nullptr);
2709 #ifndef NDEBUG
2710  BlocksWithUnanalyzableExits.insert(&*BB);
2711 #endif
2712  FI = NextFI;
2713  BB = NextBB;
2714  }
2715  }
2716 
2717  // Build any loop-based chains.
2718  PreferredLoopExit = nullptr;
2719  for (MachineLoop *L : *MLI)
2720  buildLoopChains(*L);
2721 
2722  assert(BlockWorkList.empty() &&
2723  "BlockWorkList should be empty before building final chain.");
2724  assert(EHPadWorkList.empty() &&
2725  "EHPadWorkList should be empty before building final chain.");
2726 
2727  SmallPtrSet<BlockChain *, 4> UpdatedPreds;
2728  for (MachineBasicBlock &MBB : *F)
2729  fillWorkLists(&MBB, UpdatedPreds);
2730 
2731  BlockChain &FunctionChain = *BlockToChain[&F->front()];
2732  buildChain(&F->front(), FunctionChain);
2733 
2734 #ifndef NDEBUG
2735  using FunctionBlockSetType = SmallPtrSet<MachineBasicBlock *, 16>;
2736 #endif
2737  LLVM_DEBUG({
2738  // Crash at the end so we get all of the debugging output first.
2739  bool BadFunc = false;
2740  FunctionBlockSetType FunctionBlockSet;
2741  for (MachineBasicBlock &MBB : *F)
2742  FunctionBlockSet.insert(&MBB);
2743 
2744  for (MachineBasicBlock *ChainBB : FunctionChain)
2745  if (!FunctionBlockSet.erase(ChainBB)) {
2746  BadFunc = true;
2747  dbgs() << "Function chain contains a block not in the function!\n"
2748  << " Bad block: " << getBlockName(ChainBB) << "\n";
2749  }
2750 
2751  if (!FunctionBlockSet.empty()) {
2752  BadFunc = true;
2753  for (MachineBasicBlock *RemainingBB : FunctionBlockSet)
2754  dbgs() << "Function contains blocks never placed into a chain!\n"
2755  << " Bad block: " << getBlockName(RemainingBB) << "\n";
2756  }
2757  assert(!BadFunc && "Detected problems with the block placement.");
2758  });
2759 
2760  // Remember original layout ordering, so we can update terminators after
2761  // reordering to point to the original layout successor.
2762  SmallVector<MachineBasicBlock *, 4> OriginalLayoutSuccessors(
2763  F->getNumBlockIDs());
2764  {
2765  MachineBasicBlock *LastMBB = nullptr;
2766  for (auto &MBB : *F) {
2767  if (LastMBB != nullptr)
2768  OriginalLayoutSuccessors[LastMBB->getNumber()] = &MBB;
2769  LastMBB = &MBB;
2770  }
2771  OriginalLayoutSuccessors[F->back().getNumber()] = nullptr;
2772  }
2773 
2774  // Splice the blocks into place.
2775  MachineFunction::iterator InsertPos = F->begin();
2776  LLVM_DEBUG(dbgs() << "[MBP] Function: " << F->getName() << "\n");
2777  for (MachineBasicBlock *ChainBB : FunctionChain) {
2778  LLVM_DEBUG(dbgs() << (ChainBB == *FunctionChain.begin() ? "Placing chain "
2779  : " ... ")
2780  << getBlockName(ChainBB) << "\n");
2781  if (InsertPos != MachineFunction::iterator(ChainBB))
2782  F->splice(InsertPos, ChainBB);
2783  else
2784  ++InsertPos;
2785 
2786  // Update the terminator of the previous block.
2787  if (ChainBB == *FunctionChain.begin())
2788  continue;
2789  MachineBasicBlock *PrevBB = &*std::prev(MachineFunction::iterator(ChainBB));
2790 
2791  // FIXME: It would be awesome of updateTerminator would just return rather
2792  // than assert when the branch cannot be analyzed in order to remove this
2793  // boiler plate.
2794  Cond.clear();
2795  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2796 
2797 #ifndef NDEBUG
2798  if (!BlocksWithUnanalyzableExits.count(PrevBB)) {
2799  // Given the exact block placement we chose, we may actually not _need_ to
2800  // be able to edit PrevBB's terminator sequence, but not being _able_ to
2801  // do that at this point is a bug.
2802  assert((!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond) ||
2803  !PrevBB->canFallThrough()) &&
2804  "Unexpected block with un-analyzable fallthrough!");
2805  Cond.clear();
2806  TBB = FBB = nullptr;
2807  }
2808 #endif
2809 
2810  // The "PrevBB" is not yet updated to reflect current code layout, so,
2811  // o. it may fall-through to a block without explicit "goto" instruction
2812  // before layout, and no longer fall-through it after layout; or
2813  // o. just opposite.
2814  //
2815  // analyzeBranch() may return erroneous value for FBB when these two
2816  // situations take place. For the first scenario FBB is mistakenly set NULL;
2817  // for the 2nd scenario, the FBB, which is expected to be NULL, is
2818  // mistakenly pointing to "*BI".
2819  // Thus, if the future change needs to use FBB before the layout is set, it
2820  // has to correct FBB first by using the code similar to the following:
2821  //
2822  // if (!Cond.empty() && (!FBB || FBB == ChainBB)) {
2823  // PrevBB->updateTerminator();
2824  // Cond.clear();
2825  // TBB = FBB = nullptr;
2826  // if (TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) {
2827  // // FIXME: This should never take place.
2828  // TBB = FBB = nullptr;
2829  // }
2830  // }
2831  if (!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) {
2832  PrevBB->updateTerminator(OriginalLayoutSuccessors[PrevBB->getNumber()]);
2833  }
2834  }
2835 
2836  // Fixup the last block.
2837  Cond.clear();
2838  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2839  if (!TII->analyzeBranch(F->back(), TBB, FBB, Cond)) {
2840  MachineBasicBlock *PrevBB = &F->back();
2841  PrevBB->updateTerminator(OriginalLayoutSuccessors[PrevBB->getNumber()]);
2842  }
2843 
2844  BlockWorkList.clear();
2845  EHPadWorkList.clear();
2846 }
2847 
2848 void MachineBlockPlacement::optimizeBranches() {
2849  BlockChain &FunctionChain = *BlockToChain[&F->front()];
2850  SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
2851 
2852  // Now that all the basic blocks in the chain have the proper layout,
2853  // make a final call to analyzeBranch with AllowModify set.
2854  // Indeed, the target may be able to optimize the branches in a way we
2855  // cannot because all branches may not be analyzable.
2856  // E.g., the target may be able to remove an unconditional branch to
2857  // a fallthrough when it occurs after predicated terminators.
2858  for (MachineBasicBlock *ChainBB : FunctionChain) {
2859  Cond.clear();
2860  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2861  if (!TII->analyzeBranch(*ChainBB, TBB, FBB, Cond, /*AllowModify*/ true)) {
2862  // If PrevBB has a two-way branch, try to re-order the branches
2863  // such that we branch to the successor with higher probability first.
2864  if (TBB && !Cond.empty() && FBB &&
2865  MBPI->getEdgeProbability(ChainBB, FBB) >
2866  MBPI->getEdgeProbability(ChainBB, TBB) &&
2868  LLVM_DEBUG(dbgs() << "Reverse order of the two branches: "
2869  << getBlockName(ChainBB) << "\n");
2870  LLVM_DEBUG(dbgs() << " Edge probability: "
2871  << MBPI->getEdgeProbability(ChainBB, FBB) << " vs "
2872  << MBPI->getEdgeProbability(ChainBB, TBB) << "\n");
2873  DebugLoc dl; // FIXME: this is nowhere
2874  TII->removeBranch(*ChainBB);
2875  TII->insertBranch(*ChainBB, FBB, TBB, Cond, dl);
2876  }
2877  }
2878  }
2879 }
2880 
2881 void MachineBlockPlacement::alignBlocks() {
2882  // Walk through the backedges of the function now that we have fully laid out
2883  // the basic blocks and align the destination of each backedge. We don't rely
2884  // exclusively on the loop info here so that we can align backedges in
2885  // unnatural CFGs and backedges that were introduced purely because of the
2886  // loop rotations done during this layout pass.
2887  if (F->getFunction().hasMinSize() ||
2888  (F->getFunction().hasOptSize() && !TLI->alignLoopsWithOptSize()))
2889  return;
2890  BlockChain &FunctionChain = *BlockToChain[&F->front()];
2891  if (FunctionChain.begin() == FunctionChain.end())
2892  return; // Empty chain.
2893 
2894  const BranchProbability ColdProb(1, 5); // 20%
2895  BlockFrequency EntryFreq = MBFI->getBlockFreq(&F->front());
2896  BlockFrequency WeightedEntryFreq = EntryFreq * ColdProb;
2897  for (MachineBasicBlock *ChainBB : FunctionChain) {
2898  if (ChainBB == *FunctionChain.begin())
2899  continue;
2900 
2901  // Don't align non-looping basic blocks. These are unlikely to execute
2902  // enough times to matter in practice. Note that we'll still handle
2903  // unnatural CFGs inside of a natural outer loop (the common case) and
2904  // rotated loops.
2905  MachineLoop *L = MLI->getLoopFor(ChainBB);
2906  if (!L)
2907  continue;
2908 
2909  const Align Align = TLI->getPrefLoopAlignment(L);
2910  if (Align == 1)
2911  continue; // Don't care about loop alignment.
2912 
2913  // If the block is cold relative to the function entry don't waste space
2914  // aligning it.
2915  BlockFrequency Freq = MBFI->getBlockFreq(ChainBB);
2916  if (Freq < WeightedEntryFreq)
2917  continue;
2918 
2919  // If the block is cold relative to its loop header, don't align it
2920  // regardless of what edges into the block exist.
2921  MachineBasicBlock *LoopHeader = L->getHeader();
2922  BlockFrequency LoopHeaderFreq = MBFI->getBlockFreq(LoopHeader);
2923  if (Freq < (LoopHeaderFreq * ColdProb))
2924  continue;
2925 
2926  // If the global profiles indicates so, don't align it.
2927  if (llvm::shouldOptimizeForSize(ChainBB, PSI, MBFI.get()) &&
2928  !TLI->alignLoopsWithOptSize())
2929  continue;
2930 
2931  // Check for the existence of a non-layout predecessor which would benefit
2932  // from aligning this block.
2933  MachineBasicBlock *LayoutPred =
2934  &*std::prev(MachineFunction::iterator(ChainBB));
2935 
2936  auto DetermineMaxAlignmentPadding = [&]() {
2937  // Set the maximum bytes allowed to be emitted for alignment.
2938  unsigned MaxBytes;
2939  if (MaxBytesForAlignmentOverride.getNumOccurrences() > 0)
2940  MaxBytes = MaxBytesForAlignmentOverride;
2941  else
2942  MaxBytes = TLI->getMaxPermittedBytesForAlignment(ChainBB);
2943  ChainBB->setMaxBytesForAlignment(MaxBytes);
2944  };
2945 
2946  // Force alignment if all the predecessors are jumps. We already checked
2947  // that the block isn't cold above.
2948  if (!LayoutPred->isSuccessor(ChainBB)) {
2949  ChainBB->setAlignment(Align);
2950  DetermineMaxAlignmentPadding();
2951  continue;
2952  }
2953 
2954  // Align this block if the layout predecessor's edge into this block is
2955  // cold relative to the block. When this is true, other predecessors make up
2956  // all of the hot entries into the block and thus alignment is likely to be
2957  // important.
2958  BranchProbability LayoutProb =
2959  MBPI->getEdgeProbability(LayoutPred, ChainBB);
2960  BlockFrequency LayoutEdgeFreq = MBFI->getBlockFreq(LayoutPred) * LayoutProb;
2961  if (LayoutEdgeFreq <= (Freq * ColdProb)) {
2962  ChainBB->setAlignment(Align);
2963  DetermineMaxAlignmentPadding();
2964  }
2965  }
2966 }
2967 
2968 /// Tail duplicate \p BB into (some) predecessors if profitable, repeating if
2969 /// it was duplicated into its chain predecessor and removed.
2970 /// \p BB - Basic block that may be duplicated.
2971 ///
2972 /// \p LPred - Chosen layout predecessor of \p BB.
2973 /// Updated to be the chain end if LPred is removed.
2974 /// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
2975 /// \p BlockFilter - Set of blocks that belong to the loop being laid out.
2976 /// Used to identify which blocks to update predecessor
2977 /// counts.
2978 /// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
2979 /// chosen in the given order due to unnatural CFG
2980 /// only needed if \p BB is removed and
2981 /// \p PrevUnplacedBlockIt pointed to \p BB.
2982 /// @return true if \p BB was removed.
2983 bool MachineBlockPlacement::repeatedlyTailDuplicateBlock(
2985  const MachineBasicBlock *LoopHeaderBB,
2986  BlockChain &Chain, BlockFilterSet *BlockFilter,
2987  MachineFunction::iterator &PrevUnplacedBlockIt) {
2988  bool Removed, DuplicatedToLPred;
2989  bool DuplicatedToOriginalLPred;
2990  Removed = maybeTailDuplicateBlock(BB, LPred, Chain, BlockFilter,
2991  PrevUnplacedBlockIt,
2992  DuplicatedToLPred);
2993  if (!Removed)
2994  return false;
2995  DuplicatedToOriginalLPred = DuplicatedToLPred;
2996  // Iteratively try to duplicate again. It can happen that a block that is
2997  // duplicated into is still small enough to be duplicated again.
2998  // No need to call markBlockSuccessors in this case, as the blocks being
2999  // duplicated from here on are already scheduled.
3000  while (DuplicatedToLPred && Removed) {
3001  MachineBasicBlock *DupBB, *DupPred;
3002  // The removal callback causes Chain.end() to be updated when a block is
3003  // removed. On the first pass through the loop, the chain end should be the
3004  // same as it was on function entry. On subsequent passes, because we are
3005  // duplicating the block at the end of the chain, if it is removed the
3006  // chain will have shrunk by one block.
3007  BlockChain::iterator ChainEnd = Chain.end();
3008  DupBB = *(--ChainEnd);
3009  // Now try to duplicate again.
3010  if (ChainEnd == Chain.begin())
3011  break;
3012  DupPred = *std::prev(ChainEnd);
3013  Removed = maybeTailDuplicateBlock(DupBB, DupPred, Chain, BlockFilter,
3014  PrevUnplacedBlockIt,
3015  DuplicatedToLPred);
3016  }
3017  // If BB was duplicated into LPred, it is now scheduled. But because it was
3018  // removed, markChainSuccessors won't be called for its chain. Instead we
3019  // call markBlockSuccessors for LPred to achieve the same effect. This must go
3020  // at the end because repeating the tail duplication can increase the number
3021  // of unscheduled predecessors.
3022  LPred = *std::prev(Chain.end());
3023  if (DuplicatedToOriginalLPred)
3024  markBlockSuccessors(Chain, LPred, LoopHeaderBB, BlockFilter);
3025  return true;
3026 }
3027 
3028 /// Tail duplicate \p BB into (some) predecessors if profitable.
3029 /// \p BB - Basic block that may be duplicated
3030 /// \p LPred - Chosen layout predecessor of \p BB
3031 /// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
3032 /// \p BlockFilter - Set of blocks that belong to the loop being laid out.
3033 /// Used to identify which blocks to update predecessor
3034 /// counts.
3035 /// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
3036 /// chosen in the given order due to unnatural CFG
3037 /// only needed if \p BB is removed and
3038 /// \p PrevUnplacedBlockIt pointed to \p BB.
3039 /// \p DuplicatedToLPred - True if the block was duplicated into LPred.
3040 /// \return - True if the block was duplicated into all preds and removed.
3041 bool MachineBlockPlacement::maybeTailDuplicateBlock(
3043  BlockChain &Chain, BlockFilterSet *BlockFilter,
3044  MachineFunction::iterator &PrevUnplacedBlockIt,
3045  bool &DuplicatedToLPred) {
3046  DuplicatedToLPred = false;
3047  if (!shouldTailDuplicate(BB))
3048  return false;
3049 
3050  LLVM_DEBUG(dbgs() << "Redoing tail duplication for Succ#" << BB->getNumber()
3051  << "\n");
3052 
3053  // This has to be a callback because none of it can be done after
3054  // BB is deleted.
3055  bool Removed = false;
3056  auto RemovalCallback =
3057  [&](MachineBasicBlock *RemBB) {
3058  // Signal to outer function
3059  Removed = true;
3060 
3061  // Conservative default.
3062  bool InWorkList = true;
3063  // Remove from the Chain and Chain Map
3064  if (BlockToChain.count(RemBB)) {
3065  BlockChain *Chain = BlockToChain[RemBB];
3066  InWorkList = Chain->UnscheduledPredecessors == 0;
3067  Chain->remove(RemBB);
3068  BlockToChain.erase(RemBB);
3069  }
3070 
3071  // Handle the unplaced block iterator
3072  if (&(*PrevUnplacedBlockIt) == RemBB) {
3073  PrevUnplacedBlockIt++;
3074  }
3075 
3076  // Handle the Work Lists
3077  if (InWorkList) {
3078  SmallVectorImpl<MachineBasicBlock *> &RemoveList = BlockWorkList;
3079  if (RemBB->isEHPad())
3080  RemoveList = EHPadWorkList;
3081  llvm::erase_value(RemoveList, RemBB);
3082  }
3083 
3084  // Handle the filter set
3085  if (BlockFilter) {
3086  BlockFilter->remove(RemBB);
3087  }
3088 
3089  // Remove the block from loop info.
3090  MLI->removeBlock(RemBB);
3091  if (RemBB == PreferredLoopExit)
3092  PreferredLoopExit = nullptr;
3093 
3094  LLVM_DEBUG(dbgs() << "TailDuplicator deleted block: "
3095  << getBlockName(RemBB) << "\n");
3096  };
3097  auto RemovalCallbackRef =
3098  function_ref<void(MachineBasicBlock*)>(RemovalCallback);
3099 
3100  SmallVector<MachineBasicBlock *, 8> DuplicatedPreds;
3101  bool IsSimple = TailDup.isSimpleBB(BB);
3102  SmallVector<MachineBasicBlock *, 8> CandidatePreds;
3103  SmallVectorImpl<MachineBasicBlock *> *CandidatePtr = nullptr;
3104  if (F->getFunction().hasProfileData()) {
3105  // We can do partial duplication with precise profile information.
3106  findDuplicateCandidates(CandidatePreds, BB, BlockFilter);
3107  if (CandidatePreds.size() == 0)
3108  return false;
3109  if (CandidatePreds.size() < BB->pred_size())
3110  CandidatePtr = &CandidatePreds;
3111  }
3112  TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred, &DuplicatedPreds,
3113  &RemovalCallbackRef, CandidatePtr);
3114 
3115  // Update UnscheduledPredecessors to reflect tail-duplication.
3116  DuplicatedToLPred = false;
3117  for (MachineBasicBlock *Pred : DuplicatedPreds) {
3118  // We're only looking for unscheduled predecessors that match the filter.
3119  BlockChain* PredChain = BlockToChain[Pred];
3120  if (Pred == LPred)
3121  DuplicatedToLPred = true;
3122  if (Pred == LPred || (BlockFilter && !BlockFilter->count(Pred))
3123  || PredChain == &Chain)
3124  continue;
3125  for (MachineBasicBlock *NewSucc : Pred->successors()) {
3126  if (BlockFilter && !BlockFilter->count(NewSucc))
3127  continue;
3128  BlockChain *NewChain = BlockToChain[NewSucc];
3129  if (NewChain != &Chain && NewChain != PredChain)
3130  NewChain->UnscheduledPredecessors++;
3131  }
3132  }
3133  return Removed;
3134 }
3135 
3136 // Count the number of actual machine instructions.
3138  uint64_t InstrCount = 0;
3139  for (MachineInstr &MI : *MBB) {
3140  if (!MI.isPHI() && !MI.isMetaInstruction())
3141  InstrCount += 1;
3142  }
3143  return InstrCount;
3144 }
3145 
3146 // The size cost of duplication is the instruction size of the duplicated block.
3147 // So we should scale the threshold accordingly. But the instruction size is not
3148 // available on all targets, so we use the number of instructions instead.
3149 BlockFrequency MachineBlockPlacement::scaleThreshold(MachineBasicBlock *BB) {
3150  return DupThreshold.getFrequency() * countMBBInstruction(BB);
3151 }
3152 
3153 // Returns true if BB is Pred's best successor.
3154 bool MachineBlockPlacement::isBestSuccessor(MachineBasicBlock *BB,
3155  MachineBasicBlock *Pred,
3156  BlockFilterSet *BlockFilter) {
3157  if (BB == Pred)
3158  return false;
3159  if (BlockFilter && !BlockFilter->count(Pred))
3160  return false;
3161  BlockChain *PredChain = BlockToChain[Pred];
3162  if (PredChain && (Pred != *std::prev(PredChain->end())))
3163  return false;
3164 
3165  // Find the successor with largest probability excluding BB.
3167  for (MachineBasicBlock *Succ : Pred->successors())
3168  if (Succ != BB) {
3169  if (BlockFilter && !BlockFilter->count(Succ))
3170  continue;
3171  BlockChain *SuccChain = BlockToChain[Succ];
3172  if (SuccChain && (Succ != *SuccChain->begin()))
3173  continue;
3174  BranchProbability SuccProb = MBPI->getEdgeProbability(Pred, Succ);
3175  if (SuccProb > BestProb)
3176  BestProb = SuccProb;
3177  }
3178 
3179  BranchProbability BBProb = MBPI->getEdgeProbability(Pred, BB);
3180  if (BBProb <= BestProb)
3181  return false;
3182 
3183  // Compute the number of reduced taken branches if Pred falls through to BB
3184  // instead of another successor. Then compare it with threshold.
3185  BlockFrequency PredFreq = getBlockCountOrFrequency(Pred);
3186  BlockFrequency Gain = PredFreq * (BBProb - BestProb);
3187  return Gain > scaleThreshold(BB);
3188 }
3189 
3190 // Find out the predecessors of BB and BB can be beneficially duplicated into
3191 // them.
3192 void MachineBlockPlacement::findDuplicateCandidates(
3195  BlockFilterSet *BlockFilter) {
3196  MachineBasicBlock *Fallthrough = nullptr;
3197  BranchProbability DefaultBranchProb = BranchProbability::getZero();
3198  BlockFrequency BBDupThreshold(scaleThreshold(BB));
3199  SmallVector<MachineBasicBlock *, 8> Preds(BB->predecessors());
3200  SmallVector<MachineBasicBlock *, 8> Succs(BB->successors());
3201 
3202  // Sort for highest frequency.
3203  auto CmpSucc = [&](MachineBasicBlock *A, MachineBasicBlock *B) {
3204  return MBPI->getEdgeProbability(BB, A) > MBPI->getEdgeProbability(BB, B);
3205  };
3206  auto CmpPred = [&](MachineBasicBlock *A, MachineBasicBlock *B) {
3207  return MBFI->getBlockFreq(A) > MBFI->getBlockFreq(B);
3208  };
3209  llvm::stable_sort(Succs, CmpSucc);
3210  llvm::stable_sort(Preds, CmpPred);
3211 
3212  auto SuccIt = Succs.begin();
3213  if (SuccIt != Succs.end()) {
3214  DefaultBranchProb = MBPI->getEdgeProbability(BB, *SuccIt).getCompl();
3215  }
3216 
3217  // For each predecessors of BB, compute the benefit of duplicating BB,
3218  // if it is larger than the threshold, add it into Candidates.
3219  //
3220  // If we have following control flow.
3221  //
3222  // PB1 PB2 PB3 PB4
3223  // \ | / /\
3224  // \ | / / \
3225  // \ |/ / \
3226  // BB----/ OB
3227  // /\
3228  // / \
3229  // SB1 SB2
3230  //
3231  // And it can be partially duplicated as
3232  //
3233  // PB2+BB
3234  // | PB1 PB3 PB4
3235  // | | / /\
3236  // | | / / \
3237  // | |/ / \
3238  // | BB----/ OB
3239  // |\ /|
3240  // | X |
3241  // |/ \|
3242  // SB2 SB1
3243  //
3244  // The benefit of duplicating into a predecessor is defined as
3245  // Orig_taken_branch - Duplicated_taken_branch
3246  //
3247  // The Orig_taken_branch is computed with the assumption that predecessor
3248  // jumps to BB and the most possible successor is laid out after BB.
3249  //
3250  // The Duplicated_taken_branch is computed with the assumption that BB is
3251  // duplicated into PB, and one successor is layout after it (SB1 for PB1 and
3252  // SB2 for PB2 in our case). If there is no available successor, the combined
3253  // block jumps to all BB's successor, like PB3 in this example.
3254  //
3255  // If a predecessor has multiple successors, so BB can't be duplicated into
3256  // it. But it can beneficially fall through to BB, and duplicate BB into other
3257  // predecessors.
3258  for (MachineBasicBlock *Pred : Preds) {
3259  BlockFrequency PredFreq = getBlockCountOrFrequency(Pred);
3260 
3261  if (!TailDup.canTailDuplicate(BB, Pred)) {
3262  // BB can't be duplicated into Pred, but it is possible to be layout
3263  // below Pred.
3264  if (!Fallthrough && isBestSuccessor(BB, Pred, BlockFilter)) {
3265  Fallthrough = Pred;
3266  if (SuccIt != Succs.end())
3267  SuccIt++;
3268  }
3269  continue;
3270  }
3271 
3272  BlockFrequency OrigCost = PredFreq + PredFreq * DefaultBranchProb;
3273  BlockFrequency DupCost;
3274  if (SuccIt == Succs.end()) {
3275  // Jump to all successors;
3276  if (Succs.size() > 0)
3277  DupCost += PredFreq;
3278  } else {
3279  // Fallthrough to *SuccIt, jump to all other successors;
3280  DupCost += PredFreq;
3281  DupCost -= PredFreq * MBPI->getEdgeProbability(BB, *SuccIt);
3282  }
3283 
3284  assert(OrigCost >= DupCost);
3285  OrigCost -= DupCost;
3286  if (OrigCost > BBDupThreshold) {
3287  Candidates.push_back(Pred);
3288  if (SuccIt != Succs.end())
3289  SuccIt++;
3290  }
3291  }
3292 
3293  // No predecessors can optimally fallthrough to BB.
3294  // So we can change one duplication into fallthrough.
3295  if (!Fallthrough) {
3296  if ((Candidates.size() < Preds.size()) && (Candidates.size() > 0)) {
3297  Candidates[0] = Candidates.back();
3298  Candidates.pop_back();
3299  }
3300  }
3301 }
3302 
3303 void MachineBlockPlacement::initDupThreshold() {
3304  DupThreshold = 0;
3305  if (!F->getFunction().hasProfileData())
3306  return;
3307 
3308  // We prefer to use prifile count.
3309  uint64_t HotThreshold = PSI->getOrCompHotCountThreshold();
3310  if (HotThreshold != UINT64_MAX) {
3311  UseProfileCount = true;
3312  DupThreshold = HotThreshold * TailDupProfilePercentThreshold / 100;
3313  return;
3314  }
3315 
3316  // Profile count is not available, we can use block frequency instead.
3317  BlockFrequency MaxFreq = 0;
3318  for (MachineBasicBlock &MBB : *F) {
3319  BlockFrequency Freq = MBFI->getBlockFreq(&MBB);
3320  if (Freq > MaxFreq)
3321  MaxFreq = Freq;
3322  }
3323 
3324  BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
3325  DupThreshold = MaxFreq * ThresholdProb;
3326  UseProfileCount = false;
3327 }
3328 
3329 bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
3330  if (skipFunction(MF.getFunction()))
3331  return false;
3332 
3333  // Check for single-block functions and skip them.
3334  if (std::next(MF.begin()) == MF.end())
3335  return false;
3336 
3337  F = &MF;
3338  MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
3339  MBFI = std::make_unique<MBFIWrapper>(
3340  getAnalysis<MachineBlockFrequencyInfo>());
3341  MLI = &getAnalysis<MachineLoopInfo>();
3342  TII = MF.getSubtarget().getInstrInfo();
3343  TLI = MF.getSubtarget().getTargetLowering();
3344  MPDT = nullptr;
3345  PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
3346 
3347  initDupThreshold();
3348 
3349  // Initialize PreferredLoopExit to nullptr here since it may never be set if
3350  // there are no MachineLoops.
3351  PreferredLoopExit = nullptr;
3352 
3353  assert(BlockToChain.empty() &&
3354  "BlockToChain map should be empty before starting placement.");
3355  assert(ComputedEdges.empty() &&
3356  "Computed Edge map should be empty before starting placement.");
3357 
3358  unsigned TailDupSize = TailDupPlacementThreshold;
3359  // If only the aggressive threshold is explicitly set, use it.
3360  if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 &&
3361  TailDupPlacementThreshold.getNumOccurrences() == 0)
3363 
3364  TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
3365  // For aggressive optimization, we can adjust some thresholds to be less
3366  // conservative.
3367  if (PassConfig->getOptLevel() >= CodeGenOpt::Aggressive) {
3368  // At O3 we should be more willing to copy blocks for tail duplication. This
3369  // increases size pressure, so we only do it at O3
3370  // Do this unless only the regular threshold is explicitly set.
3371  if (TailDupPlacementThreshold.getNumOccurrences() == 0 ||
3372  TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0)
3374  }
3375 
3376  // If there's no threshold provided through options, query the target
3377  // information for a threshold instead.
3378  if (TailDupPlacementThreshold.getNumOccurrences() == 0 &&
3379  (PassConfig->getOptLevel() < CodeGenOpt::Aggressive ||
3380  TailDupPlacementAggressiveThreshold.getNumOccurrences() == 0))
3381  TailDupSize = TII->getTailDuplicateSize(PassConfig->getOptLevel());
3382 
3383  if (allowTailDupPlacement()) {
3384  MPDT = &getAnalysis<MachinePostDominatorTree>();
3385  bool OptForSize = MF.getFunction().hasOptSize() ||
3386  llvm::shouldOptimizeForSize(&MF, PSI, &MBFI->getMBFI());
3387  if (OptForSize)
3388  TailDupSize = 1;
3389  bool PreRegAlloc = false;
3390  TailDup.initMF(MF, PreRegAlloc, MBPI, MBFI.get(), PSI,
3391  /* LayoutMode */ true, TailDupSize);
3392  precomputeTriangleChains();
3393  }
3394 
3395  buildCFGChains();
3396 
3397  // Changing the layout can create new tail merging opportunities.
3398  // TailMerge can create jump into if branches that make CFG irreducible for
3399  // HW that requires structured CFG.
3400  bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
3401  PassConfig->getEnableTailMerge() &&
3403  // No tail merging opportunities if the block number is less than four.
3404  if (MF.size() > 3 && EnableTailMerge) {
3405  unsigned TailMergeSize = TailDupSize + 1;
3406  BranchFolder BF(/*DefaultEnableTailMerge=*/true, /*CommonHoist=*/false,
3407  *MBFI, *MBPI, PSI, TailMergeSize);
3408 
3409  if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(), MLI,
3410  /*AfterPlacement=*/true)) {
3411  // Redo the layout if tail merging creates/removes/moves blocks.
3412  BlockToChain.clear();
3413  ComputedEdges.clear();
3414  // Must redo the post-dominator tree if blocks were changed.
3415  if (MPDT)
3416  MPDT->runOnMachineFunction(MF);
3417  ChainAllocator.DestroyAll();
3418  buildCFGChains();
3419  }
3420  }
3421 
3422  // Apply a post-processing optimizing block placement.
3423  if (MF.size() >= 3 && EnableExtTspBlockPlacement &&
3425  // Find a new placement and modify the layout of the blocks in the function.
3426  applyExtTsp();
3427 
3428  // Re-create CFG chain so that we can optimizeBranches and alignBlocks.
3429  createCFGChainExtTsp();
3430  }
3431 
3432  optimizeBranches();
3433  alignBlocks();
3434 
3435  BlockToChain.clear();
3436  ComputedEdges.clear();
3437  ChainAllocator.DestroyAll();
3438 
3439  bool HasMaxBytesOverride =
3440  MaxBytesForAlignmentOverride.getNumOccurrences() > 0;
3441 
3442  if (AlignAllBlock)
3443  // Align all of the blocks in the function to a specific alignment.
3444  for (MachineBasicBlock &MBB : MF) {
3445  if (HasMaxBytesOverride)
3448  else
3449  MBB.setAlignment(Align(1ULL << AlignAllBlock));
3450  }
3451  else if (AlignAllNonFallThruBlocks) {
3452  // Align all of the blocks that have no fall-through predecessors to a
3453  // specific alignment.
3454  for (auto MBI = std::next(MF.begin()), MBE = MF.end(); MBI != MBE; ++MBI) {
3455  auto LayoutPred = std::prev(MBI);
3456  if (!LayoutPred->isSuccessor(&*MBI)) {
3457  if (HasMaxBytesOverride)
3458  MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks),
3460  else
3461  MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks));
3462  }
3463  }
3464  }
3466  (ViewBlockFreqFuncName.empty() ||
3467  F->getFunction().getName().equals(ViewBlockFreqFuncName))) {
3468  MBFI->view("MBP." + MF.getName(), false);
3469  }
3470 
3471  // We always return true as we have no way to track whether the final order
3472  // differs from the original order.
3473  return true;
3474 }
3475 
3476 void MachineBlockPlacement::applyExtTsp() {
3477  // Prepare data; blocks are indexed by their index in the current ordering.
3479  BlockIndex.reserve(F->size());
3480  std::vector<const MachineBasicBlock *> CurrentBlockOrder;
3481  CurrentBlockOrder.reserve(F->size());
3482  size_t NumBlocks = 0;
3483  for (const MachineBasicBlock &MBB : *F) {
3484  BlockIndex[&MBB] = NumBlocks++;
3485  CurrentBlockOrder.push_back(&MBB);
3486  }
3487 
3488  auto BlockSizes = std::vector<uint64_t>(F->size());
3489  auto BlockCounts = std::vector<uint64_t>(F->size());
3491  for (MachineBasicBlock &MBB : *F) {
3492  // Getting the block frequency.
3493  BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
3494  BlockCounts[BlockIndex[&MBB]] = BlockFreq.getFrequency();
3495  // Getting the block size:
3496  // - approximate the size of an instruction by 4 bytes, and
3497  // - ignore debug instructions.
3498  // Note: getting the exact size of each block is target-dependent and can be
3499  // done by extending the interface of MCCodeEmitter. Experimentally we do
3500  // not see a perf improvement with the exact block sizes.
3501  auto NonDbgInsts =
3503  int NumInsts = std::distance(NonDbgInsts.begin(), NonDbgInsts.end());
3504  BlockSizes[BlockIndex[&MBB]] = 4 * NumInsts;
3505  // Getting jump frequencies.
3506  for (MachineBasicBlock *Succ : MBB.successors()) {
3507  auto EP = MBPI->getEdgeProbability(&MBB, Succ);
3508  BlockFrequency EdgeFreq = BlockFreq * EP;
3509  auto Edge = std::make_pair(BlockIndex[&MBB], BlockIndex[Succ]);
3510  JumpCounts[Edge] = EdgeFreq.getFrequency();
3511  }
3512  }
3513 
3514  LLVM_DEBUG(dbgs() << "Applying ext-tsp layout for |V| = " << F->size()
3515  << " with profile = " << F->getFunction().hasProfileData()
3516  << " (" << F->getName().str() << ")"
3517  << "\n");
3518  LLVM_DEBUG(
3519  dbgs() << format(" original layout score: %0.2f\n",
3520  calcExtTspScore(BlockSizes, BlockCounts, JumpCounts)));
3521 
3522  // Run the layout algorithm.
3523  auto NewOrder = applyExtTspLayout(BlockSizes, BlockCounts, JumpCounts);
3524  std::vector<const MachineBasicBlock *> NewBlockOrder;
3525  NewBlockOrder.reserve(F->size());
3526  for (uint64_t Node : NewOrder) {
3527  NewBlockOrder.push_back(CurrentBlockOrder[Node]);
3528  }
3529  LLVM_DEBUG(dbgs() << format(" optimized layout score: %0.2f\n",
3530  calcExtTspScore(NewOrder, BlockSizes, BlockCounts,
3531  JumpCounts)));
3532 
3533  // Assign new block order.
3534  assignBlockOrder(NewBlockOrder);
3535 }
3536 
3537 void MachineBlockPlacement::assignBlockOrder(
3538  const std::vector<const MachineBasicBlock *> &NewBlockOrder) {
3539  assert(F->size() == NewBlockOrder.size() && "Incorrect size of block order");
3540  F->RenumberBlocks();
3541 
3542  bool HasChanges = false;
3543  for (size_t I = 0; I < NewBlockOrder.size(); I++) {
3544  if (NewBlockOrder[I] != F->getBlockNumbered(I)) {
3545  HasChanges = true;
3546  break;
3547  }
3548  }
3549  // Stop early if the new block order is identical to the existing one.
3550  if (!HasChanges)
3551  return;
3552 
3553  SmallVector<MachineBasicBlock *, 4> PrevFallThroughs(F->getNumBlockIDs());
3554  for (auto &MBB : *F) {
3555  PrevFallThroughs[MBB.getNumber()] = MBB.getFallThrough();
3556  }
3557 
3558  // Sort basic blocks in the function according to the computed order.
3560  for (const MachineBasicBlock *MBB : NewBlockOrder) {
3561  NewIndex[MBB] = NewIndex.size();
3562  }
3563  F->sort([&](MachineBasicBlock &L, MachineBasicBlock &R) {
3564  return NewIndex[&L] < NewIndex[&R];
3565  });
3566 
3567  // Update basic block branches by inserting explicit fallthrough branches
3568  // when required and re-optimize branches when possible.
3569  const TargetInstrInfo *TII = F->getSubtarget().getInstrInfo();
3571  for (auto &MBB : *F) {
3572  MachineFunction::iterator NextMBB = std::next(MBB.getIterator());
3574  auto *FTMBB = PrevFallThroughs[MBB.getNumber()];
3575  // If this block had a fallthrough before we need an explicit unconditional
3576  // branch to that block if the fallthrough block is not adjacent to the
3577  // block in the new order.
3578  if (FTMBB && (NextMBB == EndIt || &*NextMBB != FTMBB)) {
3579  TII->insertUnconditionalBranch(MBB, FTMBB, MBB.findBranchDebugLoc());
3580  }
3581 
3582  // It might be possible to optimize branches by flipping the condition.
3583  Cond.clear();
3584  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
3585  if (TII->analyzeBranch(MBB, TBB, FBB, Cond))
3586  continue;
3587  MBB.updateTerminator(FTMBB);
3588  }
3589 
3590 #ifndef NDEBUG
3591  // Make sure we correctly constructed all branches.
3592  F->verify(this, "After optimized block reordering");
3593 #endif
3594 }
3595 
3596 void MachineBlockPlacement::createCFGChainExtTsp() {
3597  BlockToChain.clear();
3598  ComputedEdges.clear();
3599  ChainAllocator.DestroyAll();
3600 
3601  MachineBasicBlock *HeadBB = &F->front();
3602  BlockChain *FunctionChain =
3603  new (ChainAllocator.Allocate()) BlockChain(BlockToChain, HeadBB);
3604 
3605  for (MachineBasicBlock &MBB : *F) {
3606  if (HeadBB == &MBB)
3607  continue; // Ignore head of the chain
3608  FunctionChain->merge(&MBB, nullptr);
3609  }
3610 }
3611 
3612 namespace {
3613 
3614 /// A pass to compute block placement statistics.
3615 ///
3616 /// A separate pass to compute interesting statistics for evaluating block
3617 /// placement. This is separate from the actual placement pass so that they can
3618 /// be computed in the absence of any placement transformations or when using
3619 /// alternative placement strategies.
3620 class MachineBlockPlacementStats : public MachineFunctionPass {
3621  /// A handle to the branch probability pass.
3622  const MachineBranchProbabilityInfo *MBPI;
3623 
3624  /// A handle to the function-wide block frequency pass.
3625  const MachineBlockFrequencyInfo *MBFI;
3626 
3627 public:
3628  static char ID; // Pass identification, replacement for typeid
3629 
3630  MachineBlockPlacementStats() : MachineFunctionPass(ID) {
3632  }
3633 
3634  bool runOnMachineFunction(MachineFunction &F) override;
3635 
3636  void getAnalysisUsage(AnalysisUsage &AU) const override {
3639  AU.setPreservesAll();
3641  }
3642 };
3643 
3644 } // end anonymous namespace
3645 
3647 
3649 
3650 INITIALIZE_PASS_BEGIN(MachineBlockPlacementStats, "block-placement-stats",
3651  "Basic Block Placement Stats", false, false)
3654 INITIALIZE_PASS_END(MachineBlockPlacementStats, "block-placement-stats",
3655  "Basic Block Placement Stats", false, false)
3656 
3657 bool MachineBlockPlacementStats::runOnMachineFunction(MachineFunction &F) {
3658  // Check for single-block functions and skip them.
3659  if (std::next(F.begin()) == F.end())
3660  return false;
3661 
3662  MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
3663  MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
3664 
3665  for (MachineBasicBlock &MBB : F) {
3666  BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
3667  Statistic &NumBranches =
3668  (MBB.succ_size() > 1) ? NumCondBranches : NumUncondBranches;
3669  Statistic &BranchTakenFreq =
3670  (MBB.succ_size() > 1) ? CondBranchTakenFreq : UncondBranchTakenFreq;
3671  for (MachineBasicBlock *Succ : MBB.successors()) {
3672  // Skip if this successor is a fallthrough.
3673  if (MBB.isLayoutSuccessor(Succ))
3674  continue;
3675 
3676  BlockFrequency EdgeFreq =
3677  BlockFreq * MBPI->getEdgeProbability(&MBB, Succ);
3678  ++NumBranches;
3679  BranchTakenFreq += EdgeFreq.getFrequency();
3680  }
3681  }
3682 
3683  return false;
3684 }
llvm::TargetMachine::requiresStructuredCFG
bool requiresStructuredCFG() const
Definition: TargetMachine.h:216
llvm::BranchFolder
Definition: BranchFolding.h:31
i
i
Definition: README.txt:29
block
we get the following basic block
Definition: README_ALTIVEC.txt:95
llvm::MachineBasicBlock::succ_size
unsigned succ_size() const
Definition: MachineBasicBlock.h:353
llvm::MachineBasicBlock::pred_begin
pred_iterator pred_begin()
Definition: MachineBasicBlock.h:325
MI
IRTranslator LLVM IR MI
Definition: IRTranslator.cpp:104
LLVM_DUMP_METHOD
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:494
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:17
llvm::SmallVectorImpl::erase
iterator erase(const_iterator CI)
Definition: SmallVector.h:724
llvm::TargetLoweringBase
This base class for TargetLowering contains the SelectionDAG-independent parts that can be used from ...
Definition: TargetLowering.h:191
llvm::TailDuplicator::initMF
void initMF(MachineFunction &MF, bool PreRegAlloc, const MachineBranchProbabilityInfo *MBPI, MBFIWrapper *MBFI, ProfileSummaryInfo *PSI, bool LayoutMode, unsigned TailDupSize=0)
Prepare to run on a specific machine function.
Definition: TailDuplicator.cpp:79
llvm::MachineLoopInfo::getLoopFor
MachineLoop * getLoopFor(const MachineBasicBlock *BB) const
Return the innermost loop that BB lives in.
Definition: MachineLoopInfo.h:126
llvm::HexagonInstrInfo::removeBranch
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
Remove the branching code at the end of the specific MBB.
Definition: HexagonInstrInfo.cpp:602
llvm::DenseMapBase< DenseMap< KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::lookup
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:199
llvm::BranchProbability::getNumerator
uint32_t getNumerator() const
Definition: BranchProbability.h:64
P
This currently compiles esp xmm0 movsd esp eax eax esp ret We should use not the dag combiner This is because dagcombine2 needs to be able to see through the X86ISD::Wrapper which DAGCombine can t really do The code for turning x load into a single vector load is target independent and should be moved to the dag combiner The code for turning x load into a vector load can only handle a direct load from a global or a direct load from the stack It should be generalized to handle any load from P
Definition: README-SSE.txt:411
Pass.h
llvm::LoopBase::contains
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
Definition: LoopInfo.h:122
llvm::TargetSubtargetInfo::getInstrInfo
virtual const TargetInstrInfo * getInstrInfo() const
Definition: TargetSubtargetInfo.h:93
llvm::HexagonInstrInfo::analyzeBranch
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const override
Analyze the branching code at the end of MBB, returning true if it cannot be understood (e....
Definition: HexagonInstrInfo.cpp:432
llvm::raw_string_ostream
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:632
getAdjustedProbability
static BranchProbability getAdjustedProbability(BranchProbability OrigProb, BranchProbability AdjustedSumProb)
The helper function returns the branch probability that is adjusted or normalized over the new total ...
Definition: MachineBlockPlacement.cpp:737
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1185
Statistic.h
llvm::MachineFunction::end
iterator end()
Definition: MachineFunction.h:824
llvm::ProfileSummaryInfo::getOrCompHotCountThreshold
uint64_t getOrCompHotCountThreshold() const
Returns HotCountThreshold if set.
Definition: ProfileSummaryInfo.cpp:317
llvm::printMBBReference
Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
Definition: MachineBasicBlock.cpp:116
Allocator.h
llvm::erase_if
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition: STLExtras.h:1797
llvm::MachineBasicBlock::isEntryBlock
bool isEntryBlock() const
Returns true if this is the entry block of the function.
Definition: MachineBasicBlock.cpp:286
llvm::MachineFunctionPass
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
Definition: MachineFunctionPass.h:30
MachineSizeOpts.h
llvm::initializeMachineBlockPlacementPass
void initializeMachineBlockPlacementPass(PassRegistry &)
MachineBasicBlock.h
llvm::TargetLoweringBase::getPrefLoopAlignment
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
Definition: TargetLoweringBase.cpp:1998
llvm::TargetSubtargetInfo::getRegisterInfo
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
Definition: TargetSubtargetInfo.h:125
EnableExtTspBlockPlacement
cl::opt< bool > EnableExtTspBlockPlacement
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:139
llvm::DenseMapBase< DenseMap< KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::erase
bool erase(const KeyT &Val)
Definition: DenseMap.h:304
AlignAllBlock
static cl::opt< unsigned > AlignAllBlock("align-all-blocks", cl::desc("Force the alignment of all blocks in the function in log2 format " "(e.g 4 means align on 16B boundaries)."), cl::init(0), cl::Hidden)
llvm::DenseMapBase< DenseMap< KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::begin
iterator begin()
Definition: DenseMap.h:75
InstrCount
static unsigned InstrCount
Definition: DFAPacketizer.cpp:53
llvm::SpecificBumpPtrAllocator
A BumpPtrAllocator that allows only elements of a specific type to be allocated.
Definition: Allocator.h:379
llvm::BlockFrequency::getMaxFrequency
static uint64_t getMaxFrequency()
Returns the maximum possible frequency, the saturation value.
Definition: BlockFrequency.h:30
BlockFrequency.h
DenseMap.h
llvm::BranchProbability::getZero
static BranchProbability getZero()
Definition: BranchProbability.h:48
llvm::reverse
auto reverse(ContainerTy &&C, std::enable_if_t< has_rbegin< ContainerTy >::value > *=nullptr)
Definition: STLExtras.h:380
TargetInstrInfo.h
llvm::sys::path::end
const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:235
llvm::sys::path::begin
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:226
llvm::TailDuplicator::shouldTailDuplicate
bool shouldTailDuplicate(bool IsSimple, MachineBasicBlock &TailBB)
Determine if it is profitable to duplicate this block.
Definition: TailDuplicator.cpp:551
getLayoutSuccessorProbThreshold
static BranchProbability getLayoutSuccessorProbThreshold(const MachineBasicBlock *BB)
Definition: MachineBlockPlacement.cpp:1366
MBFIWrapper.h
llvm::DenseMapBase< DenseMap< KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::count
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:147
llvm::SmallPtrSet
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:450
llvm::dump
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Definition: SparseBitVector.h:877
STLExtras.h
llvm::MachineBasicBlock::back
MachineInstr & back()
Definition: MachineBasicBlock.h:257
llvm::TargetLoweringBase::alignLoopsWithOptSize
virtual bool alignLoopsWithOptSize() const
Should loops be aligned even when the function is marked OptSize (but not MinSize).
Definition: TargetLowering.h:1831
llvm::SPIRV::ImageOperand::Bias
@ Bias
llvm::LoopBase::getNumBlocks
unsigned getNumBlocks() const
Get the number of blocks in this loop in constant time.
Definition: LoopInfo.h:185
llvm::MachineBlockFrequencyInfo::getBlockFreq
BlockFrequency getBlockFreq(const MachineBasicBlock *MBB) const
getblockFreq - Return block frequency.
Definition: MachineBlockFrequencyInfo.cpp:230
llvm::MachineFunctionPass::getAnalysisUsage
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
Definition: MachineFunctionPass.cpp:103
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
F
#define F(x, y, z)
Definition: MD5.cpp:55
llvm::RISCVFenceField::R
@ R
Definition: RISCVBaseInfo.h:240
llvm::MachineLoopInfo
Definition: MachineLoopInfo.h:89
llvm::MachineBasicBlock::succ_rbegin
succ_reverse_iterator succ_rbegin()
Definition: MachineBasicBlock.h:345
llvm::LoopBase::block_end
block_iterator block_end() const
Definition: LoopInfo.h:177
llvm::MachineBasicBlock::dump
void dump() const
Definition: MachineBasicBlock.cpp:291
llvm::TailDuplicator::isSimpleBB
static bool isSimpleBB(MachineBasicBlock *TailBB)
True if this BB has only one unconditional jump.
Definition: TailDuplicator.cpp:681
llvm::sys::path::append
void append(SmallVectorImpl< char > &path, const Twine &a, const Twine &b="", const Twine &c="", const Twine &d="")
Append to path.
Definition: Path.cpp:456
TailDupPlacementThreshold
static cl::opt< unsigned > TailDupPlacementThreshold("tail-dup-placement-threshold", cl::desc("Instruction cutoff for tail duplication during layout. " "Tail merging during layout is forced to have a threshold " "that won't conflict."), cl::init(2), cl::Hidden)
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
CommandLine.h
llvm::MachineBasicBlock::pred_size
unsigned pred_size() const
Definition: MachineBasicBlock.h:337
TargetLowering.h
llvm::shouldOptimizeForSize
bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
Definition: MachineSizeOpts.cpp:183
BlockFrequencyInfoImpl.h
llvm::TargetInstrInfo
TargetInstrInfo - Interface to description of machine instruction set.
Definition: TargetInstrInfo.h:97
MachineLoopInfo.h
TargetMachine.h
llvm::PassRegistry::getPassRegistry
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Definition: PassRegistry.cpp:31
llvm::MutableArrayRef
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:306
llvm::MachineBlockFrequencyInfo
MachineBlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate machine basic b...
Definition: MachineBlockFrequencyInfo.h:33
llvm::MachineBranchProbabilityInfo
Definition: MachineBranchProbabilityInfo.h:22
E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
llvm::ARM_PROC::A
@ A
Definition: ARMBaseInfo.h:34
llvm::MachineBasicBlock::isSuccessor
bool isSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB is a successor of this block.
Definition: MachineBasicBlock.cpp:909
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
llvm::ms_demangle::QualifierMangleMode::Result
@ Result
llvm::TargetPassConfig::getEnableTailMerge
bool getEnableTailMerge() const
Definition: TargetPassConfig.h:176
UINT64_MAX
#define UINT64_MAX
Definition: DataTypes.h:77
llvm::TargetLoweringBase::getMaxPermittedBytesForAlignment
virtual unsigned getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const
Return the maximum amount of bytes allowed to be emitted when padding for alignment.
Definition: TargetLoweringBase.cpp:2004
false
Definition: StackSlotColoring.cpp:141
TII
const HexagonInstrInfo * TII
Definition: HexagonCopyToCombine.cpp:127
merge
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
Definition: LoopDeletion.cpp:53
llvm::MachineFunction::size
unsigned size() const
Definition: MachineFunction.h:832
B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
llvm::LoopBase::getBlocks
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:171
llvm::StaticLikelyProb
cl::opt< unsigned > StaticLikelyProb
llvm::STATISTIC
STATISTIC(NumFunctions, "Total number of functions")
AlignAllNonFallThruBlocks
static cl::opt< unsigned > AlignAllNonFallThruBlocks("align-all-nofallthru-blocks", cl::desc("Force the alignment of all blocks that have no fall-through " "predecessors (i.e. don't add nops that are executed). In log2 " "format (e.g 4 means align on 16B boundaries)."), cl::init(0), cl::Hidden)
llvm::HexagonInstrInfo::insertBranch
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
Insert branch code into the end of the specified MachineBasicBlock.
Definition: HexagonInstrInfo.cpp:625
llvm::raw_ostream::flush
void flush()
Definition: raw_ostream.h:187
llvm::MachineFunction::begin
iterator begin()
Definition: MachineFunction.h:822
llvm::ProfileLikelyProb
cl::opt< unsigned > ProfileLikelyProb
DebugLoc.h
SmallPtrSet.h
JumpInstCost
static cl::opt< unsigned > JumpInstCost("jump-inst-cost", cl::desc("Cost of jump instructions."), cl::init(1), cl::Hidden)
BranchFolding.h
TailDupPlacement
static cl::opt< bool > TailDupPlacement("tail-dup-placement", cl::desc("Perform tail duplication during placement. " "Creates more fallthrough opportunites in " "outline branches."), cl::init(true), cl::Hidden)
Align
uint64_t Align
Definition: ELFObjHandler.cpp:81
CodeLayout.h
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
INITIALIZE_PASS_BEGIN
INITIALIZE_PASS_BEGIN(MachineBlockPlacement, DEBUG_TYPE, "Branch Probability Basic Block Placement", false, false) INITIALIZE_PASS_END(MachineBlockPlacement
llvm::CallingConv::ID
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
llvm::erase_value
void erase_value(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:1805
BranchProbability.h
llvm::MachineBasicBlock
Definition: MachineBasicBlock.h:94
INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:58
llvm::ProfileSummaryInfo
Analysis providing profile information.
Definition: ProfileSummaryInfo.h:39
BranchFoldPlacement
static cl::opt< bool > BranchFoldPlacement("branch-fold-placement", cl::desc("Perform branch folding during placement. " "Reduces code size."), cl::init(true), cl::Hidden)
llvm::MachineBasicBlock::setAlignment
void setAlignment(Align A)
Set alignment of the basic block.
Definition: MachineBasicBlock.h:531
llvm::LoopBase::block_begin
block_iterator block_begin() const
Definition: LoopInfo.h:176
llvm::function_ref
An efficient, type-erasing, non-owning reference to a callable.
Definition: STLFunctionalExtras.h:36
llvm::TargetPassConfig
Target-Independent Code Generator Pass Configuration Options.
Definition: TargetPassConfig.h:84
llvm::MachineFunction::getSubtarget
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Definition: MachineFunction.h:640
llvm::cl::opt
Definition: CommandLine.h:1392
TriangleChainCount
static cl::opt< unsigned > TriangleChainCount("triangle-chain-count", cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the " "triangle tail duplication heuristic to kick in. 0 to disable."), cl::init(2), cl::Hidden)
TailDupProfilePercentThreshold
static cl::opt< unsigned > TailDupProfilePercentThreshold("tail-dup-profile-percent-threshold", cl::desc("If profile count information is used in tail duplication cost " "model, the gained fall through number from tail duplication " "should be at least this percent of hot count."), cl::init(50), cl::Hidden)
llvm::MachineLoop
Definition: MachineLoopInfo.h:44
llvm::BlockFrequency
Definition: BlockFrequency.h:23
TailMergeSize
static cl::opt< unsigned > TailMergeSize("tail-merge-size", cl::desc("Min number of instructions to consider tail merging"), cl::init(3), cl::Hidden)
llvm::DenseMapBase< DenseMap< KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::clear
void clear()
Definition: DenseMap.h:112
llvm::count
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1699
MaxBytesForAlignmentOverride
static cl::opt< unsigned > MaxBytesForAlignmentOverride("max-bytes-for-alignment", cl::desc("Forces the maximum bytes allowed to be emitted when padding for " "alignment"), cl::init(0), cl::Hidden)
llvm::MachineInstr
Representation of each machine instruction.
Definition: MachineInstr.h:66
uint64_t
ProfileSummaryInfo.h
llvm::ViewBlockLayoutWithBFI
cl::opt< GVDAGType > ViewBlockLayoutWithBFI("view-block-layout-with-bfi", cl::Hidden, cl::desc("Pop up a window to show a dag displaying MBP layout and associated " "block frequencies of the CFG."), cl::values(clEnumValN(GVDT_None, "none", "do not display graphs."), clEnumValN(GVDT_Fraction, "fraction", "display a graph using the " "fractional block frequency representation."), clEnumValN(GVDT_Integer, "integer", "display a graph using the raw " "integer fractional block frequency representation."), clEnumValN(GVDT_Count, "count", "display a graph using the real " "profile count if available.")))
Definition: MachineBlockPlacement.cpp:213
llvm::find
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1627
ExitBlockBias
static cl::opt< unsigned > ExitBlockBias("block-placement-exit-block-bias", cl::desc("Block frequency percentage a loop exit block needs " "over the original exit to be considered the new exit."), cl::init(0), cl::Hidden)
INITIALIZE_PASS_DEPENDENCY
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
move
compiles ldr LCPI1_0 ldr ldr mov lsr tst moveq r1 ldr LCPI1_1 and r0 bx lr It would be better to do something like to fold the shift into the conditional move
Definition: README.txt:546
llvm::DenseMap
Definition: DenseMap.h:716
llvm::TailDuplicator::tailDuplicateAndUpdate
bool tailDuplicateAndUpdate(bool IsSimple, MachineBasicBlock *MBB, MachineBasicBlock *ForcedLayoutPred, SmallVectorImpl< MachineBasicBlock * > *DuplicatedPreds=nullptr, function_ref< void(MachineBasicBlock *)> *RemovalCallback=nullptr, SmallVectorImpl< MachineBasicBlock * > *CandidatePtr=nullptr)
Tail duplicate a single basic block into its predecessors, and then clean up.
Definition: TailDuplicator.cpp:155
I
#define I(x, y, z)
Definition: MD5.cpp:58
llvm::BlockFrequency::getFrequency
uint64_t getFrequency() const
Returns the frequency as a fixpoint number scaled by the entry frequency.
Definition: BlockFrequency.h:34
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:432
greaterWithBias
static bool greaterWithBias(BlockFrequency A, BlockFrequency B, uint64_t EntryFreq)
Compare 2 BlockFrequency's with a small penalty for A.
Definition: MachineBlockPlacement.cpp:785
llvm::ProfileSummaryInfoWrapperPass
An analysis pass based on legacy pass manager to deliver ProfileSummaryInfo.
Definition: ProfileSummaryInfo.h:193
ArrayRef.h
TargetPassConfig.h
MachineFunctionPass.h
llvm::DenseMapBase< DenseMap< KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::find
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:152
llvm::SmallVectorImpl::const_iterator
typename SuperClass::const_iterator const_iterator
Definition: SmallVector.h:559
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::MachineBasicBlock::canFallThrough
bool canFallThrough()
Return true if the block can implicitly transfer control to the block after it by falling off the end...
Definition: MachineBasicBlock.cpp:962
LoopToColdBlockRatio
static cl::opt< unsigned > LoopToColdBlockRatio("loop-to-cold-block-ratio", cl::desc("Outline loop blocks from loop chain if (frequency of loop) / " "(frequency of block) is greater than this ratio"), cl::init(5), cl::Hidden)
MachineBranchProbabilityInfo.h
llvm::MachineBasicBlock::succ_begin
succ_iterator succ_begin()
Definition: MachineBasicBlock.h:341
std::swap
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:853
llvm::MachineBasicBlock::getParent
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
Definition: MachineBasicBlock.h:234
hasSameSuccessors
static bool hasSameSuccessors(MachineBasicBlock &BB, SmallPtrSetImpl< const MachineBasicBlock * > &Successors)
Check if BB has exactly the successors in Successors.
Definition: MachineBlockPlacement.cpp:752
MachinePostDominators.h
llvm::MachineBasicBlock::predecessors
iterator_range< pred_iterator > predecessors()
Definition: MachineBasicBlock.h:358
llvm::MachineBranchProbabilityInfo::getEdgeProbability
BranchProbability getEdgeProbability(const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const
Definition: MachineBranchProbabilityInfo.cpp:56
llvm::CodeGenOpt::Aggressive
@ Aggressive
Definition: CodeGen.h:56
llvm::MachineBasicBlock::instr_begin
instr_iterator instr_begin()
Definition: MachineBasicBlock.h:261
llvm::MachineBlockPlacementStatsID
char & MachineBlockPlacementStatsID
MachineBlockPlacementStats - This pass collects statistics about the basic block placement using bran...
Definition: MachineBlockPlacement.cpp:3648
MisfetchCost
static cl::opt< unsigned > MisfetchCost("misfetch-cost", cl::desc("Cost that models the probabilistic risk of an instruction " "misfetch due to a jump comparing to falling through, whose cost " "is zero."), cl::init(1), cl::Hidden)
llvm::SmallPtrSetImpl::count
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:383
llvm::MachineBasicBlock::instr_end
instr_iterator instr_end()
Definition: MachineBasicBlock.h:263
llvm::MachineFunction
Definition: MachineFunction.h:241
llvm::BranchProbability::getOne
static BranchProbability getOne()
Definition: BranchProbability.h:49
llvm::MachineBasicBlock::succ_empty
bool succ_empty() const
Definition: MachineBasicBlock.h:356
llvm::sys::fs::remove
std::error_code remove(const Twine &path, bool IgnoreNonExisting=true)
Remove path.
llvm::MachineBasicBlock::getNumber
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFuncti...
Definition: MachineBasicBlock.h:1078
llvm::min
Expected< ExpressionValue > min(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:357
Cond
SmallVector< MachineOperand, 4 > Cond
Definition: BasicBlockSections.cpp:178
llvm::MachineBasicBlock::successors
iterator_range< succ_iterator > successors()
Definition: MachineBasicBlock.h:364
llvm::MachineBasicBlock::isEHPad
bool isEHPad() const
Returns true if the block is a landing pad.
Definition: MachineBasicBlock.h:548
llvm::MachineBasicBlock::findBranchDebugLoc
DebugLoc findBranchDebugLoc()
Find and return the merged DebugLoc of the branch instructions of the block.
Definition: MachineBasicBlock.cpp:1408
if
if(llvm_vc STREQUAL "") set(fake_version_inc "$
Definition: CMakeLists.txt:14
uint32_t
llvm::BranchProbability
Definition: BranchProbability.h:29
Compiler.h
llvm::ilist_node_impl::getIterator
self_iterator getIterator()
Definition: ilist_node.h:82
TargetSubtargetInfo.h
llvm::MachineBlockPlacementID
char & MachineBlockPlacementID
MachineBlockPlacement - This pass places basic blocks based on branch probabilities.
Definition: MachineBlockPlacement.cpp:609
PreciseRotationCost
static cl::opt< bool > PreciseRotationCost("precise-rotation-cost", cl::desc("Model the cost of loop rotation more " "precisely by using profile data."), cl::init(false), cl::Hidden)
llvm::format
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:124
llvm::TrackingStatistic
Definition: Statistic.h:50
for
this could be done in SelectionDAGISel along with other special for
Definition: README.txt:104
llvm::DenseMapBase< DenseMap< KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::insert
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:209
llvm::MachinePostDominatorTree
MachinePostDominatorTree - an analysis pass wrapper for DominatorTree used to compute the post-domina...
Definition: MachinePostDominators.h:27
llvm::SpecificBumpPtrAllocator::Allocate
T * Allocate(size_t num=1)
Allocate space for an array of objects without constructing them.
Definition: Allocator.h:429
llvm::SmallPtrSetImplBase::size
size_type size() const
Definition: SmallPtrSet.h:93
llvm::Function::hasOptSize
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:663
llvm::DenseMapBase< DenseMap< KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::empty
LLVM_NODISCARD bool empty() const
Definition: DenseMap.h:98
MBB
MachineBasicBlock & MBB
Definition: AArch64SLSHardening.cpp:74
llvm::MachinePostDominatorTree::dominates
bool dominates(const MachineDomTreeNode *A, const MachineDomTreeNode *B) const
Definition: MachinePostDominators.h:54
getBlockName
Branch Probability Basic Block static false std::string getBlockName(const MachineBasicBlock *BB)
Helper to print the name of a MBB.
Definition: MachineBlockPlacement.cpp:625
llvm::calcExtTspScore
double calcExtTspScore(const std::vector< uint64_t > &Order, const std::vector< uint64_t > &NodeSizes, const std::vector< uint64_t > &NodeCounts, const DenseMap< std::pair< uint64_t, uint64_t >, uint64_t > &EdgeCounts)
Estimate the "quality" of a given node order in CFG.
Definition: CodeLayout.cpp:932
stats
Definition: MCAssembler.cpp:53
llvm::stable_sort
void stable_sort(R &&Range)
Definition: STLExtras.h:1751
llvm::MachineFunction::getFunction
Function & getFunction()
Return the LLVM function that this machine code represents.
Definition: MachineFunction.h:606
Stats
block placement Basic Block Placement Stats
Definition: MachineBlockPlacement.cpp:3655
llvm::AnalysisUsage::setPreservesAll
void setPreservesAll()
Set by analyses that do not transform their input at all.
Definition: PassAnalysisSupport.h:130
ForcePreciseRotationCost
static cl::opt< bool > ForcePreciseRotationCost("force-precise-rotation-cost", cl::desc("Force the use of precise cost " "loop rotation strategy."), cl::init(false), cl::Hidden)
llvm::MachineFunction::getTarget
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Definition: MachineFunction.h:636
rotate
The same transformation can work with an even modulo with the addition of a rotate
Definition: README.txt:680
llvm::DenseMapBase< DenseMap< KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::end
iterator end()
Definition: DenseMap.h:84
llvm::X86::FirstMacroFusionInstKind::Cmp
@ Cmp
llvm::ilist_iterator
Iterator for intrusive lists based on ilist_node.
Definition: ilist_iterator.h:57
llvm::MCID::Branch
@ Branch
Definition: MCInstrDesc.h:158
Function.h
llvm::LoopBase::getHeader
BlockT * getHeader() const
Definition: LoopInfo.h:104
llvm::DenseMapBase< DenseMap< KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::size
unsigned size() const
Definition: DenseMap.h:101
llvm::initializeMachineBlockPlacementStatsPass
void initializeMachineBlockPlacementStatsPass(PassRegistry &)
llvm::Function::hasProfileData
bool hasProfileData(bool IncludeSynthetic=false) const
Return true if the function is annotated with profile data.
Definition: Function.h:290
llvm::SmallVectorImpl::clear
void clear()
Definition: SmallVector.h:591
llvm::MachineBasicBlock::isLayoutSuccessor
bool isLayoutSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB will be emitted immediately after this block, such that if this bloc...
Definition: MachineBasicBlock.cpp:913
ForceLoopColdBlock
static cl::opt< bool > ForceLoopColdBlock("force-loop-cold-block", cl::desc("Force outlining cold blocks from loops."), cl::init(false), cl::Hidden)
llvm::BranchProbability::getCompl
BranchProbability getCompl() const
Definition: BranchProbability.h:68
llvm::MachineBasicBlock::getFallThrough
MachineBasicBlock * getFallThrough()
Return the fallthrough block if the block can implicitly transfer control to the block after it by fa...
Definition: MachineBasicBlock.cpp:918
llvm::MachinePostDominatorTree::runOnMachineFunction
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
Definition: MachinePostDominators.cpp:40
CodeGen.h
llvm::applyExtTspLayout
std::vector< uint64_t > applyExtTspLayout(const std::vector< uint64_t > &NodeSizes, const std::vector< uint64_t > &NodeCounts, const DenseMap< std::pair< uint64_t, uint64_t >, uint64_t > &EdgeCounts)
Find a layout of nodes (basic blocks) of a given CFG optimizing jump locality and thus processor I-ca...
Definition: CodeLayout.cpp:911
llvm::TailDuplicator::canTailDuplicate
bool canTailDuplicate(MachineBasicBlock *TailBB, MachineBasicBlock *PredBB)
Returns true if TailBB can successfully be duplicated into PredBB.
Definition: TailDuplicator.cpp:790
llvm::HexagonInstrInfo::reverseBranchCondition
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
Reverses the branch condition of the specified condition list, returning false on success and true if...
Definition: HexagonInstrInfo.cpp:1625
llvm::TargetSubtargetInfo::getTargetLowering
virtual const TargetLowering * getTargetLowering() const
Definition: TargetSubtargetInfo.h:97
Placement
Branch Probability Basic Block Placement
Definition: MachineBlockPlacement.cpp:619
SmallVector.h
llvm::TailDuplicator
Utility class to perform tail duplication.
Definition: TailDuplicator.h:39
llvm::GVDT_None
@ GVDT_None
Definition: BlockFrequencyInfoImpl.h:1805
DEBUG_TYPE
#define DEBUG_TYPE
Definition: MachineBlockPlacement.cpp:77
llvm::SmallVectorImpl::iterator
typename SuperClass::iterator iterator
Definition: SmallVector.h:558
llvm::SmallPtrSetImplBase::empty
LLVM_NODISCARD bool empty() const
Definition: SmallPtrSet.h:92
llvm::DenseMapBase< DenseMap< KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::try_emplace
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&... Args)
Definition: DenseMap.h:224
TailDuplicator.h
llvm::max
Align max(MaybeAlign Lhs, Align Rhs)
Definition: Alignment.h:340
llvm::DenseMapBase< DenseMap< KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::reserve
void reserve(size_type NumEntries)
Grow the densemap so that it can contain at least NumEntries items before resizing again.
Definition: DenseMap.h:105
TailDupPlacementPenalty
static cl::opt< unsigned > TailDupPlacementPenalty("tail-dup-placement-penalty", cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. " "Copying can increase fallthrough, but it also increases icache " "pressure. This parameter controls the penalty to account for that. " "Percent as integer."), cl::init(2), cl::Hidden)
llvm::SmallVectorImpl
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: APFloat.h:42
ApplyExtTspWithoutProfile
cl::opt< bool > ApplyExtTspWithoutProfile
llvm::SmallPtrSetImpl
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:344
llvm::SmallSetVector
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:307
BB
Common register allocation spilling lr str ldr sxth r3 ldr mla r4 can lr mov lr str ldr sxth r3 mla r4 and then merge mul and lr str ldr sxth r3 mla r4 It also increase the likelihood the store may become dead bb27 Successors according to LLVM BB
Definition: README.txt:39
llvm::ViewBlockFreqFuncName
cl::opt< std::string > ViewBlockFreqFuncName("view-bfi-func-name", cl::Hidden, cl::desc("The option to specify " "the name of the function " "whose CFG will be displayed."))
Definition: MachineBlockFrequencyInfo.cpp:66
llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition: PassAnalysisSupport.h:75
llvm::DebugLoc
A debug info location.
Definition: DebugLoc.h:33
llvm::cl::desc
Definition: CommandLine.h:405
llvm::MachineBasicBlock::updateTerminator
void updateTerminator(MachineBasicBlock *PreviousLayoutSuccessor)
Update the terminator instructions in block to account for changes to block layout which may have bee...
Definition: MachineBasicBlock.cpp:642
raw_ostream.h
TailDupPlacementAggressiveThreshold
static cl::opt< unsigned > TailDupPlacementAggressiveThreshold("tail-dup-placement-aggressive-threshold", cl::desc("Instruction cutoff for aggressive tail duplication during " "layout. Used at -O3. Tail merging during layout is forced to " "have a threshold that won't conflict."), cl::init(4), cl::Hidden)
MachineFunction.h
InitializePasses.h
MachineBlockFrequencyInfo.h
Debug.h
llvm::TargetPassConfig::getOptLevel
CodeGenOpt::Level getOptLevel() const
Definition: TargetPassConfig.cpp:645
llvm::instructionsWithoutDebug
auto instructionsWithoutDebug(IterT It, IterT End, bool SkipPseudoOp=true)
Construct a range iterator which begins at It and moves forwards until End is reached,...
Definition: MachineBasicBlock.h:1266
SetVector.h
llvm::SpecificBumpPtrAllocator::DestroyAll
void DestroyAll()
Call the destructor of each allocated object and deallocate all but the current slab and reset the cu...
Definition: Allocator.h:400
countMBBInstruction
static uint64_t countMBBInstruction(MachineBasicBlock *MBB)
Definition: MachineBlockPlacement.cpp:3137
llvm::SmallVectorImpl::emplace_back
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:927
llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:365
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:37