LLVM 19.0.0git
MachineBlockPlacement.cpp
Go to the documentation of this file.
1//===- MachineBlockPlacement.cpp - Basic Block Code Layout optimization ---===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements basic block placement transformations using the CFG
10// structure and branch probability estimates.
11//
12// The pass strives to preserve the structure of the CFG (that is, retain
13// a topological ordering of basic blocks) in the absence of a *strong* signal
14// to the contrary from probabilities. However, within the CFG structure, it
15// attempts to choose an ordering which favors placing more likely sequences of
16// blocks adjacent to each other.
17//
18// The algorithm works from the inner-most loop within a function outward, and
19// at each stage walks through the basic blocks, trying to coalesce them into
20// sequential chains where allowed by the CFG (or demanded by heavy
21// probabilities). Finally, it walks the blocks in topological order, and the
22// first time it reaches a chain of basic blocks, it schedules them in the
23// function in-order.
24//
25//===----------------------------------------------------------------------===//
26
27#include "BranchFolding.h"
28#include "llvm/ADT/ArrayRef.h"
29#include "llvm/ADT/DenseMap.h"
30#include "llvm/ADT/STLExtras.h"
31#include "llvm/ADT/SetVector.h"
34#include "llvm/ADT/Statistic.h"
51#include "llvm/IR/DebugLoc.h"
52#include "llvm/IR/Function.h"
53#include "llvm/IR/PrintPasses.h"
55#include "llvm/Pass.h"
62#include "llvm/Support/Debug.h"
66#include <algorithm>
67#include <cassert>
68#include <cstdint>
69#include <iterator>
70#include <memory>
71#include <string>
72#include <tuple>
73#include <utility>
74#include <vector>
75
76using namespace llvm;
77
78#define DEBUG_TYPE "block-placement"
79
80STATISTIC(NumCondBranches, "Number of conditional branches");
81STATISTIC(NumUncondBranches, "Number of unconditional branches");
82STATISTIC(CondBranchTakenFreq,
83 "Potential frequency of taking conditional branches");
84STATISTIC(UncondBranchTakenFreq,
85 "Potential frequency of taking unconditional branches");
86
88 "align-all-blocks",
89 cl::desc("Force the alignment of all blocks in the function in log2 format "
90 "(e.g 4 means align on 16B boundaries)."),
92
94 "align-all-nofallthru-blocks",
95 cl::desc("Force the alignment of all blocks that have no fall-through "
96 "predecessors (i.e. don't add nops that are executed). In log2 "
97 "format (e.g 4 means align on 16B boundaries)."),
99
101 "max-bytes-for-alignment",
102 cl::desc("Forces the maximum bytes allowed to be emitted when padding for "
103 "alignment"),
104 cl::init(0), cl::Hidden);
105
106// FIXME: Find a good default for this flag and remove the flag.
108 "block-placement-exit-block-bias",
109 cl::desc("Block frequency percentage a loop exit block needs "
110 "over the original exit to be considered the new exit."),
111 cl::init(0), cl::Hidden);
112
113// Definition:
114// - Outlining: placement of a basic block outside the chain or hot path.
115
117 "loop-to-cold-block-ratio",
118 cl::desc("Outline loop blocks from loop chain if (frequency of loop) / "
119 "(frequency of block) is greater than this ratio"),
120 cl::init(5), cl::Hidden);
121
123 "force-loop-cold-block",
124 cl::desc("Force outlining cold blocks from loops."),
125 cl::init(false), cl::Hidden);
126
127static cl::opt<bool>
128 PreciseRotationCost("precise-rotation-cost",
129 cl::desc("Model the cost of loop rotation more "
130 "precisely by using profile data."),
131 cl::init(false), cl::Hidden);
132
133static cl::opt<bool>
134 ForcePreciseRotationCost("force-precise-rotation-cost",
135 cl::desc("Force the use of precise cost "
136 "loop rotation strategy."),
137 cl::init(false), cl::Hidden);
138
140 "misfetch-cost",
141 cl::desc("Cost that models the probabilistic risk of an instruction "
142 "misfetch due to a jump comparing to falling through, whose cost "
143 "is zero."),
144 cl::init(1), cl::Hidden);
145
146static cl::opt<unsigned> JumpInstCost("jump-inst-cost",
147 cl::desc("Cost of jump instructions."),
148 cl::init(1), cl::Hidden);
149static cl::opt<bool>
150TailDupPlacement("tail-dup-placement",
151 cl::desc("Perform tail duplication during placement. "
152 "Creates more fallthrough opportunites in "
153 "outline branches."),
154 cl::init(true), cl::Hidden);
155
156static cl::opt<bool>
157BranchFoldPlacement("branch-fold-placement",
158 cl::desc("Perform branch folding during placement. "
159 "Reduces code size."),
160 cl::init(true), cl::Hidden);
161
162// Heuristic for tail duplication.
164 "tail-dup-placement-threshold",
165 cl::desc("Instruction cutoff for tail duplication during layout. "
166 "Tail merging during layout is forced to have a threshold "
167 "that won't conflict."), cl::init(2),
168 cl::Hidden);
169
170// Heuristic for aggressive tail duplication.
172 "tail-dup-placement-aggressive-threshold",
173 cl::desc("Instruction cutoff for aggressive tail duplication during "
174 "layout. Used at -O3. Tail merging during layout is forced to "
175 "have a threshold that won't conflict."), cl::init(4),
176 cl::Hidden);
177
178// Heuristic for tail duplication.
180 "tail-dup-placement-penalty",
181 cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. "
182 "Copying can increase fallthrough, but it also increases icache "
183 "pressure. This parameter controls the penalty to account for that. "
184 "Percent as integer."),
185 cl::init(2),
186 cl::Hidden);
187
188// Heuristic for tail duplication if profile count is used in cost model.
190 "tail-dup-profile-percent-threshold",
191 cl::desc("If profile count information is used in tail duplication cost "
192 "model, the gained fall through number from tail duplication "
193 "should be at least this percent of hot count."),
194 cl::init(50), cl::Hidden);
195
196// Heuristic for triangle chains.
198 "triangle-chain-count",
199 cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the "
200 "triangle tail duplication heuristic to kick in. 0 to disable."),
201 cl::init(2),
202 cl::Hidden);
203
204// Use case: When block layout is visualized after MBP pass, the basic blocks
205// are labeled in layout order; meanwhile blocks could be numbered in a
206// different order. It's hard to map between the graph and pass output.
207// With this option on, the basic blocks are renumbered in function layout
208// order. For debugging only.
210 "renumber-blocks-before-view",
211 cl::desc(
212 "If true, basic blocks are re-numbered before MBP layout is printed "
213 "into a dot graph. Only used when a function is being printed."),
214 cl::init(false), cl::Hidden);
215
216namespace llvm {
221
222// Internal option used to control BFI display only after MBP pass.
223// Defined in CodeGen/MachineBlockFrequencyInfo.cpp:
224// -view-block-layout-with-bfi=
226
227// Command line option to specify the name of the function for CFG dump
228// Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name=
230} // namespace llvm
231
232namespace {
233
234class BlockChain;
235
236/// Type for our function-wide basic block -> block chain mapping.
237using BlockToChainMapType = DenseMap<const MachineBasicBlock *, BlockChain *>;
238
239/// A chain of blocks which will be laid out contiguously.
240///
241/// This is the datastructure representing a chain of consecutive blocks that
242/// are profitable to layout together in order to maximize fallthrough
243/// probabilities and code locality. We also can use a block chain to represent
244/// a sequence of basic blocks which have some external (correctness)
245/// requirement for sequential layout.
246///
247/// Chains can be built around a single basic block and can be merged to grow
248/// them. They participate in a block-to-chain mapping, which is updated
249/// automatically as chains are merged together.
250class BlockChain {
251 /// The sequence of blocks belonging to this chain.
252 ///
253 /// This is the sequence of blocks for a particular chain. These will be laid
254 /// out in-order within the function.
256
257 /// A handle to the function-wide basic block to block chain mapping.
258 ///
259 /// This is retained in each block chain to simplify the computation of child
260 /// block chains for SCC-formation and iteration. We store the edges to child
261 /// basic blocks, and map them back to their associated chains using this
262 /// structure.
263 BlockToChainMapType &BlockToChain;
264
265public:
266 /// Construct a new BlockChain.
267 ///
268 /// This builds a new block chain representing a single basic block in the
269 /// function. It also registers itself as the chain that block participates
270 /// in with the BlockToChain mapping.
271 BlockChain(BlockToChainMapType &BlockToChain, MachineBasicBlock *BB)
272 : Blocks(1, BB), BlockToChain(BlockToChain) {
273 assert(BB && "Cannot create a chain with a null basic block");
274 BlockToChain[BB] = this;
275 }
276
277 /// Iterator over blocks within the chain.
280
281 /// Beginning of blocks within the chain.
282 iterator begin() { return Blocks.begin(); }
283 const_iterator begin() const { return Blocks.begin(); }
284
285 /// End of blocks within the chain.
286 iterator end() { return Blocks.end(); }
287 const_iterator end() const { return Blocks.end(); }
288
289 bool remove(MachineBasicBlock* BB) {
290 for(iterator i = begin(); i != end(); ++i) {
291 if (*i == BB) {
292 Blocks.erase(i);
293 return true;
294 }
295 }
296 return false;
297 }
298
299 /// Merge a block chain into this one.
300 ///
301 /// This routine merges a block chain into this one. It takes care of forming
302 /// a contiguous sequence of basic blocks, updating the edge list, and
303 /// updating the block -> chain mapping. It does not free or tear down the
304 /// old chain, but the old chain's block list is no longer valid.
305 void merge(MachineBasicBlock *BB, BlockChain *Chain) {
306 assert(BB && "Can't merge a null block.");
307 assert(!Blocks.empty() && "Can't merge into an empty chain.");
308
309 // Fast path in case we don't have a chain already.
310 if (!Chain) {
311 assert(!BlockToChain[BB] &&
312 "Passed chain is null, but BB has entry in BlockToChain.");
313 Blocks.push_back(BB);
314 BlockToChain[BB] = this;
315 return;
316 }
317
318 assert(BB == *Chain->begin() && "Passed BB is not head of Chain.");
319 assert(Chain->begin() != Chain->end());
320
321 // Update the incoming blocks to point to this chain, and add them to the
322 // chain structure.
323 for (MachineBasicBlock *ChainBB : *Chain) {
324 Blocks.push_back(ChainBB);
325 assert(BlockToChain[ChainBB] == Chain && "Incoming blocks not in chain.");
326 BlockToChain[ChainBB] = this;
327 }
328 }
329
330#ifndef NDEBUG
331 /// Dump the blocks in this chain.
332 LLVM_DUMP_METHOD void dump() {
333 for (MachineBasicBlock *MBB : *this)
334 MBB->dump();
335 }
336#endif // NDEBUG
337
338 /// Count of predecessors of any block within the chain which have not
339 /// yet been scheduled. In general, we will delay scheduling this chain
340 /// until those predecessors are scheduled (or we find a sufficiently good
341 /// reason to override this heuristic.) Note that when forming loop chains,
342 /// blocks outside the loop are ignored and treated as if they were already
343 /// scheduled.
344 ///
345 /// Note: This field is reinitialized multiple times - once for each loop,
346 /// and then once for the function as a whole.
347 unsigned UnscheduledPredecessors = 0;
348};
349
350class MachineBlockPlacement : public MachineFunctionPass {
351 /// A type for a block filter set.
353
354 /// Pair struct containing basic block and taildup profitability
355 struct BlockAndTailDupResult {
356 MachineBasicBlock *BB = nullptr;
357 bool ShouldTailDup;
358 };
359
360 /// Triple struct containing edge weight and the edge.
361 struct WeightedEdge {
362 BlockFrequency Weight;
363 MachineBasicBlock *Src = nullptr;
364 MachineBasicBlock *Dest = nullptr;
365 };
366
367 /// work lists of blocks that are ready to be laid out
370
371 /// Edges that have already been computed as optimal.
373
374 /// Machine Function
375 MachineFunction *F = nullptr;
376
377 /// A handle to the branch probability pass.
378 const MachineBranchProbabilityInfo *MBPI = nullptr;
379
380 /// A handle to the function-wide block frequency pass.
381 std::unique_ptr<MBFIWrapper> MBFI;
382
383 /// A handle to the loop info.
384 MachineLoopInfo *MLI = nullptr;
385
386 /// Preferred loop exit.
387 /// Member variable for convenience. It may be removed by duplication deep
388 /// in the call stack.
389 MachineBasicBlock *PreferredLoopExit = nullptr;
390
391 /// A handle to the target's instruction info.
392 const TargetInstrInfo *TII = nullptr;
393
394 /// A handle to the target's lowering info.
395 const TargetLoweringBase *TLI = nullptr;
396
397 /// A handle to the post dominator tree.
398 MachinePostDominatorTree *MPDT = nullptr;
399
400 ProfileSummaryInfo *PSI = nullptr;
401
402 /// Duplicator used to duplicate tails during placement.
403 ///
404 /// Placement decisions can open up new tail duplication opportunities, but
405 /// since tail duplication affects placement decisions of later blocks, it
406 /// must be done inline.
407 TailDuplicator TailDup;
408
409 /// Partial tail duplication threshold.
410 BlockFrequency DupThreshold;
411
412 /// True: use block profile count to compute tail duplication cost.
413 /// False: use block frequency to compute tail duplication cost.
414 bool UseProfileCount = false;
415
416 /// Allocator and owner of BlockChain structures.
417 ///
418 /// We build BlockChains lazily while processing the loop structure of
419 /// a function. To reduce malloc traffic, we allocate them using this
420 /// slab-like allocator, and destroy them after the pass completes. An
421 /// important guarantee is that this allocator produces stable pointers to
422 /// the chains.
424
425 /// Function wide BasicBlock to BlockChain mapping.
426 ///
427 /// This mapping allows efficiently moving from any given basic block to the
428 /// BlockChain it participates in, if any. We use it to, among other things,
429 /// allow implicitly defining edges between chains as the existing edges
430 /// between basic blocks.
432
433#ifndef NDEBUG
434 /// The set of basic blocks that have terminators that cannot be fully
435 /// analyzed. These basic blocks cannot be re-ordered safely by
436 /// MachineBlockPlacement, and we must preserve physical layout of these
437 /// blocks and their successors through the pass.
438 SmallPtrSet<MachineBasicBlock *, 4> BlocksWithUnanalyzableExits;
439#endif
440
441 /// Get block profile count or frequency according to UseProfileCount.
442 /// The return value is used to model tail duplication cost.
443 BlockFrequency getBlockCountOrFrequency(const MachineBasicBlock *BB) {
444 if (UseProfileCount) {
445 auto Count = MBFI->getBlockProfileCount(BB);
446 if (Count)
447 return BlockFrequency(*Count);
448 else
449 return BlockFrequency(0);
450 } else
451 return MBFI->getBlockFreq(BB);
452 }
453
454 /// Scale the DupThreshold according to basic block size.
455 BlockFrequency scaleThreshold(MachineBasicBlock *BB);
456 void initDupThreshold();
457
458 /// Decrease the UnscheduledPredecessors count for all blocks in chain, and
459 /// if the count goes to 0, add them to the appropriate work list.
460 void markChainSuccessors(
461 const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
462 const BlockFilterSet *BlockFilter = nullptr);
463
464 /// Decrease the UnscheduledPredecessors count for a single block, and
465 /// if the count goes to 0, add them to the appropriate work list.
466 void markBlockSuccessors(
467 const BlockChain &Chain, const MachineBasicBlock *BB,
468 const MachineBasicBlock *LoopHeaderBB,
469 const BlockFilterSet *BlockFilter = nullptr);
470
472 collectViableSuccessors(
473 const MachineBasicBlock *BB, const BlockChain &Chain,
474 const BlockFilterSet *BlockFilter,
476 bool isBestSuccessor(MachineBasicBlock *BB, MachineBasicBlock *Pred,
477 BlockFilterSet *BlockFilter);
478 void findDuplicateCandidates(SmallVectorImpl<MachineBasicBlock *> &Candidates,
480 BlockFilterSet *BlockFilter);
481 bool repeatedlyTailDuplicateBlock(
483 const MachineBasicBlock *LoopHeaderBB,
484 BlockChain &Chain, BlockFilterSet *BlockFilter,
485 MachineFunction::iterator &PrevUnplacedBlockIt);
486 bool maybeTailDuplicateBlock(
488 BlockChain &Chain, BlockFilterSet *BlockFilter,
489 MachineFunction::iterator &PrevUnplacedBlockIt,
490 bool &DuplicatedToLPred);
491 bool hasBetterLayoutPredecessor(
492 const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
493 const BlockChain &SuccChain, BranchProbability SuccProb,
494 BranchProbability RealSuccProb, const BlockChain &Chain,
495 const BlockFilterSet *BlockFilter);
496 BlockAndTailDupResult selectBestSuccessor(
497 const MachineBasicBlock *BB, const BlockChain &Chain,
498 const BlockFilterSet *BlockFilter);
499 MachineBasicBlock *selectBestCandidateBlock(
500 const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList);
501 MachineBasicBlock *getFirstUnplacedBlock(
502 const BlockChain &PlacedChain,
503 MachineFunction::iterator &PrevUnplacedBlockIt,
504 const BlockFilterSet *BlockFilter);
505
506 /// Add a basic block to the work list if it is appropriate.
507 ///
508 /// If the optional parameter BlockFilter is provided, only MBB
509 /// present in the set will be added to the worklist. If nullptr
510 /// is provided, no filtering occurs.
511 void fillWorkLists(const MachineBasicBlock *MBB,
512 SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
513 const BlockFilterSet *BlockFilter);
514
515 void buildChain(const MachineBasicBlock *BB, BlockChain &Chain,
516 BlockFilterSet *BlockFilter = nullptr);
517 bool canMoveBottomBlockToTop(const MachineBasicBlock *BottomBlock,
518 const MachineBasicBlock *OldTop);
519 bool hasViableTopFallthrough(const MachineBasicBlock *Top,
520 const BlockFilterSet &LoopBlockSet);
521 BlockFrequency TopFallThroughFreq(const MachineBasicBlock *Top,
522 const BlockFilterSet &LoopBlockSet);
523 BlockFrequency FallThroughGains(const MachineBasicBlock *NewTop,
524 const MachineBasicBlock *OldTop,
525 const MachineBasicBlock *ExitBB,
526 const BlockFilterSet &LoopBlockSet);
527 MachineBasicBlock *findBestLoopTopHelper(MachineBasicBlock *OldTop,
528 const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
529 MachineBasicBlock *findBestLoopTop(
530 const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
531 MachineBasicBlock *findBestLoopExit(
532 const MachineLoop &L, const BlockFilterSet &LoopBlockSet,
533 BlockFrequency &ExitFreq);
534 BlockFilterSet collectLoopBlockSet(const MachineLoop &L);
535 void buildLoopChains(const MachineLoop &L);
536 void rotateLoop(
537 BlockChain &LoopChain, const MachineBasicBlock *ExitingBB,
538 BlockFrequency ExitFreq, const BlockFilterSet &LoopBlockSet);
539 void rotateLoopWithProfile(
540 BlockChain &LoopChain, const MachineLoop &L,
541 const BlockFilterSet &LoopBlockSet);
542 void buildCFGChains();
543 void optimizeBranches();
544 void alignBlocks();
545 /// Returns true if a block should be tail-duplicated to increase fallthrough
546 /// opportunities.
547 bool shouldTailDuplicate(MachineBasicBlock *BB);
548 /// Check the edge frequencies to see if tail duplication will increase
549 /// fallthroughs.
550 bool isProfitableToTailDup(
551 const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
552 BranchProbability QProb,
553 const BlockChain &Chain, const BlockFilterSet *BlockFilter);
554
555 /// Check for a trellis layout.
556 bool isTrellis(const MachineBasicBlock *BB,
557 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
558 const BlockChain &Chain, const BlockFilterSet *BlockFilter);
559
560 /// Get the best successor given a trellis layout.
561 BlockAndTailDupResult getBestTrellisSuccessor(
562 const MachineBasicBlock *BB,
563 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
564 BranchProbability AdjustedSumProb, const BlockChain &Chain,
565 const BlockFilterSet *BlockFilter);
566
567 /// Get the best pair of non-conflicting edges.
568 static std::pair<WeightedEdge, WeightedEdge> getBestNonConflictingEdges(
569 const MachineBasicBlock *BB,
571
572 /// Returns true if a block can tail duplicate into all unplaced
573 /// predecessors. Filters based on loop.
574 bool canTailDuplicateUnplacedPreds(
575 const MachineBasicBlock *BB, MachineBasicBlock *Succ,
576 const BlockChain &Chain, const BlockFilterSet *BlockFilter);
577
578 /// Find chains of triangles to tail-duplicate where a global analysis works,
579 /// but a local analysis would not find them.
580 void precomputeTriangleChains();
581
582 /// Apply a post-processing step optimizing block placement.
583 void applyExtTsp();
584
585 /// Modify the existing block placement in the function and adjust all jumps.
586 void assignBlockOrder(const std::vector<const MachineBasicBlock *> &NewOrder);
587
588 /// Create a single CFG chain from the current block order.
589 void createCFGChainExtTsp();
590
591public:
592 static char ID; // Pass identification, replacement for typeid
593
594 MachineBlockPlacement() : MachineFunctionPass(ID) {
596 }
597
598 bool runOnMachineFunction(MachineFunction &F) override;
599
600 bool allowTailDupPlacement() const {
601 assert(F);
602 return TailDupPlacement && !F->getTarget().requiresStructuredCFG();
603 }
604
605 void getAnalysisUsage(AnalysisUsage &AU) const override {
614 }
615};
616
617} // end anonymous namespace
618
619char MachineBlockPlacement::ID = 0;
620
621char &llvm::MachineBlockPlacementID = MachineBlockPlacement::ID;
622
623INITIALIZE_PASS_BEGIN(MachineBlockPlacement, DEBUG_TYPE,
624 "Branch Probability Basic Block Placement", false, false)
630INITIALIZE_PASS_END(MachineBlockPlacement, DEBUG_TYPE,
631 "Branch Probability Basic Block Placement", false, false)
632
633#ifndef NDEBUG
634/// Helper to print the name of a MBB.
635///
636/// Only used by debug logging.
637static std::string getBlockName(const MachineBasicBlock *BB) {
638 std::string Result;
639 raw_string_ostream OS(Result);
640 OS << printMBBReference(*BB);
641 OS << " ('" << BB->getName() << "')";
642 OS.flush();
643 return Result;
644}
645#endif
646
647/// Mark a chain's successors as having one fewer preds.
648///
649/// When a chain is being merged into the "placed" chain, this routine will
650/// quickly walk the successors of each block in the chain and mark them as
651/// having one fewer active predecessor. It also adds any successors of this
652/// chain which reach the zero-predecessor state to the appropriate worklist.
653void MachineBlockPlacement::markChainSuccessors(
654 const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
655 const BlockFilterSet *BlockFilter) {
656 // Walk all the blocks in this chain, marking their successors as having
657 // a predecessor placed.
658 for (MachineBasicBlock *MBB : Chain) {
659 markBlockSuccessors(Chain, MBB, LoopHeaderBB, BlockFilter);
660 }
661}
662
663/// Mark a single block's successors as having one fewer preds.
664///
665/// Under normal circumstances, this is only called by markChainSuccessors,
666/// but if a block that was to be placed is completely tail-duplicated away,
667/// and was duplicated into the chain end, we need to redo markBlockSuccessors
668/// for just that block.
669void MachineBlockPlacement::markBlockSuccessors(
670 const BlockChain &Chain, const MachineBasicBlock *MBB,
671 const MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter) {
672 // Add any successors for which this is the only un-placed in-loop
673 // predecessor to the worklist as a viable candidate for CFG-neutral
674 // placement. No subsequent placement of this block will violate the CFG
675 // shape, so we get to use heuristics to choose a favorable placement.
676 for (MachineBasicBlock *Succ : MBB->successors()) {
677 if (BlockFilter && !BlockFilter->count(Succ))
678 continue;
679 BlockChain &SuccChain = *BlockToChain[Succ];
680 // Disregard edges within a fixed chain, or edges to the loop header.
681 if (&Chain == &SuccChain || Succ == LoopHeaderBB)
682 continue;
683
684 // This is a cross-chain edge that is within the loop, so decrement the
685 // loop predecessor count of the destination chain.
686 if (SuccChain.UnscheduledPredecessors == 0 ||
687 --SuccChain.UnscheduledPredecessors > 0)
688 continue;
689
690 auto *NewBB = *SuccChain.begin();
691 if (NewBB->isEHPad())
692 EHPadWorkList.push_back(NewBB);
693 else
694 BlockWorkList.push_back(NewBB);
695 }
696}
697
698/// This helper function collects the set of successors of block
699/// \p BB that are allowed to be its layout successors, and return
700/// the total branch probability of edges from \p BB to those
701/// blocks.
702BranchProbability MachineBlockPlacement::collectViableSuccessors(
703 const MachineBasicBlock *BB, const BlockChain &Chain,
704 const BlockFilterSet *BlockFilter,
706 // Adjust edge probabilities by excluding edges pointing to blocks that is
707 // either not in BlockFilter or is already in the current chain. Consider the
708 // following CFG:
709 //
710 // --->A
711 // | / \
712 // | B C
713 // | \ / \
714 // ----D E
715 //
716 // Assume A->C is very hot (>90%), and C->D has a 50% probability, then after
717 // A->C is chosen as a fall-through, D won't be selected as a successor of C
718 // due to CFG constraint (the probability of C->D is not greater than
719 // HotProb to break topo-order). If we exclude E that is not in BlockFilter
720 // when calculating the probability of C->D, D will be selected and we
721 // will get A C D B as the layout of this loop.
722 auto AdjustedSumProb = BranchProbability::getOne();
723 for (MachineBasicBlock *Succ : BB->successors()) {
724 bool SkipSucc = false;
725 if (Succ->isEHPad() || (BlockFilter && !BlockFilter->count(Succ))) {
726 SkipSucc = true;
727 } else {
728 BlockChain *SuccChain = BlockToChain[Succ];
729 if (SuccChain == &Chain) {
730 SkipSucc = true;
731 } else if (Succ != *SuccChain->begin()) {
732 LLVM_DEBUG(dbgs() << " " << getBlockName(Succ)
733 << " -> Mid chain!\n");
734 continue;
735 }
736 }
737 if (SkipSucc)
738 AdjustedSumProb -= MBPI->getEdgeProbability(BB, Succ);
739 else
740 Successors.push_back(Succ);
741 }
742
743 return AdjustedSumProb;
744}
745
746/// The helper function returns the branch probability that is adjusted
747/// or normalized over the new total \p AdjustedSumProb.
750 BranchProbability AdjustedSumProb) {
751 BranchProbability SuccProb;
752 uint32_t SuccProbN = OrigProb.getNumerator();
753 uint32_t SuccProbD = AdjustedSumProb.getNumerator();
754 if (SuccProbN >= SuccProbD)
755 SuccProb = BranchProbability::getOne();
756 else
757 SuccProb = BranchProbability(SuccProbN, SuccProbD);
758
759 return SuccProb;
760}
761
762/// Check if \p BB has exactly the successors in \p Successors.
763static bool
766 if (BB.succ_size() != Successors.size())
767 return false;
768 // We don't want to count self-loops
769 if (Successors.count(&BB))
770 return false;
771 for (MachineBasicBlock *Succ : BB.successors())
772 if (!Successors.count(Succ))
773 return false;
774 return true;
775}
776
777/// Check if a block should be tail duplicated to increase fallthrough
778/// opportunities.
779/// \p BB Block to check.
780bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
781 // Blocks with single successors don't create additional fallthrough
782 // opportunities. Don't duplicate them. TODO: When conditional exits are
783 // analyzable, allow them to be duplicated.
784 bool IsSimple = TailDup.isSimpleBB(BB);
785
786 if (BB->succ_size() == 1)
787 return false;
788 return TailDup.shouldTailDuplicate(IsSimple, *BB);
789}
790
791/// Compare 2 BlockFrequency's with a small penalty for \p A.
792/// In order to be conservative, we apply a X% penalty to account for
793/// increased icache pressure and static heuristics. For small frequencies
794/// we use only the numerators to improve accuracy. For simplicity, we assume the
795/// penalty is less than 100%
796/// TODO(iteratee): Use 64-bit fixed point edge frequencies everywhere.
798 BlockFrequency EntryFreq) {
799 BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
800 BlockFrequency Gain = A - B;
801 return (Gain / ThresholdProb) >= EntryFreq;
802}
803
804/// Check the edge frequencies to see if tail duplication will increase
805/// fallthroughs. It only makes sense to call this function when
806/// \p Succ would not be chosen otherwise. Tail duplication of \p Succ is
807/// always locally profitable if we would have picked \p Succ without
808/// considering duplication.
809bool MachineBlockPlacement::isProfitableToTailDup(
810 const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
811 BranchProbability QProb,
812 const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
813 // We need to do a probability calculation to make sure this is profitable.
814 // First: does succ have a successor that post-dominates? This affects the
815 // calculation. The 2 relevant cases are:
816 // BB BB
817 // | \Qout | \Qout
818 // P| C |P C
819 // = C' = C'
820 // | /Qin | /Qin
821 // | / | /
822 // Succ Succ
823 // / \ | \ V
824 // U/ =V |U \
825 // / \ = D
826 // D E | /
827 // | /
828 // |/
829 // PDom
830 // '=' : Branch taken for that CFG edge
831 // In the second case, Placing Succ while duplicating it into C prevents the
832 // fallthrough of Succ into either D or PDom, because they now have C as an
833 // unplaced predecessor
834
835 // Start by figuring out which case we fall into
836 MachineBasicBlock *PDom = nullptr;
838 // Only scan the relevant successors
839 auto AdjustedSuccSumProb =
840 collectViableSuccessors(Succ, Chain, BlockFilter, SuccSuccs);
841 BranchProbability PProb = MBPI->getEdgeProbability(BB, Succ);
842 auto BBFreq = MBFI->getBlockFreq(BB);
843 auto SuccFreq = MBFI->getBlockFreq(Succ);
844 BlockFrequency P = BBFreq * PProb;
845 BlockFrequency Qout = BBFreq * QProb;
846 BlockFrequency EntryFreq = MBFI->getEntryFreq();
847 // If there are no more successors, it is profitable to copy, as it strictly
848 // increases fallthrough.
849 if (SuccSuccs.size() == 0)
850 return greaterWithBias(P, Qout, EntryFreq);
851
852 auto BestSuccSucc = BranchProbability::getZero();
853 // Find the PDom or the best Succ if no PDom exists.
854 for (MachineBasicBlock *SuccSucc : SuccSuccs) {
855 auto Prob = MBPI->getEdgeProbability(Succ, SuccSucc);
856 if (Prob > BestSuccSucc)
857 BestSuccSucc = Prob;
858 if (PDom == nullptr)
859 if (MPDT->dominates(SuccSucc, Succ)) {
860 PDom = SuccSucc;
861 break;
862 }
863 }
864 // For the comparisons, we need to know Succ's best incoming edge that isn't
865 // from BB.
866 auto SuccBestPred = BlockFrequency(0);
867 for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
868 if (SuccPred == Succ || SuccPred == BB
869 || BlockToChain[SuccPred] == &Chain
870 || (BlockFilter && !BlockFilter->count(SuccPred)))
871 continue;
872 auto Freq = MBFI->getBlockFreq(SuccPred)
873 * MBPI->getEdgeProbability(SuccPred, Succ);
874 if (Freq > SuccBestPred)
875 SuccBestPred = Freq;
876 }
877 // Qin is Succ's best unplaced incoming edge that isn't BB
878 BlockFrequency Qin = SuccBestPred;
879 // If it doesn't have a post-dominating successor, here is the calculation:
880 // BB BB
881 // | \Qout | \
882 // P| C | =
883 // = C' | C
884 // | /Qin | |
885 // | / | C' (+Succ)
886 // Succ Succ /|
887 // / \ | \/ |
888 // U/ =V | == |
889 // / \ | / \|
890 // D E D E
891 // '=' : Branch taken for that CFG edge
892 // Cost in the first case is: P + V
893 // For this calculation, we always assume P > Qout. If Qout > P
894 // The result of this function will be ignored at the caller.
895 // Let F = SuccFreq - Qin
896 // Cost in the second case is: Qout + min(Qin, F) * U + max(Qin, F) * V
897
898 if (PDom == nullptr || !Succ->isSuccessor(PDom)) {
899 BranchProbability UProb = BestSuccSucc;
900 BranchProbability VProb = AdjustedSuccSumProb - UProb;
901 BlockFrequency F = SuccFreq - Qin;
902 BlockFrequency V = SuccFreq * VProb;
903 BlockFrequency QinU = std::min(Qin, F) * UProb;
904 BlockFrequency BaseCost = P + V;
905 BlockFrequency DupCost = Qout + QinU + std::max(Qin, F) * VProb;
906 return greaterWithBias(BaseCost, DupCost, EntryFreq);
907 }
908 BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom);
909 BranchProbability VProb = AdjustedSuccSumProb - UProb;
910 BlockFrequency U = SuccFreq * UProb;
911 BlockFrequency V = SuccFreq * VProb;
912 BlockFrequency F = SuccFreq - Qin;
913 // If there is a post-dominating successor, here is the calculation:
914 // BB BB BB BB
915 // | \Qout | \ | \Qout | \
916 // |P C | = |P C | =
917 // = C' |P C = C' |P C
918 // | /Qin | | | /Qin | |
919 // | / | C' (+Succ) | / | C' (+Succ)
920 // Succ Succ /| Succ Succ /|
921 // | \ V | \/ | | \ V | \/ |
922 // |U \ |U /\ =? |U = |U /\ |
923 // = D = = =?| | D | = =|
924 // | / |/ D | / |/ D
925 // | / | / | = | /
926 // |/ | / |/ | =
927 // Dom Dom Dom Dom
928 // '=' : Branch taken for that CFG edge
929 // The cost for taken branches in the first case is P + U
930 // Let F = SuccFreq - Qin
931 // The cost in the second case (assuming independence), given the layout:
932 // BB, Succ, (C+Succ), D, Dom or the layout:
933 // BB, Succ, D, Dom, (C+Succ)
934 // is Qout + max(F, Qin) * U + min(F, Qin)
935 // compare P + U vs Qout + P * U + Qin.
936 //
937 // The 3rd and 4th cases cover when Dom would be chosen to follow Succ.
938 //
939 // For the 3rd case, the cost is P + 2 * V
940 // For the 4th case, the cost is Qout + min(Qin, F) * U + max(Qin, F) * V + V
941 // We choose 4 over 3 when (P + V) > Qout + min(Qin, F) * U + max(Qin, F) * V
942 if (UProb > AdjustedSuccSumProb / 2 &&
943 !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom], UProb, UProb,
944 Chain, BlockFilter))
945 // Cases 3 & 4
946 return greaterWithBias(
947 (P + V), (Qout + std::max(Qin, F) * VProb + std::min(Qin, F) * UProb),
948 EntryFreq);
949 // Cases 1 & 2
950 return greaterWithBias((P + U),
951 (Qout + std::min(Qin, F) * AdjustedSuccSumProb +
952 std::max(Qin, F) * UProb),
953 EntryFreq);
954}
955
956/// Check for a trellis layout. \p BB is the upper part of a trellis if its
957/// successors form the lower part of a trellis. A successor set S forms the
958/// lower part of a trellis if all of the predecessors of S are either in S or
959/// have all of S as successors. We ignore trellises where BB doesn't have 2
960/// successors because for fewer than 2, it's trivial, and for 3 or greater they
961/// are very uncommon and complex to compute optimally. Allowing edges within S
962/// is not strictly a trellis, but the same algorithm works, so we allow it.
963bool MachineBlockPlacement::isTrellis(
964 const MachineBasicBlock *BB,
965 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
966 const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
967 // Technically BB could form a trellis with branching factor higher than 2.
968 // But that's extremely uncommon.
969 if (BB->succ_size() != 2 || ViableSuccs.size() != 2)
970 return false;
971
973 BB->succ_end());
974 // To avoid reviewing the same predecessors twice.
976
977 for (MachineBasicBlock *Succ : ViableSuccs) {
978 int PredCount = 0;
979 for (auto *SuccPred : Succ->predecessors()) {
980 // Allow triangle successors, but don't count them.
981 if (Successors.count(SuccPred)) {
982 // Make sure that it is actually a triangle.
983 for (MachineBasicBlock *CheckSucc : SuccPred->successors())
984 if (!Successors.count(CheckSucc))
985 return false;
986 continue;
987 }
988 const BlockChain *PredChain = BlockToChain[SuccPred];
989 if (SuccPred == BB || (BlockFilter && !BlockFilter->count(SuccPred)) ||
990 PredChain == &Chain || PredChain == BlockToChain[Succ])
991 continue;
992 ++PredCount;
993 // Perform the successor check only once.
994 if (!SeenPreds.insert(SuccPred).second)
995 continue;
996 if (!hasSameSuccessors(*SuccPred, Successors))
997 return false;
998 }
999 // If one of the successors has only BB as a predecessor, it is not a
1000 // trellis.
1001 if (PredCount < 1)
1002 return false;
1003 }
1004 return true;
1005}
1006
1007/// Pick the highest total weight pair of edges that can both be laid out.
1008/// The edges in \p Edges[0] are assumed to have a different destination than
1009/// the edges in \p Edges[1]. Simple counting shows that the best pair is either
1010/// the individual highest weight edges to the 2 different destinations, or in
1011/// case of a conflict, one of them should be replaced with a 2nd best edge.
1012std::pair<MachineBlockPlacement::WeightedEdge,
1013 MachineBlockPlacement::WeightedEdge>
1014MachineBlockPlacement::getBestNonConflictingEdges(
1015 const MachineBasicBlock *BB,
1017 Edges) {
1018 // Sort the edges, and then for each successor, find the best incoming
1019 // predecessor. If the best incoming predecessors aren't the same,
1020 // then that is clearly the best layout. If there is a conflict, one of the
1021 // successors will have to fallthrough from the second best predecessor. We
1022 // compare which combination is better overall.
1023
1024 // Sort for highest frequency.
1025 auto Cmp = [](WeightedEdge A, WeightedEdge B) { return A.Weight > B.Weight; };
1026
1027 llvm::stable_sort(Edges[0], Cmp);
1028 llvm::stable_sort(Edges[1], Cmp);
1029 auto BestA = Edges[0].begin();
1030 auto BestB = Edges[1].begin();
1031 // Arrange for the correct answer to be in BestA and BestB
1032 // If the 2 best edges don't conflict, the answer is already there.
1033 if (BestA->Src == BestB->Src) {
1034 // Compare the total fallthrough of (Best + Second Best) for both pairs
1035 auto SecondBestA = std::next(BestA);
1036 auto SecondBestB = std::next(BestB);
1037 BlockFrequency BestAScore = BestA->Weight + SecondBestB->Weight;
1038 BlockFrequency BestBScore = BestB->Weight + SecondBestA->Weight;
1039 if (BestAScore < BestBScore)
1040 BestA = SecondBestA;
1041 else
1042 BestB = SecondBestB;
1043 }
1044 // Arrange for the BB edge to be in BestA if it exists.
1045 if (BestB->Src == BB)
1046 std::swap(BestA, BestB);
1047 return std::make_pair(*BestA, *BestB);
1048}
1049
1050/// Get the best successor from \p BB based on \p BB being part of a trellis.
1051/// We only handle trellises with 2 successors, so the algorithm is
1052/// straightforward: Find the best pair of edges that don't conflict. We find
1053/// the best incoming edge for each successor in the trellis. If those conflict,
1054/// we consider which of them should be replaced with the second best.
1055/// Upon return the two best edges will be in \p BestEdges. If one of the edges
1056/// comes from \p BB, it will be in \p BestEdges[0]
1057MachineBlockPlacement::BlockAndTailDupResult
1058MachineBlockPlacement::getBestTrellisSuccessor(
1059 const MachineBasicBlock *BB,
1060 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
1061 BranchProbability AdjustedSumProb, const BlockChain &Chain,
1062 const BlockFilterSet *BlockFilter) {
1063
1064 BlockAndTailDupResult Result = {nullptr, false};
1066 BB->succ_end());
1067
1068 // We assume size 2 because it's common. For general n, we would have to do
1069 // the Hungarian algorithm, but it's not worth the complexity because more
1070 // than 2 successors is fairly uncommon, and a trellis even more so.
1071 if (Successors.size() != 2 || ViableSuccs.size() != 2)
1072 return Result;
1073
1074 // Collect the edge frequencies of all edges that form the trellis.
1076 int SuccIndex = 0;
1077 for (auto *Succ : ViableSuccs) {
1078 for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
1079 // Skip any placed predecessors that are not BB
1080 if (SuccPred != BB)
1081 if ((BlockFilter && !BlockFilter->count(SuccPred)) ||
1082 BlockToChain[SuccPred] == &Chain ||
1083 BlockToChain[SuccPred] == BlockToChain[Succ])
1084 continue;
1085 BlockFrequency EdgeFreq = MBFI->getBlockFreq(SuccPred) *
1086 MBPI->getEdgeProbability(SuccPred, Succ);
1087 Edges[SuccIndex].push_back({EdgeFreq, SuccPred, Succ});
1088 }
1089 ++SuccIndex;
1090 }
1091
1092 // Pick the best combination of 2 edges from all the edges in the trellis.
1093 WeightedEdge BestA, BestB;
1094 std::tie(BestA, BestB) = getBestNonConflictingEdges(BB, Edges);
1095
1096 if (BestA.Src != BB) {
1097 // If we have a trellis, and BB doesn't have the best fallthrough edges,
1098 // we shouldn't choose any successor. We've already looked and there's a
1099 // better fallthrough edge for all the successors.
1100 LLVM_DEBUG(dbgs() << "Trellis, but not one of the chosen edges.\n");
1101 return Result;
1102 }
1103
1104 // Did we pick the triangle edge? If tail-duplication is profitable, do
1105 // that instead. Otherwise merge the triangle edge now while we know it is
1106 // optimal.
1107 if (BestA.Dest == BestB.Src) {
1108 // The edges are BB->Succ1->Succ2, and we're looking to see if BB->Succ2
1109 // would be better.
1110 MachineBasicBlock *Succ1 = BestA.Dest;
1111 MachineBasicBlock *Succ2 = BestB.Dest;
1112 // Check to see if tail-duplication would be profitable.
1113 if (allowTailDupPlacement() && shouldTailDuplicate(Succ2) &&
1114 canTailDuplicateUnplacedPreds(BB, Succ2, Chain, BlockFilter) &&
1115 isProfitableToTailDup(BB, Succ2, MBPI->getEdgeProbability(BB, Succ1),
1116 Chain, BlockFilter)) {
1118 MBPI->getEdgeProbability(BB, Succ2), AdjustedSumProb);
1119 dbgs() << " Selected: " << getBlockName(Succ2)
1120 << ", probability: " << Succ2Prob
1121 << " (Tail Duplicate)\n");
1122 Result.BB = Succ2;
1123 Result.ShouldTailDup = true;
1124 return Result;
1125 }
1126 }
1127 // We have already computed the optimal edge for the other side of the
1128 // trellis.
1129 ComputedEdges[BestB.Src] = { BestB.Dest, false };
1130
1131 auto TrellisSucc = BestA.Dest;
1133 MBPI->getEdgeProbability(BB, TrellisSucc), AdjustedSumProb);
1134 dbgs() << " Selected: " << getBlockName(TrellisSucc)
1135 << ", probability: " << SuccProb << " (Trellis)\n");
1136 Result.BB = TrellisSucc;
1137 return Result;
1138}
1139
1140/// When the option allowTailDupPlacement() is on, this method checks if the
1141/// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated
1142/// into all of its unplaced, unfiltered predecessors, that are not BB.
1143bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
1144 const MachineBasicBlock *BB, MachineBasicBlock *Succ,
1145 const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
1146 if (!shouldTailDuplicate(Succ))
1147 return false;
1148
1149 // The result of canTailDuplicate.
1150 bool Duplicate = true;
1151 // Number of possible duplication.
1152 unsigned int NumDup = 0;
1153
1154 // For CFG checking.
1156 BB->succ_end());
1157 for (MachineBasicBlock *Pred : Succ->predecessors()) {
1158 // Make sure all unplaced and unfiltered predecessors can be
1159 // tail-duplicated into.
1160 // Skip any blocks that are already placed or not in this loop.
1161 if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred))
1162 || (BlockToChain[Pred] == &Chain && !Succ->succ_empty()))
1163 continue;
1164 if (!TailDup.canTailDuplicate(Succ, Pred)) {
1165 if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors))
1166 // This will result in a trellis after tail duplication, so we don't
1167 // need to copy Succ into this predecessor. In the presence
1168 // of a trellis tail duplication can continue to be profitable.
1169 // For example:
1170 // A A
1171 // |\ |\
1172 // | \ | \
1173 // | C | C+BB
1174 // | / | |
1175 // |/ | |
1176 // BB => BB |
1177 // |\ |\/|
1178 // | \ |/\|
1179 // | D | D
1180 // | / | /
1181 // |/ |/
1182 // Succ Succ
1183 //
1184 // After BB was duplicated into C, the layout looks like the one on the
1185 // right. BB and C now have the same successors. When considering
1186 // whether Succ can be duplicated into all its unplaced predecessors, we
1187 // ignore C.
1188 // We can do this because C already has a profitable fallthrough, namely
1189 // D. TODO(iteratee): ignore sufficiently cold predecessors for
1190 // duplication and for this test.
1191 //
1192 // This allows trellises to be laid out in 2 separate chains
1193 // (A,B,Succ,...) and later (C,D,...) This is a reasonable heuristic
1194 // because it allows the creation of 2 fallthrough paths with links
1195 // between them, and we correctly identify the best layout for these
1196 // CFGs. We want to extend trellises that the user created in addition
1197 // to trellises created by tail-duplication, so we just look for the
1198 // CFG.
1199 continue;
1200 Duplicate = false;
1201 continue;
1202 }
1203 NumDup++;
1204 }
1205
1206 // No possible duplication in current filter set.
1207 if (NumDup == 0)
1208 return false;
1209
1210 // If profile information is available, findDuplicateCandidates can do more
1211 // precise benefit analysis.
1212 if (F->getFunction().hasProfileData())
1213 return true;
1214
1215 // This is mainly for function exit BB.
1216 // The integrated tail duplication is really designed for increasing
1217 // fallthrough from predecessors from Succ to its successors. We may need
1218 // other machanism to handle different cases.
1219 if (Succ->succ_empty())
1220 return true;
1221
1222 // Plus the already placed predecessor.
1223 NumDup++;
1224
1225 // If the duplication candidate has more unplaced predecessors than
1226 // successors, the extra duplication can't bring more fallthrough.
1227 //
1228 // Pred1 Pred2 Pred3
1229 // \ | /
1230 // \ | /
1231 // \ | /
1232 // Dup
1233 // / \
1234 // / \
1235 // Succ1 Succ2
1236 //
1237 // In this example Dup has 2 successors and 3 predecessors, duplication of Dup
1238 // can increase the fallthrough from Pred1 to Succ1 and from Pred2 to Succ2,
1239 // but the duplication into Pred3 can't increase fallthrough.
1240 //
1241 // A small number of extra duplication may not hurt too much. We need a better
1242 // heuristic to handle it.
1243 if ((NumDup > Succ->succ_size()) || !Duplicate)
1244 return false;
1245
1246 return true;
1247}
1248
1249/// Find chains of triangles where we believe it would be profitable to
1250/// tail-duplicate them all, but a local analysis would not find them.
1251/// There are 3 ways this can be profitable:
1252/// 1) The post-dominators marked 50% are actually taken 55% (This shrinks with
1253/// longer chains)
1254/// 2) The chains are statically correlated. Branch probabilities have a very
1255/// U-shaped distribution.
1256/// [http://nrs.harvard.edu/urn-3:HUL.InstRepos:24015805]
1257/// If the branches in a chain are likely to be from the same side of the
1258/// distribution as their predecessor, but are independent at runtime, this
1259/// transformation is profitable. (Because the cost of being wrong is a small
1260/// fixed cost, unlike the standard triangle layout where the cost of being
1261/// wrong scales with the # of triangles.)
1262/// 3) The chains are dynamically correlated. If the probability that a previous
1263/// branch was taken positively influences whether the next branch will be
1264/// taken
1265/// We believe that 2 and 3 are common enough to justify the small margin in 1.
1266void MachineBlockPlacement::precomputeTriangleChains() {
1267 struct TriangleChain {
1268 std::vector<MachineBasicBlock *> Edges;
1269
1270 TriangleChain(MachineBasicBlock *src, MachineBasicBlock *dst)
1271 : Edges({src, dst}) {}
1272
1273 void append(MachineBasicBlock *dst) {
1274 assert(getKey()->isSuccessor(dst) &&
1275 "Attempting to append a block that is not a successor.");
1276 Edges.push_back(dst);
1277 }
1278
1279 unsigned count() const { return Edges.size() - 1; }
1280
1281 MachineBasicBlock *getKey() const {
1282 return Edges.back();
1283 }
1284 };
1285
1286 if (TriangleChainCount == 0)
1287 return;
1288
1289 LLVM_DEBUG(dbgs() << "Pre-computing triangle chains.\n");
1290 // Map from last block to the chain that contains it. This allows us to extend
1291 // chains as we find new triangles.
1293 for (MachineBasicBlock &BB : *F) {
1294 // If BB doesn't have 2 successors, it doesn't start a triangle.
1295 if (BB.succ_size() != 2)
1296 continue;
1297 MachineBasicBlock *PDom = nullptr;
1298 for (MachineBasicBlock *Succ : BB.successors()) {
1299 if (!MPDT->dominates(Succ, &BB))
1300 continue;
1301 PDom = Succ;
1302 break;
1303 }
1304 // If BB doesn't have a post-dominating successor, it doesn't form a
1305 // triangle.
1306 if (PDom == nullptr)
1307 continue;
1308 // If PDom has a hint that it is low probability, skip this triangle.
1309 if (MBPI->getEdgeProbability(&BB, PDom) < BranchProbability(50, 100))
1310 continue;
1311 // If PDom isn't eligible for duplication, this isn't the kind of triangle
1312 // we're looking for.
1313 if (!shouldTailDuplicate(PDom))
1314 continue;
1315 bool CanTailDuplicate = true;
1316 // If PDom can't tail-duplicate into it's non-BB predecessors, then this
1317 // isn't the kind of triangle we're looking for.
1318 for (MachineBasicBlock* Pred : PDom->predecessors()) {
1319 if (Pred == &BB)
1320 continue;
1321 if (!TailDup.canTailDuplicate(PDom, Pred)) {
1322 CanTailDuplicate = false;
1323 break;
1324 }
1325 }
1326 // If we can't tail-duplicate PDom to its predecessors, then skip this
1327 // triangle.
1328 if (!CanTailDuplicate)
1329 continue;
1330
1331 // Now we have an interesting triangle. Insert it if it's not part of an
1332 // existing chain.
1333 // Note: This cannot be replaced with a call insert() or emplace() because
1334 // the find key is BB, but the insert/emplace key is PDom.
1335 auto Found = TriangleChainMap.find(&BB);
1336 // If it is, remove the chain from the map, grow it, and put it back in the
1337 // map with the end as the new key.
1338 if (Found != TriangleChainMap.end()) {
1339 TriangleChain Chain = std::move(Found->second);
1340 TriangleChainMap.erase(Found);
1341 Chain.append(PDom);
1342 TriangleChainMap.insert(std::make_pair(Chain.getKey(), std::move(Chain)));
1343 } else {
1344 auto InsertResult = TriangleChainMap.try_emplace(PDom, &BB, PDom);
1345 assert(InsertResult.second && "Block seen twice.");
1346 (void)InsertResult;
1347 }
1348 }
1349
1350 // Iterating over a DenseMap is safe here, because the only thing in the body
1351 // of the loop is inserting into another DenseMap (ComputedEdges).
1352 // ComputedEdges is never iterated, so this doesn't lead to non-determinism.
1353 for (auto &ChainPair : TriangleChainMap) {
1354 TriangleChain &Chain = ChainPair.second;
1355 // Benchmarking has shown that due to branch correlation duplicating 2 or
1356 // more triangles is profitable, despite the calculations assuming
1357 // independence.
1358 if (Chain.count() < TriangleChainCount)
1359 continue;
1360 MachineBasicBlock *dst = Chain.Edges.back();
1361 Chain.Edges.pop_back();
1362 for (MachineBasicBlock *src : reverse(Chain.Edges)) {
1363 LLVM_DEBUG(dbgs() << "Marking edge: " << getBlockName(src) << "->"
1364 << getBlockName(dst)
1365 << " as pre-computed based on triangles.\n");
1366
1367 auto InsertResult = ComputedEdges.insert({src, {dst, true}});
1368 assert(InsertResult.second && "Block seen twice.");
1369 (void)InsertResult;
1370
1371 dst = src;
1372 }
1373 }
1374}
1375
1376// When profile is not present, return the StaticLikelyProb.
1377// When profile is available, we need to handle the triangle-shape CFG.
1379 const MachineBasicBlock *BB) {
1380 if (!BB->getParent()->getFunction().hasProfileData())
1382 if (BB->succ_size() == 2) {
1383 const MachineBasicBlock *Succ1 = *BB->succ_begin();
1384 const MachineBasicBlock *Succ2 = *(BB->succ_begin() + 1);
1385 if (Succ1->isSuccessor(Succ2) || Succ2->isSuccessor(Succ1)) {
1386 /* See case 1 below for the cost analysis. For BB->Succ to
1387 * be taken with smaller cost, the following needs to hold:
1388 * Prob(BB->Succ) > 2 * Prob(BB->Pred)
1389 * So the threshold T in the calculation below
1390 * (1-T) * Prob(BB->Succ) > T * Prob(BB->Pred)
1391 * So T / (1 - T) = 2, Yielding T = 2/3
1392 * Also adding user specified branch bias, we have
1393 * T = (2/3)*(ProfileLikelyProb/50)
1394 * = (2*ProfileLikelyProb)/150)
1395 */
1396 return BranchProbability(2 * ProfileLikelyProb, 150);
1397 }
1398 }
1400}
1401
1402/// Checks to see if the layout candidate block \p Succ has a better layout
1403/// predecessor than \c BB. If yes, returns true.
1404/// \p SuccProb: The probability adjusted for only remaining blocks.
1405/// Only used for logging
1406/// \p RealSuccProb: The un-adjusted probability.
1407/// \p Chain: The chain that BB belongs to and Succ is being considered for.
1408/// \p BlockFilter: if non-null, the set of blocks that make up the loop being
1409/// considered
1410bool MachineBlockPlacement::hasBetterLayoutPredecessor(
1411 const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
1412 const BlockChain &SuccChain, BranchProbability SuccProb,
1413 BranchProbability RealSuccProb, const BlockChain &Chain,
1414 const BlockFilterSet *BlockFilter) {
1415
1416 // There isn't a better layout when there are no unscheduled predecessors.
1417 if (SuccChain.UnscheduledPredecessors == 0)
1418 return false;
1419
1420 // There are two basic scenarios here:
1421 // -------------------------------------
1422 // Case 1: triangular shape CFG (if-then):
1423 // BB
1424 // | \
1425 // | \
1426 // | Pred
1427 // | /
1428 // Succ
1429 // In this case, we are evaluating whether to select edge -> Succ, e.g.
1430 // set Succ as the layout successor of BB. Picking Succ as BB's
1431 // successor breaks the CFG constraints (FIXME: define these constraints).
1432 // With this layout, Pred BB
1433 // is forced to be outlined, so the overall cost will be cost of the
1434 // branch taken from BB to Pred, plus the cost of back taken branch
1435 // from Pred to Succ, as well as the additional cost associated
1436 // with the needed unconditional jump instruction from Pred To Succ.
1437
1438 // The cost of the topological order layout is the taken branch cost
1439 // from BB to Succ, so to make BB->Succ a viable candidate, the following
1440 // must hold:
1441 // 2 * freq(BB->Pred) * taken_branch_cost + unconditional_jump_cost
1442 // < freq(BB->Succ) * taken_branch_cost.
1443 // Ignoring unconditional jump cost, we get
1444 // freq(BB->Succ) > 2 * freq(BB->Pred), i.e.,
1445 // prob(BB->Succ) > 2 * prob(BB->Pred)
1446 //
1447 // When real profile data is available, we can precisely compute the
1448 // probability threshold that is needed for edge BB->Succ to be considered.
1449 // Without profile data, the heuristic requires the branch bias to be
1450 // a lot larger to make sure the signal is very strong (e.g. 80% default).
1451 // -----------------------------------------------------------------
1452 // Case 2: diamond like CFG (if-then-else):
1453 // S
1454 // / \
1455 // | \
1456 // BB Pred
1457 // \ /
1458 // Succ
1459 // ..
1460 //
1461 // The current block is BB and edge BB->Succ is now being evaluated.
1462 // Note that edge S->BB was previously already selected because
1463 // prob(S->BB) > prob(S->Pred).
1464 // At this point, 2 blocks can be placed after BB: Pred or Succ. If we
1465 // choose Pred, we will have a topological ordering as shown on the left
1466 // in the picture below. If we choose Succ, we have the solution as shown
1467 // on the right:
1468 //
1469 // topo-order:
1470 //
1471 // S----- ---S
1472 // | | | |
1473 // ---BB | | BB
1474 // | | | |
1475 // | Pred-- | Succ--
1476 // | | | |
1477 // ---Succ ---Pred--
1478 //
1479 // cost = freq(S->Pred) + freq(BB->Succ) cost = 2 * freq (S->Pred)
1480 // = freq(S->Pred) + freq(S->BB)
1481 //
1482 // If we have profile data (i.e, branch probabilities can be trusted), the
1483 // cost (number of taken branches) with layout S->BB->Succ->Pred is 2 *
1484 // freq(S->Pred) while the cost of topo order is freq(S->Pred) + freq(S->BB).
1485 // We know Prob(S->BB) > Prob(S->Pred), so freq(S->BB) > freq(S->Pred), which
1486 // means the cost of topological order is greater.
1487 // When profile data is not available, however, we need to be more
1488 // conservative. If the branch prediction is wrong, breaking the topo-order
1489 // will actually yield a layout with large cost. For this reason, we need
1490 // strong biased branch at block S with Prob(S->BB) in order to select
1491 // BB->Succ. This is equivalent to looking the CFG backward with backward
1492 // edge: Prob(Succ->BB) needs to >= HotProb in order to be selected (without
1493 // profile data).
1494 // --------------------------------------------------------------------------
1495 // Case 3: forked diamond
1496 // S
1497 // / \
1498 // / \
1499 // BB Pred
1500 // | \ / |
1501 // | \ / |
1502 // | X |
1503 // | / \ |
1504 // | / \ |
1505 // S1 S2
1506 //
1507 // The current block is BB and edge BB->S1 is now being evaluated.
1508 // As above S->BB was already selected because
1509 // prob(S->BB) > prob(S->Pred). Assume that prob(BB->S1) >= prob(BB->S2).
1510 //
1511 // topo-order:
1512 //
1513 // S-------| ---S
1514 // | | | |
1515 // ---BB | | BB
1516 // | | | |
1517 // | Pred----| | S1----
1518 // | | | |
1519 // --(S1 or S2) ---Pred--
1520 // |
1521 // S2
1522 //
1523 // topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2)
1524 // + min(freq(Pred->S1), freq(Pred->S2))
1525 // Non-topo-order cost:
1526 // non-topo-cost = 2 * freq(S->Pred) + freq(BB->S2).
1527 // To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2))
1528 // is 0. Then the non topo layout is better when
1529 // freq(S->Pred) < freq(BB->S1).
1530 // This is exactly what is checked below.
1531 // Note there are other shapes that apply (Pred may not be a single block,
1532 // but they all fit this general pattern.)
1534
1535 // Make sure that a hot successor doesn't have a globally more
1536 // important predecessor.
1537 BlockFrequency CandidateEdgeFreq = MBFI->getBlockFreq(BB) * RealSuccProb;
1538 bool BadCFGConflict = false;
1539
1540 for (MachineBasicBlock *Pred : Succ->predecessors()) {
1541 BlockChain *PredChain = BlockToChain[Pred];
1542 if (Pred == Succ || PredChain == &SuccChain ||
1543 (BlockFilter && !BlockFilter->count(Pred)) ||
1544 PredChain == &Chain || Pred != *std::prev(PredChain->end()) ||
1545 // This check is redundant except for look ahead. This function is
1546 // called for lookahead by isProfitableToTailDup when BB hasn't been
1547 // placed yet.
1548 (Pred == BB))
1549 continue;
1550 // Do backward checking.
1551 // For all cases above, we need a backward checking to filter out edges that
1552 // are not 'strongly' biased.
1553 // BB Pred
1554 // \ /
1555 // Succ
1556 // We select edge BB->Succ if
1557 // freq(BB->Succ) > freq(Succ) * HotProb
1558 // i.e. freq(BB->Succ) > freq(BB->Succ) * HotProb + freq(Pred->Succ) *
1559 // HotProb
1560 // i.e. freq((BB->Succ) * (1 - HotProb) > freq(Pred->Succ) * HotProb
1561 // Case 1 is covered too, because the first equation reduces to:
1562 // prob(BB->Succ) > HotProb. (freq(Succ) = freq(BB) for a triangle)
1563 BlockFrequency PredEdgeFreq =
1564 MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Succ);
1565 if (PredEdgeFreq * HotProb >= CandidateEdgeFreq * HotProb.getCompl()) {
1566 BadCFGConflict = true;
1567 break;
1568 }
1569 }
1570
1571 if (BadCFGConflict) {
1572 LLVM_DEBUG(dbgs() << " Not a candidate: " << getBlockName(Succ) << " -> "
1573 << SuccProb << " (prob) (non-cold CFG conflict)\n");
1574 return true;
1575 }
1576
1577 return false;
1578}
1579
1580/// Select the best successor for a block.
1581///
1582/// This looks across all successors of a particular block and attempts to
1583/// select the "best" one to be the layout successor. It only considers direct
1584/// successors which also pass the block filter. It will attempt to avoid
1585/// breaking CFG structure, but cave and break such structures in the case of
1586/// very hot successor edges.
1587///
1588/// \returns The best successor block found, or null if none are viable, along
1589/// with a boolean indicating if tail duplication is necessary.
1590MachineBlockPlacement::BlockAndTailDupResult
1591MachineBlockPlacement::selectBestSuccessor(
1592 const MachineBasicBlock *BB, const BlockChain &Chain,
1593 const BlockFilterSet *BlockFilter) {
1594 const BranchProbability HotProb(StaticLikelyProb, 100);
1595
1596 BlockAndTailDupResult BestSucc = { nullptr, false };
1597 auto BestProb = BranchProbability::getZero();
1598
1600 auto AdjustedSumProb =
1601 collectViableSuccessors(BB, Chain, BlockFilter, Successors);
1602
1603 LLVM_DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB)
1604 << "\n");
1605
1606 // if we already precomputed the best successor for BB, return that if still
1607 // applicable.
1608 auto FoundEdge = ComputedEdges.find(BB);
1609 if (FoundEdge != ComputedEdges.end()) {
1610 MachineBasicBlock *Succ = FoundEdge->second.BB;
1611 ComputedEdges.erase(FoundEdge);
1612 BlockChain *SuccChain = BlockToChain[Succ];
1613 if (BB->isSuccessor(Succ) && (!BlockFilter || BlockFilter->count(Succ)) &&
1614 SuccChain != &Chain && Succ == *SuccChain->begin())
1615 return FoundEdge->second;
1616 }
1617
1618 // if BB is part of a trellis, Use the trellis to determine the optimal
1619 // fallthrough edges
1620 if (isTrellis(BB, Successors, Chain, BlockFilter))
1621 return getBestTrellisSuccessor(BB, Successors, AdjustedSumProb, Chain,
1622 BlockFilter);
1623
1624 // For blocks with CFG violations, we may be able to lay them out anyway with
1625 // tail-duplication. We keep this vector so we can perform the probability
1626 // calculations the minimum number of times.
1628 DupCandidates;
1629 for (MachineBasicBlock *Succ : Successors) {
1630 auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);
1631 BranchProbability SuccProb =
1632 getAdjustedProbability(RealSuccProb, AdjustedSumProb);
1633
1634 BlockChain &SuccChain = *BlockToChain[Succ];
1635 // Skip the edge \c BB->Succ if block \c Succ has a better layout
1636 // predecessor that yields lower global cost.
1637 if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb,
1638 Chain, BlockFilter)) {
1639 // If tail duplication would make Succ profitable, place it.
1640 if (allowTailDupPlacement() && shouldTailDuplicate(Succ))
1641 DupCandidates.emplace_back(SuccProb, Succ);
1642 continue;
1643 }
1644
1645 LLVM_DEBUG(
1646 dbgs() << " Candidate: " << getBlockName(Succ)
1647 << ", probability: " << SuccProb
1648 << (SuccChain.UnscheduledPredecessors != 0 ? " (CFG break)" : "")
1649 << "\n");
1650
1651 if (BestSucc.BB && BestProb >= SuccProb) {
1652 LLVM_DEBUG(dbgs() << " Not the best candidate, continuing\n");
1653 continue;
1654 }
1655
1656 LLVM_DEBUG(dbgs() << " Setting it as best candidate\n");
1657 BestSucc.BB = Succ;
1658 BestProb = SuccProb;
1659 }
1660 // Handle the tail duplication candidates in order of decreasing probability.
1661 // Stop at the first one that is profitable. Also stop if they are less
1662 // profitable than BestSucc. Position is important because we preserve it and
1663 // prefer first best match. Here we aren't comparing in order, so we capture
1664 // the position instead.
1665 llvm::stable_sort(DupCandidates,
1666 [](std::tuple<BranchProbability, MachineBasicBlock *> L,
1667 std::tuple<BranchProbability, MachineBasicBlock *> R) {
1668 return std::get<0>(L) > std::get<0>(R);
1669 });
1670 for (auto &Tup : DupCandidates) {
1671 BranchProbability DupProb;
1672 MachineBasicBlock *Succ;
1673 std::tie(DupProb, Succ) = Tup;
1674 if (DupProb < BestProb)
1675 break;
1676 if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter)
1677 && (isProfitableToTailDup(BB, Succ, BestProb, Chain, BlockFilter))) {
1678 LLVM_DEBUG(dbgs() << " Candidate: " << getBlockName(Succ)
1679 << ", probability: " << DupProb
1680 << " (Tail Duplicate)\n");
1681 BestSucc.BB = Succ;
1682 BestSucc.ShouldTailDup = true;
1683 break;
1684 }
1685 }
1686
1687 if (BestSucc.BB)
1688 LLVM_DEBUG(dbgs() << " Selected: " << getBlockName(BestSucc.BB) << "\n");
1689
1690 return BestSucc;
1691}
1692
1693/// Select the best block from a worklist.
1694///
1695/// This looks through the provided worklist as a list of candidate basic
1696/// blocks and select the most profitable one to place. The definition of
1697/// profitable only really makes sense in the context of a loop. This returns
1698/// the most frequently visited block in the worklist, which in the case of
1699/// a loop, is the one most desirable to be physically close to the rest of the
1700/// loop body in order to improve i-cache behavior.
1701///
1702/// \returns The best block found, or null if none are viable.
1703MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
1704 const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) {
1705 // Once we need to walk the worklist looking for a candidate, cleanup the
1706 // worklist of already placed entries.
1707 // FIXME: If this shows up on profiles, it could be folded (at the cost of
1708 // some code complexity) into the loop below.
1709 llvm::erase_if(WorkList, [&](MachineBasicBlock *BB) {
1710 return BlockToChain.lookup(BB) == &Chain;
1711 });
1712
1713 if (WorkList.empty())
1714 return nullptr;
1715
1716 bool IsEHPad = WorkList[0]->isEHPad();
1717
1718 MachineBasicBlock *BestBlock = nullptr;
1719 BlockFrequency BestFreq;
1720 for (MachineBasicBlock *MBB : WorkList) {
1721 assert(MBB->isEHPad() == IsEHPad &&
1722 "EHPad mismatch between block and work list.");
1723
1724 BlockChain &SuccChain = *BlockToChain[MBB];
1725 if (&SuccChain == &Chain)
1726 continue;
1727
1728 assert(SuccChain.UnscheduledPredecessors == 0 &&
1729 "Found CFG-violating block");
1730
1731 BlockFrequency CandidateFreq = MBFI->getBlockFreq(MBB);
1732 LLVM_DEBUG(dbgs() << " " << getBlockName(MBB) << " -> "
1733 << printBlockFreq(MBFI->getMBFI(), CandidateFreq)
1734 << " (freq)\n");
1735
1736 // For ehpad, we layout the least probable first as to avoid jumping back
1737 // from least probable landingpads to more probable ones.
1738 //
1739 // FIXME: Using probability is probably (!) not the best way to achieve
1740 // this. We should probably have a more principled approach to layout
1741 // cleanup code.
1742 //
1743 // The goal is to get:
1744 //
1745 // +--------------------------+
1746 // | V
1747 // InnerLp -> InnerCleanup OuterLp -> OuterCleanup -> Resume
1748 //
1749 // Rather than:
1750 //
1751 // +-------------------------------------+
1752 // V |
1753 // OuterLp -> OuterCleanup -> Resume InnerLp -> InnerCleanup
1754 if (BestBlock && (IsEHPad ^ (BestFreq >= CandidateFreq)))
1755 continue;
1756
1757 BestBlock = MBB;
1758 BestFreq = CandidateFreq;
1759 }
1760
1761 return BestBlock;
1762}
1763
1764/// Retrieve the first unplaced basic block.
1765///
1766/// This routine is called when we are unable to use the CFG to walk through
1767/// all of the basic blocks and form a chain due to unnatural loops in the CFG.
1768/// We walk through the function's blocks in order, starting from the
1769/// LastUnplacedBlockIt. We update this iterator on each call to avoid
1770/// re-scanning the entire sequence on repeated calls to this routine.
1771MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
1772 const BlockChain &PlacedChain,
1773 MachineFunction::iterator &PrevUnplacedBlockIt,
1774 const BlockFilterSet *BlockFilter) {
1775 for (MachineFunction::iterator I = PrevUnplacedBlockIt, E = F->end(); I != E;
1776 ++I) {
1777 if (BlockFilter && !BlockFilter->count(&*I))
1778 continue;
1779 if (BlockToChain[&*I] != &PlacedChain) {
1780 PrevUnplacedBlockIt = I;
1781 // Now select the head of the chain to which the unplaced block belongs
1782 // as the block to place. This will force the entire chain to be placed,
1783 // and satisfies the requirements of merging chains.
1784 return *BlockToChain[&*I]->begin();
1785 }
1786 }
1787 return nullptr;
1788}
1789
1790void MachineBlockPlacement::fillWorkLists(
1791 const MachineBasicBlock *MBB,
1792 SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
1793 const BlockFilterSet *BlockFilter = nullptr) {
1794 BlockChain &Chain = *BlockToChain[MBB];
1795 if (!UpdatedPreds.insert(&Chain).second)
1796 return;
1797
1798 assert(
1799 Chain.UnscheduledPredecessors == 0 &&
1800 "Attempting to place block with unscheduled predecessors in worklist.");
1801 for (MachineBasicBlock *ChainBB : Chain) {
1802 assert(BlockToChain[ChainBB] == &Chain &&
1803 "Block in chain doesn't match BlockToChain map.");
1804 for (MachineBasicBlock *Pred : ChainBB->predecessors()) {
1805 if (BlockFilter && !BlockFilter->count(Pred))
1806 continue;
1807 if (BlockToChain[Pred] == &Chain)
1808 continue;
1809 ++Chain.UnscheduledPredecessors;
1810 }
1811 }
1812
1813 if (Chain.UnscheduledPredecessors != 0)
1814 return;
1815
1816 MachineBasicBlock *BB = *Chain.begin();
1817 if (BB->isEHPad())
1818 EHPadWorkList.push_back(BB);
1819 else
1820 BlockWorkList.push_back(BB);
1821}
1822
1823void MachineBlockPlacement::buildChain(
1824 const MachineBasicBlock *HeadBB, BlockChain &Chain,
1825 BlockFilterSet *BlockFilter) {
1826 assert(HeadBB && "BB must not be null.\n");
1827 assert(BlockToChain[HeadBB] == &Chain && "BlockToChainMap mis-match.\n");
1828 MachineFunction::iterator PrevUnplacedBlockIt = F->begin();
1829
1830 const MachineBasicBlock *LoopHeaderBB = HeadBB;
1831 markChainSuccessors(Chain, LoopHeaderBB, BlockFilter);
1832 MachineBasicBlock *BB = *std::prev(Chain.end());
1833 while (true) {
1834 assert(BB && "null block found at end of chain in loop.");
1835 assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match in loop.");
1836 assert(*std::prev(Chain.end()) == BB && "BB Not found at end of chain.");
1837
1838
1839 // Look for the best viable successor if there is one to place immediately
1840 // after this block.
1841 auto Result = selectBestSuccessor(BB, Chain, BlockFilter);
1842 MachineBasicBlock* BestSucc = Result.BB;
1843 bool ShouldTailDup = Result.ShouldTailDup;
1844 if (allowTailDupPlacement())
1845 ShouldTailDup |= (BestSucc && canTailDuplicateUnplacedPreds(BB, BestSucc,
1846 Chain,
1847 BlockFilter));
1848
1849 // If an immediate successor isn't available, look for the best viable
1850 // block among those we've identified as not violating the loop's CFG at
1851 // this point. This won't be a fallthrough, but it will increase locality.
1852 if (!BestSucc)
1853 BestSucc = selectBestCandidateBlock(Chain, BlockWorkList);
1854 if (!BestSucc)
1855 BestSucc = selectBestCandidateBlock(Chain, EHPadWorkList);
1856
1857 if (!BestSucc) {
1858 BestSucc = getFirstUnplacedBlock(Chain, PrevUnplacedBlockIt, BlockFilter);
1859 if (!BestSucc)
1860 break;
1861
1862 LLVM_DEBUG(dbgs() << "Unnatural loop CFG detected, forcibly merging the "
1863 "layout successor until the CFG reduces\n");
1864 }
1865
1866 // Placement may have changed tail duplication opportunities.
1867 // Check for that now.
1868 if (allowTailDupPlacement() && BestSucc && ShouldTailDup) {
1869 repeatedlyTailDuplicateBlock(BestSucc, BB, LoopHeaderBB, Chain,
1870 BlockFilter, PrevUnplacedBlockIt);
1871 // If the chosen successor was duplicated into BB, don't bother laying
1872 // it out, just go round the loop again with BB as the chain end.
1873 if (!BB->isSuccessor(BestSucc))
1874 continue;
1875 }
1876
1877 // Place this block, updating the datastructures to reflect its placement.
1878 BlockChain &SuccChain = *BlockToChain[BestSucc];
1879 // Zero out UnscheduledPredecessors for the successor we're about to merge in case
1880 // we selected a successor that didn't fit naturally into the CFG.
1881 SuccChain.UnscheduledPredecessors = 0;
1882 LLVM_DEBUG(dbgs() << "Merging from " << getBlockName(BB) << " to "
1883 << getBlockName(BestSucc) << "\n");
1884 markChainSuccessors(SuccChain, LoopHeaderBB, BlockFilter);
1885 Chain.merge(BestSucc, &SuccChain);
1886 BB = *std::prev(Chain.end());
1887 }
1888
1889 LLVM_DEBUG(dbgs() << "Finished forming chain for header block "
1890 << getBlockName(*Chain.begin()) << "\n");
1891}
1892
1893// If bottom of block BB has only one successor OldTop, in most cases it is
1894// profitable to move it before OldTop, except the following case:
1895//
1896// -->OldTop<-
1897// | . |
1898// | . |
1899// | . |
1900// ---Pred |
1901// | |
1902// BB-----
1903//
1904// If BB is moved before OldTop, Pred needs a taken branch to BB, and it can't
1905// layout the other successor below it, so it can't reduce taken branch.
1906// In this case we keep its original layout.
1907bool
1908MachineBlockPlacement::canMoveBottomBlockToTop(
1909 const MachineBasicBlock *BottomBlock,
1910 const MachineBasicBlock *OldTop) {
1911 if (BottomBlock->pred_size() != 1)
1912 return true;
1913 MachineBasicBlock *Pred = *BottomBlock->pred_begin();
1914 if (Pred->succ_size() != 2)
1915 return true;
1916
1917 MachineBasicBlock *OtherBB = *Pred->succ_begin();
1918 if (OtherBB == BottomBlock)
1919 OtherBB = *Pred->succ_rbegin();
1920 if (OtherBB == OldTop)
1921 return false;
1922
1923 return true;
1924}
1925
1926// Find out the possible fall through frequence to the top of a loop.
1928MachineBlockPlacement::TopFallThroughFreq(
1929 const MachineBasicBlock *Top,
1930 const BlockFilterSet &LoopBlockSet) {
1931 BlockFrequency MaxFreq = BlockFrequency(0);
1932 for (MachineBasicBlock *Pred : Top->predecessors()) {
1933 BlockChain *PredChain = BlockToChain[Pred];
1934 if (!LoopBlockSet.count(Pred) &&
1935 (!PredChain || Pred == *std::prev(PredChain->end()))) {
1936 // Found a Pred block can be placed before Top.
1937 // Check if Top is the best successor of Pred.
1938 auto TopProb = MBPI->getEdgeProbability(Pred, Top);
1939 bool TopOK = true;
1940 for (MachineBasicBlock *Succ : Pred->successors()) {
1941 auto SuccProb = MBPI->getEdgeProbability(Pred, Succ);
1942 BlockChain *SuccChain = BlockToChain[Succ];
1943 // Check if Succ can be placed after Pred.
1944 // Succ should not be in any chain, or it is the head of some chain.
1945 if (!LoopBlockSet.count(Succ) && (SuccProb > TopProb) &&
1946 (!SuccChain || Succ == *SuccChain->begin())) {
1947 TopOK = false;
1948 break;
1949 }
1950 }
1951 if (TopOK) {
1952 BlockFrequency EdgeFreq = MBFI->getBlockFreq(Pred) *
1953 MBPI->getEdgeProbability(Pred, Top);
1954 if (EdgeFreq > MaxFreq)
1955 MaxFreq = EdgeFreq;
1956 }
1957 }
1958 }
1959 return MaxFreq;
1960}
1961
1962// Compute the fall through gains when move NewTop before OldTop.
1963//
1964// In following diagram, edges marked as "-" are reduced fallthrough, edges
1965// marked as "+" are increased fallthrough, this function computes
1966//
1967// SUM(increased fallthrough) - SUM(decreased fallthrough)
1968//
1969// |
1970// | -
1971// V
1972// --->OldTop
1973// | .
1974// | .
1975// +| . +
1976// | Pred --->
1977// | |-
1978// | V
1979// --- NewTop <---
1980// |-
1981// V
1982//
1984MachineBlockPlacement::FallThroughGains(
1985 const MachineBasicBlock *NewTop,
1986 const MachineBasicBlock *OldTop,
1987 const MachineBasicBlock *ExitBB,
1988 const BlockFilterSet &LoopBlockSet) {
1989 BlockFrequency FallThrough2Top = TopFallThroughFreq(OldTop, LoopBlockSet);
1990 BlockFrequency FallThrough2Exit = BlockFrequency(0);
1991 if (ExitBB)
1992 FallThrough2Exit = MBFI->getBlockFreq(NewTop) *
1993 MBPI->getEdgeProbability(NewTop, ExitBB);
1994 BlockFrequency BackEdgeFreq = MBFI->getBlockFreq(NewTop) *
1995 MBPI->getEdgeProbability(NewTop, OldTop);
1996
1997 // Find the best Pred of NewTop.
1998 MachineBasicBlock *BestPred = nullptr;
1999 BlockFrequency FallThroughFromPred = BlockFrequency(0);
2000 for (MachineBasicBlock *Pred : NewTop->predecessors()) {
2001 if (!LoopBlockSet.count(Pred))
2002 continue;
2003 BlockChain *PredChain = BlockToChain[Pred];
2004 if (!PredChain || Pred == *std::prev(PredChain->end())) {
2005 BlockFrequency EdgeFreq =
2006 MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, NewTop);
2007 if (EdgeFreq > FallThroughFromPred) {
2008 FallThroughFromPred = EdgeFreq;
2009 BestPred = Pred;
2010 }
2011 }
2012 }
2013
2014 // If NewTop is not placed after Pred, another successor can be placed
2015 // after Pred.
2016 BlockFrequency NewFreq = BlockFrequency(0);
2017 if (BestPred) {
2018 for (MachineBasicBlock *Succ : BestPred->successors()) {
2019 if ((Succ == NewTop) || (Succ == BestPred) || !LoopBlockSet.count(Succ))
2020 continue;
2021 if (ComputedEdges.contains(Succ))
2022 continue;
2023 BlockChain *SuccChain = BlockToChain[Succ];
2024 if ((SuccChain && (Succ != *SuccChain->begin())) ||
2025 (SuccChain == BlockToChain[BestPred]))
2026 continue;
2027 BlockFrequency EdgeFreq = MBFI->getBlockFreq(BestPred) *
2028 MBPI->getEdgeProbability(BestPred, Succ);
2029 if (EdgeFreq > NewFreq)
2030 NewFreq = EdgeFreq;
2031 }
2032 BlockFrequency OrigEdgeFreq = MBFI->getBlockFreq(BestPred) *
2033 MBPI->getEdgeProbability(BestPred, NewTop);
2034 if (NewFreq > OrigEdgeFreq) {
2035 // If NewTop is not the best successor of Pred, then Pred doesn't
2036 // fallthrough to NewTop. So there is no FallThroughFromPred and
2037 // NewFreq.
2038 NewFreq = BlockFrequency(0);
2039 FallThroughFromPred = BlockFrequency(0);
2040 }
2041 }
2042
2044 BlockFrequency Gains = BackEdgeFreq + NewFreq;
2045 BlockFrequency Lost =
2046 FallThrough2Top + FallThrough2Exit + FallThroughFromPred;
2047 if (Gains > Lost)
2048 Result = Gains - Lost;
2049 return Result;
2050}
2051
2052/// Helper function of findBestLoopTop. Find the best loop top block
2053/// from predecessors of old top.
2054///
2055/// Look for a block which is strictly better than the old top for laying
2056/// out before the old top of the loop. This looks for only two patterns:
2057///
2058/// 1. a block has only one successor, the old loop top
2059///
2060/// Because such a block will always result in an unconditional jump,
2061/// rotating it in front of the old top is always profitable.
2062///
2063/// 2. a block has two successors, one is old top, another is exit
2064/// and it has more than one predecessors
2065///
2066/// If it is below one of its predecessors P, only P can fall through to
2067/// it, all other predecessors need a jump to it, and another conditional
2068/// jump to loop header. If it is moved before loop header, all its
2069/// predecessors jump to it, then fall through to loop header. So all its
2070/// predecessors except P can reduce one taken branch.
2071/// At the same time, move it before old top increases the taken branch
2072/// to loop exit block, so the reduced taken branch will be compared with
2073/// the increased taken branch to the loop exit block.
2075MachineBlockPlacement::findBestLoopTopHelper(
2076 MachineBasicBlock *OldTop,
2077 const MachineLoop &L,
2078 const BlockFilterSet &LoopBlockSet) {
2079 // Check that the header hasn't been fused with a preheader block due to
2080 // crazy branches. If it has, we need to start with the header at the top to
2081 // prevent pulling the preheader into the loop body.
2082 BlockChain &HeaderChain = *BlockToChain[OldTop];
2083 if (!LoopBlockSet.count(*HeaderChain.begin()))
2084 return OldTop;
2085 if (OldTop != *HeaderChain.begin())
2086 return OldTop;
2087
2088 LLVM_DEBUG(dbgs() << "Finding best loop top for: " << getBlockName(OldTop)
2089 << "\n");
2090
2091 BlockFrequency BestGains = BlockFrequency(0);
2092 MachineBasicBlock *BestPred = nullptr;
2093 for (MachineBasicBlock *Pred : OldTop->predecessors()) {
2094 if (!LoopBlockSet.count(Pred))
2095 continue;
2096 if (Pred == L.getHeader())
2097 continue;
2098 LLVM_DEBUG(dbgs() << " old top pred: " << getBlockName(Pred) << ", has "
2099 << Pred->succ_size() << " successors, "
2100 << printBlockFreq(MBFI->getMBFI(), *Pred) << " freq\n");
2101 if (Pred->succ_size() > 2)
2102 continue;
2103
2104 MachineBasicBlock *OtherBB = nullptr;
2105 if (Pred->succ_size() == 2) {
2106 OtherBB = *Pred->succ_begin();
2107 if (OtherBB == OldTop)
2108 OtherBB = *Pred->succ_rbegin();
2109 }
2110
2111 if (!canMoveBottomBlockToTop(Pred, OldTop))
2112 continue;
2113
2114 BlockFrequency Gains = FallThroughGains(Pred, OldTop, OtherBB,
2115 LoopBlockSet);
2116 if ((Gains > BlockFrequency(0)) &&
2117 (Gains > BestGains ||
2118 ((Gains == BestGains) && Pred->isLayoutSuccessor(OldTop)))) {
2119 BestPred = Pred;
2120 BestGains = Gains;
2121 }
2122 }
2123
2124 // If no direct predecessor is fine, just use the loop header.
2125 if (!BestPred) {
2126 LLVM_DEBUG(dbgs() << " final top unchanged\n");
2127 return OldTop;
2128 }
2129
2130 // Walk backwards through any straight line of predecessors.
2131 while (BestPred->pred_size() == 1 &&
2132 (*BestPred->pred_begin())->succ_size() == 1 &&
2133 *BestPred->pred_begin() != L.getHeader())
2134 BestPred = *BestPred->pred_begin();
2135
2136 LLVM_DEBUG(dbgs() << " final top: " << getBlockName(BestPred) << "\n");
2137 return BestPred;
2138}
2139
2140/// Find the best loop top block for layout.
2141///
2142/// This function iteratively calls findBestLoopTopHelper, until no new better
2143/// BB can be found.
2145MachineBlockPlacement::findBestLoopTop(const MachineLoop &L,
2146 const BlockFilterSet &LoopBlockSet) {
2147 // Placing the latch block before the header may introduce an extra branch
2148 // that skips this block the first time the loop is executed, which we want
2149 // to avoid when optimising for size.
2150 // FIXME: in theory there is a case that does not introduce a new branch,
2151 // i.e. when the layout predecessor does not fallthrough to the loop header.
2152 // In practice this never happens though: there always seems to be a preheader
2153 // that can fallthrough and that is also placed before the header.
2154 bool OptForSize = F->getFunction().hasOptSize() ||
2155 llvm::shouldOptimizeForSize(L.getHeader(), PSI, MBFI.get());
2156 if (OptForSize)
2157 return L.getHeader();
2158
2159 MachineBasicBlock *OldTop = nullptr;
2160 MachineBasicBlock *NewTop = L.getHeader();
2161 while (NewTop != OldTop) {
2162 OldTop = NewTop;
2163 NewTop = findBestLoopTopHelper(OldTop, L, LoopBlockSet);
2164 if (NewTop != OldTop)
2165 ComputedEdges[NewTop] = { OldTop, false };
2166 }
2167 return NewTop;
2168}
2169
2170/// Find the best loop exiting block for layout.
2171///
2172/// This routine implements the logic to analyze the loop looking for the best
2173/// block to layout at the top of the loop. Typically this is done to maximize
2174/// fallthrough opportunities.
2176MachineBlockPlacement::findBestLoopExit(const MachineLoop &L,
2177 const BlockFilterSet &LoopBlockSet,
2178 BlockFrequency &ExitFreq) {
2179 // We don't want to layout the loop linearly in all cases. If the loop header
2180 // is just a normal basic block in the loop, we want to look for what block
2181 // within the loop is the best one to layout at the top. However, if the loop
2182 // header has be pre-merged into a chain due to predecessors not having
2183 // analyzable branches, *and* the predecessor it is merged with is *not* part
2184 // of the loop, rotating the header into the middle of the loop will create
2185 // a non-contiguous range of blocks which is Very Bad. So start with the
2186 // header and only rotate if safe.
2187 BlockChain &HeaderChain = *BlockToChain[L.getHeader()];
2188 if (!LoopBlockSet.count(*HeaderChain.begin()))
2189 return nullptr;
2190
2191 BlockFrequency BestExitEdgeFreq;
2192 unsigned BestExitLoopDepth = 0;
2193 MachineBasicBlock *ExitingBB = nullptr;
2194 // If there are exits to outer loops, loop rotation can severely limit
2195 // fallthrough opportunities unless it selects such an exit. Keep a set of
2196 // blocks where rotating to exit with that block will reach an outer loop.
2197 SmallPtrSet<MachineBasicBlock *, 4> BlocksExitingToOuterLoop;
2198
2199 LLVM_DEBUG(dbgs() << "Finding best loop exit for: "
2200 << getBlockName(L.getHeader()) << "\n");
2201 for (MachineBasicBlock *MBB : L.getBlocks()) {
2202 BlockChain &Chain = *BlockToChain[MBB];
2203 // Ensure that this block is at the end of a chain; otherwise it could be
2204 // mid-way through an inner loop or a successor of an unanalyzable branch.
2205 if (MBB != *std::prev(Chain.end()))
2206 continue;
2207
2208 // Now walk the successors. We need to establish whether this has a viable
2209 // exiting successor and whether it has a viable non-exiting successor.
2210 // We store the old exiting state and restore it if a viable looping
2211 // successor isn't found.
2212 MachineBasicBlock *OldExitingBB = ExitingBB;
2213 BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq;
2214 bool HasLoopingSucc = false;
2215 for (MachineBasicBlock *Succ : MBB->successors()) {
2216 if (Succ->isEHPad())
2217 continue;
2218 if (Succ == MBB)
2219 continue;
2220 BlockChain &SuccChain = *BlockToChain[Succ];
2221 // Don't split chains, either this chain or the successor's chain.
2222 if (&Chain == &SuccChain) {
2223 LLVM_DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
2224 << getBlockName(Succ) << " (chain conflict)\n");
2225 continue;
2226 }
2227
2228 auto SuccProb = MBPI->getEdgeProbability(MBB, Succ);
2229 if (LoopBlockSet.count(Succ)) {
2230 LLVM_DEBUG(dbgs() << " looping: " << getBlockName(MBB) << " -> "
2231 << getBlockName(Succ) << " (" << SuccProb << ")\n");
2232 HasLoopingSucc = true;
2233 continue;
2234 }
2235
2236 unsigned SuccLoopDepth = 0;
2237 if (MachineLoop *ExitLoop = MLI->getLoopFor(Succ)) {
2238 SuccLoopDepth = ExitLoop->getLoopDepth();
2239 if (ExitLoop->contains(&L))
2240 BlocksExitingToOuterLoop.insert(MBB);
2241 }
2242
2243 BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(MBB) * SuccProb;
2244 LLVM_DEBUG(
2245 dbgs() << " exiting: " << getBlockName(MBB) << " -> "
2246 << getBlockName(Succ) << " [L:" << SuccLoopDepth << "] ("
2247 << printBlockFreq(MBFI->getMBFI(), ExitEdgeFreq) << ")\n");
2248 // Note that we bias this toward an existing layout successor to retain
2249 // incoming order in the absence of better information. The exit must have
2250 // a frequency higher than the current exit before we consider breaking
2251 // the layout.
2252 BranchProbability Bias(100 - ExitBlockBias, 100);
2253 if (!ExitingBB || SuccLoopDepth > BestExitLoopDepth ||
2254 ExitEdgeFreq > BestExitEdgeFreq ||
2255 (MBB->isLayoutSuccessor(Succ) &&
2256 !(ExitEdgeFreq < BestExitEdgeFreq * Bias))) {
2257 BestExitEdgeFreq = ExitEdgeFreq;
2258 ExitingBB = MBB;
2259 }
2260 }
2261
2262 if (!HasLoopingSucc) {
2263 // Restore the old exiting state, no viable looping successor was found.
2264 ExitingBB = OldExitingBB;
2265 BestExitEdgeFreq = OldBestExitEdgeFreq;
2266 }
2267 }
2268 // Without a candidate exiting block or with only a single block in the
2269 // loop, just use the loop header to layout the loop.
2270 if (!ExitingBB) {
2271 LLVM_DEBUG(
2272 dbgs() << " No other candidate exit blocks, using loop header\n");
2273 return nullptr;
2274 }
2275 if (L.getNumBlocks() == 1) {
2276 LLVM_DEBUG(dbgs() << " Loop has 1 block, using loop header as exit\n");
2277 return nullptr;
2278 }
2279
2280 // Also, if we have exit blocks which lead to outer loops but didn't select
2281 // one of them as the exiting block we are rotating toward, disable loop
2282 // rotation altogether.
2283 if (!BlocksExitingToOuterLoop.empty() &&
2284 !BlocksExitingToOuterLoop.count(ExitingBB))
2285 return nullptr;
2286
2287 LLVM_DEBUG(dbgs() << " Best exiting block: " << getBlockName(ExitingBB)
2288 << "\n");
2289 ExitFreq = BestExitEdgeFreq;
2290 return ExitingBB;
2291}
2292
2293/// Check if there is a fallthrough to loop header Top.
2294///
2295/// 1. Look for a Pred that can be layout before Top.
2296/// 2. Check if Top is the most possible successor of Pred.
2297bool
2298MachineBlockPlacement::hasViableTopFallthrough(
2299 const MachineBasicBlock *Top,
2300 const BlockFilterSet &LoopBlockSet) {
2301 for (MachineBasicBlock *Pred : Top->predecessors()) {
2302 BlockChain *PredChain = BlockToChain[Pred];
2303 if (!LoopBlockSet.count(Pred) &&
2304 (!PredChain || Pred == *std::prev(PredChain->end()))) {
2305 // Found a Pred block can be placed before Top.
2306 // Check if Top is the best successor of Pred.
2307 auto TopProb = MBPI->getEdgeProbability(Pred, Top);
2308 bool TopOK = true;
2309 for (MachineBasicBlock *Succ : Pred->successors()) {
2310 auto SuccProb = MBPI->getEdgeProbability(Pred, Succ);
2311 BlockChain *SuccChain = BlockToChain[Succ];
2312 // Check if Succ can be placed after Pred.
2313 // Succ should not be in any chain, or it is the head of some chain.
2314 if ((!SuccChain || Succ == *SuccChain->begin()) && SuccProb > TopProb) {
2315 TopOK = false;
2316 break;
2317 }
2318 }
2319 if (TopOK)
2320 return true;
2321 }
2322 }
2323 return false;
2324}
2325
2326/// Attempt to rotate an exiting block to the bottom of the loop.
2327///
2328/// Once we have built a chain, try to rotate it to line up the hot exit block
2329/// with fallthrough out of the loop if doing so doesn't introduce unnecessary
2330/// branches. For example, if the loop has fallthrough into its header and out
2331/// of its bottom already, don't rotate it.
2332void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
2333 const MachineBasicBlock *ExitingBB,
2334 BlockFrequency ExitFreq,
2335 const BlockFilterSet &LoopBlockSet) {
2336 if (!ExitingBB)
2337 return;
2338
2339 MachineBasicBlock *Top = *LoopChain.begin();
2340 MachineBasicBlock *Bottom = *std::prev(LoopChain.end());
2341
2342 // If ExitingBB is already the last one in a chain then nothing to do.
2343 if (Bottom == ExitingBB)
2344 return;
2345
2346 // The entry block should always be the first BB in a function.
2347 if (Top->isEntryBlock())
2348 return;
2349
2350 bool ViableTopFallthrough = hasViableTopFallthrough(Top, LoopBlockSet);
2351
2352 // If the header has viable fallthrough, check whether the current loop
2353 // bottom is a viable exiting block. If so, bail out as rotating will
2354 // introduce an unnecessary branch.
2355 if (ViableTopFallthrough) {
2356 for (MachineBasicBlock *Succ : Bottom->successors()) {
2357 BlockChain *SuccChain = BlockToChain[Succ];
2358 if (!LoopBlockSet.count(Succ) &&
2359 (!SuccChain || Succ == *SuccChain->begin()))
2360 return;
2361 }
2362
2363 // Rotate will destroy the top fallthrough, we need to ensure the new exit
2364 // frequency is larger than top fallthrough.
2365 BlockFrequency FallThrough2Top = TopFallThroughFreq(Top, LoopBlockSet);
2366 if (FallThrough2Top >= ExitFreq)
2367 return;
2368 }
2369
2370 BlockChain::iterator ExitIt = llvm::find(LoopChain, ExitingBB);
2371 if (ExitIt == LoopChain.end())
2372 return;
2373
2374 // Rotating a loop exit to the bottom when there is a fallthrough to top
2375 // trades the entry fallthrough for an exit fallthrough.
2376 // If there is no bottom->top edge, but the chosen exit block does have
2377 // a fallthrough, we break that fallthrough for nothing in return.
2378
2379 // Let's consider an example. We have a built chain of basic blocks
2380 // B1, B2, ..., Bn, where Bk is a ExitingBB - chosen exit block.
2381 // By doing a rotation we get
2382 // Bk+1, ..., Bn, B1, ..., Bk
2383 // Break of fallthrough to B1 is compensated by a fallthrough from Bk.
2384 // If we had a fallthrough Bk -> Bk+1 it is broken now.
2385 // It might be compensated by fallthrough Bn -> B1.
2386 // So we have a condition to avoid creation of extra branch by loop rotation.
2387 // All below must be true to avoid loop rotation:
2388 // If there is a fallthrough to top (B1)
2389 // There was fallthrough from chosen exit block (Bk) to next one (Bk+1)
2390 // There is no fallthrough from bottom (Bn) to top (B1).
2391 // Please note that there is no exit fallthrough from Bn because we checked it
2392 // above.
2393 if (ViableTopFallthrough) {
2394 assert(std::next(ExitIt) != LoopChain.end() &&
2395 "Exit should not be last BB");
2396 MachineBasicBlock *NextBlockInChain = *std::next(ExitIt);
2397 if (ExitingBB->isSuccessor(NextBlockInChain))
2398 if (!Bottom->isSuccessor(Top))
2399 return;
2400 }
2401
2402 LLVM_DEBUG(dbgs() << "Rotating loop to put exit " << getBlockName(ExitingBB)
2403 << " at bottom\n");
2404 std::rotate(LoopChain.begin(), std::next(ExitIt), LoopChain.end());
2405}
2406
2407/// Attempt to rotate a loop based on profile data to reduce branch cost.
2408///
2409/// With profile data, we can determine the cost in terms of missed fall through
2410/// opportunities when rotating a loop chain and select the best rotation.
2411/// Basically, there are three kinds of cost to consider for each rotation:
2412/// 1. The possibly missed fall through edge (if it exists) from BB out of
2413/// the loop to the loop header.
2414/// 2. The possibly missed fall through edges (if they exist) from the loop
2415/// exits to BB out of the loop.
2416/// 3. The missed fall through edge (if it exists) from the last BB to the
2417/// first BB in the loop chain.
2418/// Therefore, the cost for a given rotation is the sum of costs listed above.
2419/// We select the best rotation with the smallest cost.
2420void MachineBlockPlacement::rotateLoopWithProfile(
2421 BlockChain &LoopChain, const MachineLoop &L,
2422 const BlockFilterSet &LoopBlockSet) {
2423 auto RotationPos = LoopChain.end();
2424 MachineBasicBlock *ChainHeaderBB = *LoopChain.begin();
2425
2426 // The entry block should always be the first BB in a function.
2427 if (ChainHeaderBB->isEntryBlock())
2428 return;
2429
2430 BlockFrequency SmallestRotationCost = BlockFrequency::max();
2431
2432 // A utility lambda that scales up a block frequency by dividing it by a
2433 // branch probability which is the reciprocal of the scale.
2434 auto ScaleBlockFrequency = [](BlockFrequency Freq,
2435 unsigned Scale) -> BlockFrequency {
2436 if (Scale == 0)
2437 return BlockFrequency(0);
2438 // Use operator / between BlockFrequency and BranchProbability to implement
2439 // saturating multiplication.
2440 return Freq / BranchProbability(1, Scale);
2441 };
2442
2443 // Compute the cost of the missed fall-through edge to the loop header if the
2444 // chain head is not the loop header. As we only consider natural loops with
2445 // single header, this computation can be done only once.
2446 BlockFrequency HeaderFallThroughCost(0);
2447 for (auto *Pred : ChainHeaderBB->predecessors()) {
2448 BlockChain *PredChain = BlockToChain[Pred];
2449 if (!LoopBlockSet.count(Pred) &&
2450 (!PredChain || Pred == *std::prev(PredChain->end()))) {
2451 auto EdgeFreq = MBFI->getBlockFreq(Pred) *
2452 MBPI->getEdgeProbability(Pred, ChainHeaderBB);
2453 auto FallThruCost = ScaleBlockFrequency(EdgeFreq, MisfetchCost);
2454 // If the predecessor has only an unconditional jump to the header, we
2455 // need to consider the cost of this jump.
2456 if (Pred->succ_size() == 1)
2457 FallThruCost += ScaleBlockFrequency(EdgeFreq, JumpInstCost);
2458 HeaderFallThroughCost = std::max(HeaderFallThroughCost, FallThruCost);
2459 }
2460 }
2461
2462 // Here we collect all exit blocks in the loop, and for each exit we find out
2463 // its hottest exit edge. For each loop rotation, we define the loop exit cost
2464 // as the sum of frequencies of exit edges we collect here, excluding the exit
2465 // edge from the tail of the loop chain.
2467 for (auto *BB : LoopChain) {
2468 auto LargestExitEdgeProb = BranchProbability::getZero();
2469 for (auto *Succ : BB->successors()) {
2470 BlockChain *SuccChain = BlockToChain[Succ];
2471 if (!LoopBlockSet.count(Succ) &&
2472 (!SuccChain || Succ == *SuccChain->begin())) {
2473 auto SuccProb = MBPI->getEdgeProbability(BB, Succ);
2474 LargestExitEdgeProb = std::max(LargestExitEdgeProb, SuccProb);
2475 }
2476 }
2477 if (LargestExitEdgeProb > BranchProbability::getZero()) {
2478 auto ExitFreq = MBFI->getBlockFreq(BB) * LargestExitEdgeProb;
2479 ExitsWithFreq.emplace_back(BB, ExitFreq);
2480 }
2481 }
2482
2483 // In this loop we iterate every block in the loop chain and calculate the
2484 // cost assuming the block is the head of the loop chain. When the loop ends,
2485 // we should have found the best candidate as the loop chain's head.
2486 for (auto Iter = LoopChain.begin(), TailIter = std::prev(LoopChain.end()),
2487 EndIter = LoopChain.end();
2488 Iter != EndIter; Iter++, TailIter++) {
2489 // TailIter is used to track the tail of the loop chain if the block we are
2490 // checking (pointed by Iter) is the head of the chain.
2491 if (TailIter == LoopChain.end())
2492 TailIter = LoopChain.begin();
2493
2494 auto TailBB = *TailIter;
2495
2496 // Calculate the cost by putting this BB to the top.
2498
2499 // If the current BB is the loop header, we need to take into account the
2500 // cost of the missed fall through edge from outside of the loop to the
2501 // header.
2502 if (Iter != LoopChain.begin())
2503 Cost += HeaderFallThroughCost;
2504
2505 // Collect the loop exit cost by summing up frequencies of all exit edges
2506 // except the one from the chain tail.
2507 for (auto &ExitWithFreq : ExitsWithFreq)
2508 if (TailBB != ExitWithFreq.first)
2509 Cost += ExitWithFreq.second;
2510
2511 // The cost of breaking the once fall-through edge from the tail to the top
2512 // of the loop chain. Here we need to consider three cases:
2513 // 1. If the tail node has only one successor, then we will get an
2514 // additional jmp instruction. So the cost here is (MisfetchCost +
2515 // JumpInstCost) * tail node frequency.
2516 // 2. If the tail node has two successors, then we may still get an
2517 // additional jmp instruction if the layout successor after the loop
2518 // chain is not its CFG successor. Note that the more frequently executed
2519 // jmp instruction will be put ahead of the other one. Assume the
2520 // frequency of those two branches are x and y, where x is the frequency
2521 // of the edge to the chain head, then the cost will be
2522 // (x * MisfetechCost + min(x, y) * JumpInstCost) * tail node frequency.
2523 // 3. If the tail node has more than two successors (this rarely happens),
2524 // we won't consider any additional cost.
2525 if (TailBB->isSuccessor(*Iter)) {
2526 auto TailBBFreq = MBFI->getBlockFreq(TailBB);
2527 if (TailBB->succ_size() == 1)
2528 Cost += ScaleBlockFrequency(TailBBFreq, MisfetchCost + JumpInstCost);
2529 else if (TailBB->succ_size() == 2) {
2530 auto TailToHeadProb = MBPI->getEdgeProbability(TailBB, *Iter);
2531 auto TailToHeadFreq = TailBBFreq * TailToHeadProb;
2532 auto ColderEdgeFreq = TailToHeadProb > BranchProbability(1, 2)
2533 ? TailBBFreq * TailToHeadProb.getCompl()
2534 : TailToHeadFreq;
2535 Cost += ScaleBlockFrequency(TailToHeadFreq, MisfetchCost) +
2536 ScaleBlockFrequency(ColderEdgeFreq, JumpInstCost);
2537 }
2538 }
2539
2540 LLVM_DEBUG(dbgs() << "The cost of loop rotation by making "
2541 << getBlockName(*Iter) << " to the top: "
2542 << printBlockFreq(MBFI->getMBFI(), Cost) << "\n");
2543
2544 if (Cost < SmallestRotationCost) {
2545 SmallestRotationCost = Cost;
2546 RotationPos = Iter;
2547 }
2548 }
2549
2550 if (RotationPos != LoopChain.end()) {
2551 LLVM_DEBUG(dbgs() << "Rotate loop by making " << getBlockName(*RotationPos)
2552 << " to the top\n");
2553 std::rotate(LoopChain.begin(), RotationPos, LoopChain.end());
2554 }
2555}
2556
2557/// Collect blocks in the given loop that are to be placed.
2558///
2559/// When profile data is available, exclude cold blocks from the returned set;
2560/// otherwise, collect all blocks in the loop.
2562MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) {
2563 BlockFilterSet LoopBlockSet;
2564
2565 // Filter cold blocks off from LoopBlockSet when profile data is available.
2566 // Collect the sum of frequencies of incoming edges to the loop header from
2567 // outside. If we treat the loop as a super block, this is the frequency of
2568 // the loop. Then for each block in the loop, we calculate the ratio between
2569 // its frequency and the frequency of the loop block. When it is too small,
2570 // don't add it to the loop chain. If there are outer loops, then this block
2571 // will be merged into the first outer loop chain for which this block is not
2572 // cold anymore. This needs precise profile data and we only do this when
2573 // profile data is available.
2574 if (F->getFunction().hasProfileData() || ForceLoopColdBlock) {
2575 BlockFrequency LoopFreq(0);
2576 for (auto *LoopPred : L.getHeader()->predecessors())
2577 if (!L.contains(LoopPred))
2578 LoopFreq += MBFI->getBlockFreq(LoopPred) *
2579 MBPI->getEdgeProbability(LoopPred, L.getHeader());
2580
2581 for (MachineBasicBlock *LoopBB : L.getBlocks()) {
2582 if (LoopBlockSet.count(LoopBB))
2583 continue;
2584 auto Freq = MBFI->getBlockFreq(LoopBB).getFrequency();
2585 if (Freq == 0 || LoopFreq.getFrequency() / Freq > LoopToColdBlockRatio)
2586 continue;
2587 BlockChain *Chain = BlockToChain[LoopBB];
2588 for (MachineBasicBlock *ChainBB : *Chain)
2589 LoopBlockSet.insert(ChainBB);
2590 }
2591 } else
2592 LoopBlockSet.insert(L.block_begin(), L.block_end());
2593
2594 return LoopBlockSet;
2595}
2596
2597/// Forms basic block chains from the natural loop structures.
2598///
2599/// These chains are designed to preserve the existing *structure* of the code
2600/// as much as possible. We can then stitch the chains together in a way which
2601/// both preserves the topological structure and minimizes taken conditional
2602/// branches.
2603void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {
2604 // First recurse through any nested loops, building chains for those inner
2605 // loops.
2606 for (const MachineLoop *InnerLoop : L)
2607 buildLoopChains(*InnerLoop);
2608
2609 assert(BlockWorkList.empty() &&
2610 "BlockWorkList not empty when starting to build loop chains.");
2611 assert(EHPadWorkList.empty() &&
2612 "EHPadWorkList not empty when starting to build loop chains.");
2613 BlockFilterSet LoopBlockSet = collectLoopBlockSet(L);
2614
2615 // Check if we have profile data for this function. If yes, we will rotate
2616 // this loop by modeling costs more precisely which requires the profile data
2617 // for better layout.
2618 bool RotateLoopWithProfile =
2620 (PreciseRotationCost && F->getFunction().hasProfileData());
2621
2622 // First check to see if there is an obviously preferable top block for the
2623 // loop. This will default to the header, but may end up as one of the
2624 // predecessors to the header if there is one which will result in strictly
2625 // fewer branches in the loop body.
2626 MachineBasicBlock *LoopTop = findBestLoopTop(L, LoopBlockSet);
2627
2628 // If we selected just the header for the loop top, look for a potentially
2629 // profitable exit block in the event that rotating the loop can eliminate
2630 // branches by placing an exit edge at the bottom.
2631 //
2632 // Loops are processed innermost to uttermost, make sure we clear
2633 // PreferredLoopExit before processing a new loop.
2634 PreferredLoopExit = nullptr;
2635 BlockFrequency ExitFreq;
2636 if (!RotateLoopWithProfile && LoopTop == L.getHeader())
2637 PreferredLoopExit = findBestLoopExit(L, LoopBlockSet, ExitFreq);
2638
2639 BlockChain &LoopChain = *BlockToChain[LoopTop];
2640
2641 // FIXME: This is a really lame way of walking the chains in the loop: we
2642 // walk the blocks, and use a set to prevent visiting a particular chain
2643 // twice.
2644 SmallPtrSet<BlockChain *, 4> UpdatedPreds;
2645 assert(LoopChain.UnscheduledPredecessors == 0 &&
2646 "LoopChain should not have unscheduled predecessors.");
2647 UpdatedPreds.insert(&LoopChain);
2648
2649 for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2650 fillWorkLists(LoopBB, UpdatedPreds, &LoopBlockSet);
2651
2652 buildChain(LoopTop, LoopChain, &LoopBlockSet);
2653
2654 if (RotateLoopWithProfile)
2655 rotateLoopWithProfile(LoopChain, L, LoopBlockSet);
2656 else
2657 rotateLoop(LoopChain, PreferredLoopExit, ExitFreq, LoopBlockSet);
2658
2659 LLVM_DEBUG({
2660 // Crash at the end so we get all of the debugging output first.
2661 bool BadLoop = false;
2662 if (LoopChain.UnscheduledPredecessors) {
2663 BadLoop = true;
2664 dbgs() << "Loop chain contains a block without its preds placed!\n"
2665 << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2666 << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n";
2667 }
2668 for (MachineBasicBlock *ChainBB : LoopChain) {
2669 dbgs() << " ... " << getBlockName(ChainBB) << "\n";
2670 if (!LoopBlockSet.remove(ChainBB)) {
2671 // We don't mark the loop as bad here because there are real situations
2672 // where this can occur. For example, with an unanalyzable fallthrough
2673 // from a loop block to a non-loop block or vice versa.
2674 dbgs() << "Loop chain contains a block not contained by the loop!\n"
2675 << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2676 << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2677 << " Bad block: " << getBlockName(ChainBB) << "\n";
2678 }
2679 }
2680
2681 if (!LoopBlockSet.empty()) {
2682 BadLoop = true;
2683 for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2684 dbgs() << "Loop contains blocks never placed into a chain!\n"
2685 << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2686 << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2687 << " Bad block: " << getBlockName(LoopBB) << "\n";
2688 }
2689 assert(!BadLoop && "Detected problems with the placement of this loop.");
2690 });
2691
2692 BlockWorkList.clear();
2693 EHPadWorkList.clear();
2694}
2695
2696void MachineBlockPlacement::buildCFGChains() {
2697 // Ensure that every BB in the function has an associated chain to simplify
2698 // the assumptions of the remaining algorithm.
2699 SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
2700 for (MachineFunction::iterator FI = F->begin(), FE = F->end(); FI != FE;
2701 ++FI) {
2702 MachineBasicBlock *BB = &*FI;
2703 BlockChain *Chain =
2704 new (ChainAllocator.Allocate()) BlockChain(BlockToChain, BB);
2705 // Also, merge any blocks which we cannot reason about and must preserve
2706 // the exact fallthrough behavior for.
2707 while (true) {
2708 Cond.clear();
2709 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2710 if (!TII->analyzeBranch(*BB, TBB, FBB, Cond) || !FI->canFallThrough())
2711 break;
2712
2713 MachineFunction::iterator NextFI = std::next(FI);
2714 MachineBasicBlock *NextBB = &*NextFI;
2715 // Ensure that the layout successor is a viable block, as we know that
2716 // fallthrough is a possibility.
2717 assert(NextFI != FE && "Can't fallthrough past the last block.");
2718 LLVM_DEBUG(dbgs() << "Pre-merging due to unanalyzable fallthrough: "
2719 << getBlockName(BB) << " -> " << getBlockName(NextBB)
2720 << "\n");
2721 Chain->merge(NextBB, nullptr);
2722#ifndef NDEBUG
2723 BlocksWithUnanalyzableExits.insert(&*BB);
2724#endif
2725 FI = NextFI;
2726 BB = NextBB;
2727 }
2728 }
2729
2730 // Build any loop-based chains.
2731 PreferredLoopExit = nullptr;
2732 for (MachineLoop *L : *MLI)
2733 buildLoopChains(*L);
2734
2735 assert(BlockWorkList.empty() &&
2736 "BlockWorkList should be empty before building final chain.");
2737 assert(EHPadWorkList.empty() &&
2738 "EHPadWorkList should be empty before building final chain.");
2739
2740 SmallPtrSet<BlockChain *, 4> UpdatedPreds;
2741 for (MachineBasicBlock &MBB : *F)
2742 fillWorkLists(&MBB, UpdatedPreds);
2743
2744 BlockChain &FunctionChain = *BlockToChain[&F->front()];
2745 buildChain(&F->front(), FunctionChain);
2746
2747#ifndef NDEBUG
2748 using FunctionBlockSetType = SmallPtrSet<MachineBasicBlock *, 16>;
2749#endif
2750 LLVM_DEBUG({
2751 // Crash at the end so we get all of the debugging output first.
2752 bool BadFunc = false;
2753 FunctionBlockSetType FunctionBlockSet;
2754 for (MachineBasicBlock &MBB : *F)
2755 FunctionBlockSet.insert(&MBB);
2756
2757 for (MachineBasicBlock *ChainBB : FunctionChain)
2758 if (!FunctionBlockSet.erase(ChainBB)) {
2759 BadFunc = true;
2760 dbgs() << "Function chain contains a block not in the function!\n"
2761 << " Bad block: " << getBlockName(ChainBB) << "\n";
2762 }
2763
2764 if (!FunctionBlockSet.empty()) {
2765 BadFunc = true;
2766 for (MachineBasicBlock *RemainingBB : FunctionBlockSet)
2767 dbgs() << "Function contains blocks never placed into a chain!\n"
2768 << " Bad block: " << getBlockName(RemainingBB) << "\n";
2769 }
2770 assert(!BadFunc && "Detected problems with the block placement.");
2771 });
2772
2773 // Remember original layout ordering, so we can update terminators after
2774 // reordering to point to the original layout successor.
2775 SmallVector<MachineBasicBlock *, 4> OriginalLayoutSuccessors(
2776 F->getNumBlockIDs());
2777 {
2778 MachineBasicBlock *LastMBB = nullptr;
2779 for (auto &MBB : *F) {
2780 if (LastMBB != nullptr)
2781 OriginalLayoutSuccessors[LastMBB->getNumber()] = &MBB;
2782 LastMBB = &MBB;
2783 }
2784 OriginalLayoutSuccessors[F->back().getNumber()] = nullptr;
2785 }
2786
2787 // Splice the blocks into place.
2788 MachineFunction::iterator InsertPos = F->begin();
2789 LLVM_DEBUG(dbgs() << "[MBP] Function: " << F->getName() << "\n");
2790 for (MachineBasicBlock *ChainBB : FunctionChain) {
2791 LLVM_DEBUG(dbgs() << (ChainBB == *FunctionChain.begin() ? "Placing chain "
2792 : " ... ")
2793 << getBlockName(ChainBB) << "\n");
2794 if (InsertPos != MachineFunction::iterator(ChainBB))
2795 F->splice(InsertPos, ChainBB);
2796 else
2797 ++InsertPos;
2798
2799 // Update the terminator of the previous block.
2800 if (ChainBB == *FunctionChain.begin())
2801 continue;
2802 MachineBasicBlock *PrevBB = &*std::prev(MachineFunction::iterator(ChainBB));
2803
2804 // FIXME: It would be awesome of updateTerminator would just return rather
2805 // than assert when the branch cannot be analyzed in order to remove this
2806 // boiler plate.
2807 Cond.clear();
2808 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2809
2810#ifndef NDEBUG
2811 if (!BlocksWithUnanalyzableExits.count(PrevBB)) {
2812 // Given the exact block placement we chose, we may actually not _need_ to
2813 // be able to edit PrevBB's terminator sequence, but not being _able_ to
2814 // do that at this point is a bug.
2815 assert((!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond) ||
2816 !PrevBB->canFallThrough()) &&
2817 "Unexpected block with un-analyzable fallthrough!");
2818 Cond.clear();
2819 TBB = FBB = nullptr;
2820 }
2821#endif
2822
2823 // The "PrevBB" is not yet updated to reflect current code layout, so,
2824 // o. it may fall-through to a block without explicit "goto" instruction
2825 // before layout, and no longer fall-through it after layout; or
2826 // o. just opposite.
2827 //
2828 // analyzeBranch() may return erroneous value for FBB when these two
2829 // situations take place. For the first scenario FBB is mistakenly set NULL;
2830 // for the 2nd scenario, the FBB, which is expected to be NULL, is
2831 // mistakenly pointing to "*BI".
2832 // Thus, if the future change needs to use FBB before the layout is set, it
2833 // has to correct FBB first by using the code similar to the following:
2834 //
2835 // if (!Cond.empty() && (!FBB || FBB == ChainBB)) {
2836 // PrevBB->updateTerminator();
2837 // Cond.clear();
2838 // TBB = FBB = nullptr;
2839 // if (TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) {
2840 // // FIXME: This should never take place.
2841 // TBB = FBB = nullptr;
2842 // }
2843 // }
2844 if (!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) {
2845 PrevBB->updateTerminator(OriginalLayoutSuccessors[PrevBB->getNumber()]);
2846 }
2847 }
2848
2849 // Fixup the last block.
2850 Cond.clear();
2851 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2852 if (!TII->analyzeBranch(F->back(), TBB, FBB, Cond)) {
2853 MachineBasicBlock *PrevBB = &F->back();
2854 PrevBB->updateTerminator(OriginalLayoutSuccessors[PrevBB->getNumber()]);
2855 }
2856
2857 BlockWorkList.clear();
2858 EHPadWorkList.clear();
2859}
2860
2861void MachineBlockPlacement::optimizeBranches() {
2862 BlockChain &FunctionChain = *BlockToChain[&F->front()];
2863 SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
2864
2865 // Now that all the basic blocks in the chain have the proper layout,
2866 // make a final call to analyzeBranch with AllowModify set.
2867 // Indeed, the target may be able to optimize the branches in a way we
2868 // cannot because all branches may not be analyzable.
2869 // E.g., the target may be able to remove an unconditional branch to
2870 // a fallthrough when it occurs after predicated terminators.
2871 for (MachineBasicBlock *ChainBB : FunctionChain) {
2872 Cond.clear();
2873 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2874 if (!TII->analyzeBranch(*ChainBB, TBB, FBB, Cond, /*AllowModify*/ true)) {
2875 // If PrevBB has a two-way branch, try to re-order the branches
2876 // such that we branch to the successor with higher probability first.
2877 if (TBB && !Cond.empty() && FBB &&
2878 MBPI->getEdgeProbability(ChainBB, FBB) >
2879 MBPI->getEdgeProbability(ChainBB, TBB) &&
2881 LLVM_DEBUG(dbgs() << "Reverse order of the two branches: "
2882 << getBlockName(ChainBB) << "\n");
2883 LLVM_DEBUG(dbgs() << " Edge probability: "
2884 << MBPI->getEdgeProbability(ChainBB, FBB) << " vs "
2885 << MBPI->getEdgeProbability(ChainBB, TBB) << "\n");
2886 DebugLoc dl; // FIXME: this is nowhere
2887 TII->removeBranch(*ChainBB);
2888 TII->insertBranch(*ChainBB, FBB, TBB, Cond, dl);
2889 }
2890 }
2891 }
2892}
2893
2894void MachineBlockPlacement::alignBlocks() {
2895 // Walk through the backedges of the function now that we have fully laid out
2896 // the basic blocks and align the destination of each backedge. We don't rely
2897 // exclusively on the loop info here so that we can align backedges in
2898 // unnatural CFGs and backedges that were introduced purely because of the
2899 // loop rotations done during this layout pass.
2900 if (F->getFunction().hasMinSize() ||
2901 (F->getFunction().hasOptSize() && !TLI->alignLoopsWithOptSize()))
2902 return;
2903 BlockChain &FunctionChain = *BlockToChain[&F->front()];
2904 if (FunctionChain.begin() == FunctionChain.end())
2905 return; // Empty chain.
2906
2907 const BranchProbability ColdProb(1, 5); // 20%
2908 BlockFrequency EntryFreq = MBFI->getBlockFreq(&F->front());
2909 BlockFrequency WeightedEntryFreq = EntryFreq * ColdProb;
2910 for (MachineBasicBlock *ChainBB : FunctionChain) {
2911 if (ChainBB == *FunctionChain.begin())
2912 continue;
2913
2914 // Don't align non-looping basic blocks. These are unlikely to execute
2915 // enough times to matter in practice. Note that we'll still handle
2916 // unnatural CFGs inside of a natural outer loop (the common case) and
2917 // rotated loops.
2918 MachineLoop *L = MLI->getLoopFor(ChainBB);
2919 if (!L)
2920 continue;
2921
2922 const Align TLIAlign = TLI->getPrefLoopAlignment(L);
2923 unsigned MDAlign = 1;
2924 MDNode *LoopID = L->getLoopID();
2925 if (LoopID) {
2926 for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
2927 MDNode *MD = dyn_cast<MDNode>(MDO);
2928 if (MD == nullptr)
2929 continue;
2930 MDString *S = dyn_cast<MDString>(MD->getOperand(0));
2931 if (S == nullptr)
2932 continue;
2933 if (S->getString() == "llvm.loop.align") {
2934 assert(MD->getNumOperands() == 2 &&
2935 "per-loop align metadata should have two operands.");
2936 MDAlign =
2937 mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
2938 assert(MDAlign >= 1 && "per-loop align value must be positive.");
2939 }
2940 }
2941 }
2942
2943 // Use max of the TLIAlign and MDAlign
2944 const Align LoopAlign = std::max(TLIAlign, Align(MDAlign));
2945 if (LoopAlign == 1)
2946 continue; // Don't care about loop alignment.
2947
2948 // If the block is cold relative to the function entry don't waste space
2949 // aligning it.
2950 BlockFrequency Freq = MBFI->getBlockFreq(ChainBB);
2951 if (Freq < WeightedEntryFreq)
2952 continue;
2953
2954 // If the block is cold relative to its loop header, don't align it
2955 // regardless of what edges into the block exist.
2956 MachineBasicBlock *LoopHeader = L->getHeader();
2957 BlockFrequency LoopHeaderFreq = MBFI->getBlockFreq(LoopHeader);
2958 if (Freq < (LoopHeaderFreq * ColdProb))
2959 continue;
2960
2961 // If the global profiles indicates so, don't align it.
2962 if (llvm::shouldOptimizeForSize(ChainBB, PSI, MBFI.get()) &&
2963 !TLI->alignLoopsWithOptSize())
2964 continue;
2965
2966 // Check for the existence of a non-layout predecessor which would benefit
2967 // from aligning this block.
2968 MachineBasicBlock *LayoutPred =
2969 &*std::prev(MachineFunction::iterator(ChainBB));
2970
2971 auto DetermineMaxAlignmentPadding = [&]() {
2972 // Set the maximum bytes allowed to be emitted for alignment.
2973 unsigned MaxBytes;
2974 if (MaxBytesForAlignmentOverride.getNumOccurrences() > 0)
2976 else
2977 MaxBytes = TLI->getMaxPermittedBytesForAlignment(ChainBB);
2978 ChainBB->setMaxBytesForAlignment(MaxBytes);
2979 };
2980
2981 // Force alignment if all the predecessors are jumps. We already checked
2982 // that the block isn't cold above.
2983 if (!LayoutPred->isSuccessor(ChainBB)) {
2984 ChainBB->setAlignment(LoopAlign);
2985 DetermineMaxAlignmentPadding();
2986 continue;
2987 }
2988
2989 // Align this block if the layout predecessor's edge into this block is
2990 // cold relative to the block. When this is true, other predecessors make up
2991 // all of the hot entries into the block and thus alignment is likely to be
2992 // important.
2993 BranchProbability LayoutProb =
2994 MBPI->getEdgeProbability(LayoutPred, ChainBB);
2995 BlockFrequency LayoutEdgeFreq = MBFI->getBlockFreq(LayoutPred) * LayoutProb;
2996 if (LayoutEdgeFreq <= (Freq * ColdProb)) {
2997 ChainBB->setAlignment(LoopAlign);
2998 DetermineMaxAlignmentPadding();
2999 }
3000 }
3001}
3002
3003/// Tail duplicate \p BB into (some) predecessors if profitable, repeating if
3004/// it was duplicated into its chain predecessor and removed.
3005/// \p BB - Basic block that may be duplicated.
3006///
3007/// \p LPred - Chosen layout predecessor of \p BB.
3008/// Updated to be the chain end if LPred is removed.
3009/// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
3010/// \p BlockFilter - Set of blocks that belong to the loop being laid out.
3011/// Used to identify which blocks to update predecessor
3012/// counts.
3013/// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
3014/// chosen in the given order due to unnatural CFG
3015/// only needed if \p BB is removed and
3016/// \p PrevUnplacedBlockIt pointed to \p BB.
3017/// @return true if \p BB was removed.
3018bool MachineBlockPlacement::repeatedlyTailDuplicateBlock(
3020 const MachineBasicBlock *LoopHeaderBB,
3021 BlockChain &Chain, BlockFilterSet *BlockFilter,
3022 MachineFunction::iterator &PrevUnplacedBlockIt) {
3023 bool Removed, DuplicatedToLPred;
3024 bool DuplicatedToOriginalLPred;
3025 Removed = maybeTailDuplicateBlock(BB, LPred, Chain, BlockFilter,
3026 PrevUnplacedBlockIt,
3027 DuplicatedToLPred);
3028 if (!Removed)
3029 return false;
3030 DuplicatedToOriginalLPred = DuplicatedToLPred;
3031 // Iteratively try to duplicate again. It can happen that a block that is
3032 // duplicated into is still small enough to be duplicated again.
3033 // No need to call markBlockSuccessors in this case, as the blocks being
3034 // duplicated from here on are already scheduled.
3035 while (DuplicatedToLPred && Removed) {
3036 MachineBasicBlock *DupBB, *DupPred;
3037 // The removal callback causes Chain.end() to be updated when a block is
3038 // removed. On the first pass through the loop, the chain end should be the
3039 // same as it was on function entry. On subsequent passes, because we are
3040 // duplicating the block at the end of the chain, if it is removed the
3041 // chain will have shrunk by one block.
3042 BlockChain::iterator ChainEnd = Chain.end();
3043 DupBB = *(--ChainEnd);
3044 // Now try to duplicate again.
3045 if (ChainEnd == Chain.begin())
3046 break;
3047 DupPred = *std::prev(ChainEnd);
3048 Removed = maybeTailDuplicateBlock(DupBB, DupPred, Chain, BlockFilter,
3049 PrevUnplacedBlockIt,
3050 DuplicatedToLPred);
3051 }
3052 // If BB was duplicated into LPred, it is now scheduled. But because it was
3053 // removed, markChainSuccessors won't be called for its chain. Instead we
3054 // call markBlockSuccessors for LPred to achieve the same effect. This must go
3055 // at the end because repeating the tail duplication can increase the number
3056 // of unscheduled predecessors.
3057 LPred = *std::prev(Chain.end());
3058 if (DuplicatedToOriginalLPred)
3059 markBlockSuccessors(Chain, LPred, LoopHeaderBB, BlockFilter);
3060 return true;
3061}
3062
3063/// Tail duplicate \p BB into (some) predecessors if profitable.
3064/// \p BB - Basic block that may be duplicated
3065/// \p LPred - Chosen layout predecessor of \p BB
3066/// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
3067/// \p BlockFilter - Set of blocks that belong to the loop being laid out.
3068/// Used to identify which blocks to update predecessor
3069/// counts.
3070/// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
3071/// chosen in the given order due to unnatural CFG
3072/// only needed if \p BB is removed and
3073/// \p PrevUnplacedBlockIt pointed to \p BB.
3074/// \p DuplicatedToLPred - True if the block was duplicated into LPred.
3075/// \return - True if the block was duplicated into all preds and removed.
3076bool MachineBlockPlacement::maybeTailDuplicateBlock(
3078 BlockChain &Chain, BlockFilterSet *BlockFilter,
3079 MachineFunction::iterator &PrevUnplacedBlockIt,
3080 bool &DuplicatedToLPred) {
3081 DuplicatedToLPred = false;
3082 if (!shouldTailDuplicate(BB))
3083 return false;
3084
3085 LLVM_DEBUG(dbgs() << "Redoing tail duplication for Succ#" << BB->getNumber()
3086 << "\n");
3087
3088 // This has to be a callback because none of it can be done after
3089 // BB is deleted.
3090 bool Removed = false;
3091 auto RemovalCallback =
3092 [&](MachineBasicBlock *RemBB) {
3093 // Signal to outer function
3094 Removed = true;
3095
3096 // Conservative default.
3097 bool InWorkList = true;
3098 // Remove from the Chain and Chain Map
3099 if (BlockToChain.count(RemBB)) {
3100 BlockChain *Chain = BlockToChain[RemBB];
3101 InWorkList = Chain->UnscheduledPredecessors == 0;
3102 Chain->remove(RemBB);
3103 BlockToChain.erase(RemBB);
3104 }
3105
3106 // Handle the unplaced block iterator
3107 if (&(*PrevUnplacedBlockIt) == RemBB) {
3108 PrevUnplacedBlockIt++;
3109 }
3110
3111 // Handle the Work Lists
3112 if (InWorkList) {
3113 SmallVectorImpl<MachineBasicBlock *> &RemoveList = BlockWorkList;
3114 if (RemBB->isEHPad())
3115 RemoveList = EHPadWorkList;
3116 llvm::erase(RemoveList, RemBB);
3117 }
3118
3119 // Handle the filter set
3120 if (BlockFilter) {
3121 BlockFilter->remove(RemBB);
3122 }
3123
3124 // Remove the block from loop info.
3125 MLI->removeBlock(RemBB);
3126 if (RemBB == PreferredLoopExit)
3127 PreferredLoopExit = nullptr;
3128
3129 LLVM_DEBUG(dbgs() << "TailDuplicator deleted block: "
3130 << getBlockName(RemBB) << "\n");
3131 };
3132 auto RemovalCallbackRef =
3134
3136 bool IsSimple = TailDup.isSimpleBB(BB);
3138 SmallVectorImpl<MachineBasicBlock *> *CandidatePtr = nullptr;
3139 if (F->getFunction().hasProfileData()) {
3140 // We can do partial duplication with precise profile information.
3141 findDuplicateCandidates(CandidatePreds, BB, BlockFilter);
3142 if (CandidatePreds.size() == 0)
3143 return false;
3144 if (CandidatePreds.size() < BB->pred_size())
3145 CandidatePtr = &CandidatePreds;
3146 }
3147 TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred, &DuplicatedPreds,
3148 &RemovalCallbackRef, CandidatePtr);
3149
3150 // Update UnscheduledPredecessors to reflect tail-duplication.
3151 DuplicatedToLPred = false;
3152 for (MachineBasicBlock *Pred : DuplicatedPreds) {
3153 // We're only looking for unscheduled predecessors that match the filter.
3154 BlockChain* PredChain = BlockToChain[Pred];
3155 if (Pred == LPred)
3156 DuplicatedToLPred = true;
3157 if (Pred == LPred || (BlockFilter && !BlockFilter->count(Pred))
3158 || PredChain == &Chain)
3159 continue;
3160 for (MachineBasicBlock *NewSucc : Pred->successors()) {
3161 if (BlockFilter && !BlockFilter->count(NewSucc))
3162 continue;
3163 BlockChain *NewChain = BlockToChain[NewSucc];
3164 if (NewChain != &Chain && NewChain != PredChain)
3165 NewChain->UnscheduledPredecessors++;
3166 }
3167 }
3168 return Removed;
3169}
3170
3171// Count the number of actual machine instructions.
3173 uint64_t InstrCount = 0;
3174 for (MachineInstr &MI : *MBB) {
3175 if (!MI.isPHI() && !MI.isMetaInstruction())
3176 InstrCount += 1;
3177 }
3178 return InstrCount;
3179}
3180
3181// The size cost of duplication is the instruction size of the duplicated block.
3182// So we should scale the threshold accordingly. But the instruction size is not
3183// available on all targets, so we use the number of instructions instead.
3184BlockFrequency MachineBlockPlacement::scaleThreshold(MachineBasicBlock *BB) {
3185 return BlockFrequency(DupThreshold.getFrequency() * countMBBInstruction(BB));
3186}
3187
3188// Returns true if BB is Pred's best successor.
3189bool MachineBlockPlacement::isBestSuccessor(MachineBasicBlock *BB,
3190 MachineBasicBlock *Pred,
3191 BlockFilterSet *BlockFilter) {
3192 if (BB == Pred)
3193 return false;
3194 if (BlockFilter && !BlockFilter->count(Pred))
3195 return false;
3196 BlockChain *PredChain = BlockToChain[Pred];
3197 if (PredChain && (Pred != *std::prev(PredChain->end())))
3198 return false;
3199
3200 // Find the successor with largest probability excluding BB.
3202 for (MachineBasicBlock *Succ : Pred->successors())
3203 if (Succ != BB) {
3204 if (BlockFilter && !BlockFilter->count(Succ))
3205 continue;
3206 BlockChain *SuccChain = BlockToChain[Succ];
3207 if (SuccChain && (Succ != *SuccChain->begin()))
3208 continue;
3209 BranchProbability SuccProb = MBPI->getEdgeProbability(Pred, Succ);
3210 if (SuccProb > BestProb)
3211 BestProb = SuccProb;
3212 }
3213
3214 BranchProbability BBProb = MBPI->getEdgeProbability(Pred, BB);
3215 if (BBProb <= BestProb)
3216 return false;
3217
3218 // Compute the number of reduced taken branches if Pred falls through to BB
3219 // instead of another successor. Then compare it with threshold.
3220 BlockFrequency PredFreq = getBlockCountOrFrequency(Pred);
3221 BlockFrequency Gain = PredFreq * (BBProb - BestProb);
3222 return Gain > scaleThreshold(BB);
3223}
3224
3225// Find out the predecessors of BB and BB can be beneficially duplicated into
3226// them.
3227void MachineBlockPlacement::findDuplicateCandidates(
3230 BlockFilterSet *BlockFilter) {
3231 MachineBasicBlock *Fallthrough = nullptr;
3232 BranchProbability DefaultBranchProb = BranchProbability::getZero();
3233 BlockFrequency BBDupThreshold(scaleThreshold(BB));
3236
3237 // Sort for highest frequency.
3238 auto CmpSucc = [&](MachineBasicBlock *A, MachineBasicBlock *B) {
3239 return MBPI->getEdgeProbability(BB, A) > MBPI->getEdgeProbability(BB, B);
3240 };
3241 auto CmpPred = [&](MachineBasicBlock *A, MachineBasicBlock *B) {
3242 return MBFI->getBlockFreq(A) > MBFI->getBlockFreq(B);
3243 };
3244 llvm::stable_sort(Succs, CmpSucc);
3245 llvm::stable_sort(Preds, CmpPred);
3246
3247 auto SuccIt = Succs.begin();
3248 if (SuccIt != Succs.end()) {
3249 DefaultBranchProb = MBPI->getEdgeProbability(BB, *SuccIt).getCompl();
3250 }
3251
3252 // For each predecessors of BB, compute the benefit of duplicating BB,
3253 // if it is larger than the threshold, add it into Candidates.
3254 //
3255 // If we have following control flow.
3256 //
3257 // PB1 PB2 PB3 PB4
3258 // \ | / /\
3259 // \ | / / \
3260 // \ |/ / \
3261 // BB----/ OB
3262 // /\
3263 // / \
3264 // SB1 SB2
3265 //
3266 // And it can be partially duplicated as
3267 //
3268 // PB2+BB
3269 // | PB1 PB3 PB4
3270 // | | / /\
3271 // | | / / \
3272 // | |/ / \
3273 // | BB----/ OB
3274 // |\ /|
3275 // | X |
3276 // |/ \|
3277 // SB2 SB1
3278 //
3279 // The benefit of duplicating into a predecessor is defined as
3280 // Orig_taken_branch - Duplicated_taken_branch
3281 //
3282 // The Orig_taken_branch is computed with the assumption that predecessor
3283 // jumps to BB and the most possible successor is laid out after BB.
3284 //
3285 // The Duplicated_taken_branch is computed with the assumption that BB is
3286 // duplicated into PB, and one successor is layout after it (SB1 for PB1 and
3287 // SB2 for PB2 in our case). If there is no available successor, the combined
3288 // block jumps to all BB's successor, like PB3 in this example.
3289 //
3290 // If a predecessor has multiple successors, so BB can't be duplicated into
3291 // it. But it can beneficially fall through to BB, and duplicate BB into other
3292 // predecessors.
3293 for (MachineBasicBlock *Pred : Preds) {
3294 BlockFrequency PredFreq = getBlockCountOrFrequency(Pred);
3295
3296 if (!TailDup.canTailDuplicate(BB, Pred)) {
3297 // BB can't be duplicated into Pred, but it is possible to be layout
3298 // below Pred.
3299 if (!Fallthrough && isBestSuccessor(BB, Pred, BlockFilter)) {
3300 Fallthrough = Pred;
3301 if (SuccIt != Succs.end())
3302 SuccIt++;
3303 }
3304 continue;
3305 }
3306
3307 BlockFrequency OrigCost = PredFreq + PredFreq * DefaultBranchProb;
3308 BlockFrequency DupCost;
3309 if (SuccIt == Succs.end()) {
3310 // Jump to all successors;
3311 if (Succs.size() > 0)
3312 DupCost += PredFreq;
3313 } else {
3314 // Fallthrough to *SuccIt, jump to all other successors;
3315 DupCost += PredFreq;
3316 DupCost -= PredFreq * MBPI->getEdgeProbability(BB, *SuccIt);
3317 }
3318
3319 assert(OrigCost >= DupCost);
3320 OrigCost -= DupCost;
3321 if (OrigCost > BBDupThreshold) {
3322 Candidates.push_back(Pred);
3323 if (SuccIt != Succs.end())
3324 SuccIt++;
3325 }
3326 }
3327
3328 // No predecessors can optimally fallthrough to BB.
3329 // So we can change one duplication into fallthrough.
3330 if (!Fallthrough) {
3331 if ((Candidates.size() < Preds.size()) && (Candidates.size() > 0)) {
3332 Candidates[0] = Candidates.back();
3333 Candidates.pop_back();
3334 }
3335 }
3336}
3337
3338void MachineBlockPlacement::initDupThreshold() {
3339 DupThreshold = BlockFrequency(0);
3340 if (!F->getFunction().hasProfileData())
3341 return;
3342
3343 // We prefer to use prifile count.
3344 uint64_t HotThreshold = PSI->getOrCompHotCountThreshold();
3345 if (HotThreshold != UINT64_MAX) {
3346 UseProfileCount = true;
3347 DupThreshold =
3348 BlockFrequency(HotThreshold * TailDupProfilePercentThreshold / 100);
3349 return;
3350 }
3351
3352 // Profile count is not available, we can use block frequency instead.
3353 BlockFrequency MaxFreq = BlockFrequency(0);
3354 for (MachineBasicBlock &MBB : *F) {
3355 BlockFrequency Freq = MBFI->getBlockFreq(&MBB);
3356 if (Freq > MaxFreq)
3357 MaxFreq = Freq;
3358 }
3359
3360 BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
3361 DupThreshold = BlockFrequency(MaxFreq * ThresholdProb);
3362 UseProfileCount = false;
3363}
3364
3365bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
3366 if (skipFunction(MF.getFunction()))
3367 return false;
3368
3369 // Check for single-block functions and skip them.
3370 if (std::next(MF.begin()) == MF.end())
3371 return false;
3372
3373 F = &MF;
3374 MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
3375 MBFI = std::make_unique<MBFIWrapper>(
3376 getAnalysis<MachineBlockFrequencyInfo>());
3377 MLI = &getAnalysis<MachineLoopInfo>();
3378 TII = MF.getSubtarget().getInstrInfo();
3379 TLI = MF.getSubtarget().getTargetLowering();
3380 MPDT = nullptr;
3381 PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
3382
3383 initDupThreshold();
3384
3385 // Initialize PreferredLoopExit to nullptr here since it may never be set if
3386 // there are no MachineLoops.
3387 PreferredLoopExit = nullptr;
3388
3389 assert(BlockToChain.empty() &&
3390 "BlockToChain map should be empty before starting placement.");
3391 assert(ComputedEdges.empty() &&
3392 "Computed Edge map should be empty before starting placement.");
3393
3394 unsigned TailDupSize = TailDupPlacementThreshold;
3395 // If only the aggressive threshold is explicitly set, use it.
3396 if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 &&
3397 TailDupPlacementThreshold.getNumOccurrences() == 0)
3399
3400 TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
3401 // For aggressive optimization, we can adjust some thresholds to be less
3402 // conservative.
3403 if (PassConfig->getOptLevel() >= CodeGenOptLevel::Aggressive) {
3404 // At O3 we should be more willing to copy blocks for tail duplication. This
3405 // increases size pressure, so we only do it at O3
3406 // Do this unless only the regular threshold is explicitly set.
3407 if (TailDupPlacementThreshold.getNumOccurrences() == 0 ||
3408 TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0)
3410 }
3411
3412 // If there's no threshold provided through options, query the target
3413 // information for a threshold instead.
3414 if (TailDupPlacementThreshold.getNumOccurrences() == 0 &&
3415 (PassConfig->getOptLevel() < CodeGenOptLevel::Aggressive ||
3416 TailDupPlacementAggressiveThreshold.getNumOccurrences() == 0))
3417 TailDupSize = TII->getTailDuplicateSize(PassConfig->getOptLevel());
3418
3419 if (allowTailDupPlacement()) {
3420 MPDT = &getAnalysis<MachinePostDominatorTree>();
3421 bool OptForSize = MF.getFunction().hasOptSize() ||
3422 llvm::shouldOptimizeForSize(&MF, PSI, &MBFI->getMBFI());
3423 if (OptForSize)
3424 TailDupSize = 1;
3425 bool PreRegAlloc = false;
3426 TailDup.initMF(MF, PreRegAlloc, MBPI, MBFI.get(), PSI,
3427 /* LayoutMode */ true, TailDupSize);
3428 precomputeTriangleChains();
3429 }
3430
3431 buildCFGChains();
3432
3433 // Changing the layout can create new tail merging opportunities.
3434 // TailMerge can create jump into if branches that make CFG irreducible for
3435 // HW that requires structured CFG.
3436 bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
3437 PassConfig->getEnableTailMerge() &&
3439 // No tail merging opportunities if the block number is less than four.
3440 if (MF.size() > 3 && EnableTailMerge) {
3441 unsigned TailMergeSize = TailDupSize + 1;
3442 BranchFolder BF(/*DefaultEnableTailMerge=*/true, /*CommonHoist=*/false,
3443 *MBFI, *MBPI, PSI, TailMergeSize);
3444
3445 if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(), MLI,
3446 /*AfterPlacement=*/true)) {
3447 // Redo the layout if tail merging creates/removes/moves blocks.
3448 BlockToChain.clear();
3449 ComputedEdges.clear();
3450 // Must redo the post-dominator tree if blocks were changed.
3451 if (MPDT)
3452 MPDT->runOnMachineFunction(MF);
3453 ChainAllocator.DestroyAll();
3454 buildCFGChains();
3455 }
3456 }
3457
3458 // Apply a post-processing optimizing block placement.
3459 if (MF.size() >= 3 && EnableExtTspBlockPlacement &&
3461 // Find a new placement and modify the layout of the blocks in the function.
3462 applyExtTsp();
3463
3464 // Re-create CFG chain so that we can optimizeBranches and alignBlocks.
3465 createCFGChainExtTsp();
3466 }
3467
3468 optimizeBranches();
3469 alignBlocks();
3470
3471 BlockToChain.clear();
3472 ComputedEdges.clear();
3473 ChainAllocator.DestroyAll();
3474
3475 bool HasMaxBytesOverride =
3476 MaxBytesForAlignmentOverride.getNumOccurrences() > 0;
3477
3478 if (AlignAllBlock)
3479 // Align all of the blocks in the function to a specific alignment.
3480 for (MachineBasicBlock &MBB : MF) {
3481 if (HasMaxBytesOverride)
3484 else
3486 }
3487 else if (AlignAllNonFallThruBlocks) {
3488 // Align all of the blocks that have no fall-through predecessors to a
3489 // specific alignment.
3490 for (auto MBI = std::next(MF.begin()), MBE = MF.end(); MBI != MBE; ++MBI) {
3491 auto LayoutPred = std::prev(MBI);
3492 if (!LayoutPred->isSuccessor(&*MBI)) {
3493 if (HasMaxBytesOverride)
3494 MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks),
3496 else
3497 MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks));
3498 }
3499 }
3500 }
3502 (ViewBlockFreqFuncName.empty() ||
3503 F->getFunction().getName().equals(ViewBlockFreqFuncName))) {
3505 MF.RenumberBlocks();
3506 MBFI->view("MBP." + MF.getName(), false);
3507 }
3508
3509 // We always return true as we have no way to track whether the final order
3510 // differs from the original order.
3511 return true;
3512}
3513
3514void MachineBlockPlacement::applyExtTsp() {
3515 // Prepare data; blocks are indexed by their index in the current ordering.
3517 BlockIndex.reserve(F->size());
3518 std::vector<const MachineBasicBlock *> CurrentBlockOrder;
3519 CurrentBlockOrder.reserve(F->size());
3520 size_t NumBlocks = 0;
3521 for (const MachineBasicBlock &MBB : *F) {
3522 BlockIndex[&MBB] = NumBlocks++;
3523 CurrentBlockOrder.push_back(&MBB);
3524 }
3525
3526 auto BlockSizes = std::vector<uint64_t>(F->size());
3527 auto BlockCounts = std::vector<uint64_t>(F->size());
3528 std::vector<codelayout::EdgeCount> JumpCounts;
3529 for (MachineBasicBlock &MBB : *F) {
3530 // Getting the block frequency.
3531 BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
3532 BlockCounts[BlockIndex[&MBB]] = BlockFreq.getFrequency();
3533 // Getting the block size:
3534 // - approximate the size of an instruction by 4 bytes, and
3535 // - ignore debug instructions.
3536 // Note: getting the exact size of each block is target-dependent and can be
3537 // done by extending the interface of MCCodeEmitter. Experimentally we do
3538 // not see a perf improvement with the exact block sizes.
3539 auto NonDbgInsts =
3541 int NumInsts = std::distance(NonDbgInsts.begin(), NonDbgInsts.end());
3542 BlockSizes[BlockIndex[&MBB]] = 4 * NumInsts;
3543 // Getting jump frequencies.
3544 for (MachineBasicBlock *Succ : MBB.successors()) {
3545 auto EP = MBPI->getEdgeProbability(&MBB, Succ);
3546 BlockFrequency JumpFreq = BlockFreq * EP;
3547 JumpCounts.push_back(
3548 {BlockIndex[&MBB], BlockIndex[Succ], JumpFreq.getFrequency()});
3549 }
3550 }
3551
3552 LLVM_DEBUG(dbgs() << "Applying ext-tsp layout for |V| = " << F->size()
3553 << " with profile = " << F->getFunction().hasProfileData()
3554 << " (" << F->getName().str() << ")"
3555 << "\n");
3556 LLVM_DEBUG(
3557 dbgs() << format(" original layout score: %0.2f\n",
3558 calcExtTspScore(BlockSizes, BlockCounts, JumpCounts)));
3559
3560 // Run the layout algorithm.
3561 auto NewOrder = computeExtTspLayout(BlockSizes, BlockCounts, JumpCounts);
3562 std::vector<const MachineBasicBlock *> NewBlockOrder;
3563 NewBlockOrder.reserve(F->size());
3564 for (uint64_t Node : NewOrder) {
3565 NewBlockOrder.push_back(CurrentBlockOrder[Node]);
3566 }
3567 LLVM_DEBUG(dbgs() << format(" optimized layout score: %0.2f\n",
3568 calcExtTspScore(NewOrder, BlockSizes, BlockCounts,
3569 JumpCounts)));
3570
3571 // Assign new block order.
3572 assignBlockOrder(NewBlockOrder);
3573}
3574
3575void MachineBlockPlacement::assignBlockOrder(
3576 const std::vector<const MachineBasicBlock *> &NewBlockOrder) {
3577 assert(F->size() == NewBlockOrder.size() && "Incorrect size of block order");
3578 F->RenumberBlocks();
3579
3580 bool HasChanges = false;
3581 for (size_t I = 0; I < NewBlockOrder.size(); I++) {
3582 if (NewBlockOrder[I] != F->getBlockNumbered(I)) {
3583 HasChanges = true;
3584 break;
3585 }
3586 }
3587 // Stop early if the new block order is identical to the existing one.
3588 if (!HasChanges)
3589 return;
3590
3591 SmallVector<MachineBasicBlock *, 4> PrevFallThroughs(F->getNumBlockIDs());
3592 for (auto &MBB : *F) {
3593 PrevFallThroughs[MBB.getNumber()] = MBB.getFallThrough();
3594 }
3595
3596 // Sort basic blocks in the function according to the computed order.
3598 for (const MachineBasicBlock *MBB : NewBlockOrder) {
3599 NewIndex[MBB] = NewIndex.size();
3600 }
3601 F->sort([&](MachineBasicBlock &L, MachineBasicBlock &R) {
3602 return NewIndex[&L] < NewIndex[&R];
3603 });
3604
3605 // Update basic block branches by inserting explicit fallthrough branches
3606 // when required and re-optimize branches when possible.
3607 const TargetInstrInfo *TII = F->getSubtarget().getInstrInfo();
3609 for (auto &MBB : *F) {
3610 MachineFunction::iterator NextMBB = std::next(MBB.getIterator());
3612 auto *FTMBB = PrevFallThroughs[MBB.getNumber()];
3613 // If this block had a fallthrough before we need an explicit unconditional
3614 // branch to that block if the fallthrough block is not adjacent to the
3615 // block in the new order.
3616 if (FTMBB && (NextMBB == EndIt || &*NextMBB != FTMBB)) {
3617 TII->insertUnconditionalBranch(MBB, FTMBB, MBB.findBranchDebugLoc());
3618 }
3619
3620 // It might be possible to optimize branches by flipping the condition.
3621 Cond.clear();
3622 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
3623 if (TII->analyzeBranch(MBB, TBB, FBB, Cond))
3624 continue;
3625 MBB.updateTerminator(FTMBB);
3626 }
3627
3628#ifndef NDEBUG
3629 // Make sure we correctly constructed all branches.
3630 F->verify(this, "After optimized block reordering");
3631#endif
3632}
3633
3634void MachineBlockPlacement::createCFGChainExtTsp() {
3635 BlockToChain.clear();
3636 ComputedEdges.clear();
3637 ChainAllocator.DestroyAll();
3638
3639 MachineBasicBlock *HeadBB = &F->front();
3640 BlockChain *FunctionChain =
3641 new (ChainAllocator.Allocate()) BlockChain(BlockToChain, HeadBB);
3642
3643 for (MachineBasicBlock &MBB : *F) {
3644 if (HeadBB == &MBB)
3645 continue; // Ignore head of the chain
3646 FunctionChain->merge(&MBB, nullptr);
3647 }
3648}
3649
3650namespace {
3651
3652/// A pass to compute block placement statistics.
3653///
3654/// A separate pass to compute interesting statistics for evaluating block
3655/// placement. This is separate from the actual placement pass so that they can
3656/// be computed in the absence of any placement transformations or when using
3657/// alternative placement strategies.
3658class MachineBlockPlacementStats : public MachineFunctionPass {
3659 /// A handle to the branch probability pass.
3660 const MachineBranchProbabilityInfo *MBPI;
3661
3662 /// A handle to the function-wide block frequency pass.
3663 const MachineBlockFrequencyInfo *MBFI;
3664
3665public:
3666 static char ID; // Pass identification, replacement for typeid
3667
3668 MachineBlockPlacementStats() : MachineFunctionPass(ID) {
3670 }
3671
3672 bool runOnMachineFunction(MachineFunction &F) override;
3673
3674 void getAnalysisUsage(AnalysisUsage &AU) const override {
3677 AU.setPreservesAll();
3679 }
3680};
3681
3682} // end anonymous namespace
3683
3684char MachineBlockPlacementStats::ID = 0;
3685
3686char &llvm::MachineBlockPlacementStatsID = MachineBlockPlacementStats::ID;
3687
3688INITIALIZE_PASS_BEGIN(MachineBlockPlacementStats, "block-placement-stats",
3689 "Basic Block Placement Stats", false, false)
3692INITIALIZE_PASS_END(MachineBlockPlacementStats, "block-placement-stats",
3694
3695bool MachineBlockPlacementStats::runOnMachineFunction(MachineFunction &F) {
3696 // Check for single-block functions and skip them.
3697 if (std::next(F.begin()) == F.end())
3698 return false;
3699
3700 if (!isFunctionInPrintList(F.getName()))
3701 return false;
3702
3703 MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
3704 MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
3705
3706 for (MachineBasicBlock &MBB : F) {
3707 BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
3708 Statistic &NumBranches =
3709 (MBB.succ_size() > 1) ? NumCondBranches : NumUncondBranches;
3710 Statistic &BranchTakenFreq =
3711 (MBB.succ_size() > 1) ? CondBranchTakenFreq : UncondBranchTakenFreq;
3712 for (MachineBasicBlock *Succ : MBB.successors()) {
3713 // Skip if this successor is a fallthrough.
3714 if (MBB.isLayoutSuccessor(Succ))
3715 continue;
3716
3717 BlockFrequency EdgeFreq =
3718 BlockFreq * MBPI->getEdgeProbability(&MBB, Succ);
3719 ++NumBranches;
3720 BranchTakenFreq += EdgeFreq.getFrequency();
3721 }
3722 }
3723
3724 return false;
3725}
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
MachineBasicBlock & MBB
This file defines the BumpPtrAllocator interface.
static cl::opt< unsigned > TailMergeSize("tail-merge-size", cl::desc("Min number of instructions to consider tail merging"), cl::init(3), cl::Hidden)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Declares methods and data structures for code layout algorithms.
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:529
static unsigned InstrCount
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
DenseMap< Block *, BlockRelaxAux > Blocks
Definition: ELF_riscv.cpp:507
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
block placement Basic Block Placement Stats
static BranchProbability getAdjustedProbability(BranchProbability OrigProb, BranchProbability AdjustedSumProb)
The helper function returns the branch probability that is adjusted or normalized over the new total ...
static cl::opt< bool > PreciseRotationCost("precise-rotation-cost", cl::desc("Model the cost of loop rotation more " "precisely by using profile data."), cl::init(false), cl::Hidden)
Branch Probability Basic Block Placement
static cl::opt< unsigned > AlignAllBlock("align-all-blocks", cl::desc("Force the alignment of all blocks in the function in log2 format " "(e.g 4 means align on 16B boundaries)."), cl::init(0), cl::Hidden)
static cl::opt< unsigned > TailDupPlacementPenalty("tail-dup-placement-penalty", cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. " "Copying can increase fallthrough, but it also increases icache " "pressure. This parameter controls the penalty to account for that. " "Percent as integer."), cl::init(2), cl::Hidden)
static BranchProbability getLayoutSuccessorProbThreshold(const MachineBasicBlock *BB)
static cl::opt< bool > ForceLoopColdBlock("force-loop-cold-block", cl::desc("Force outlining cold blocks from loops."), cl::init(false), cl::Hidden)
static cl::opt< unsigned > ExitBlockBias("block-placement-exit-block-bias", cl::desc("Block frequency percentage a loop exit block needs " "over the original exit to be considered the new exit."), cl::init(0), cl::Hidden)
static cl::opt< unsigned > AlignAllNonFallThruBlocks("align-all-nofallthru-blocks", cl::desc("Force the alignment of all blocks that have no fall-through " "predecessors (i.e. don't add nops that are executed). In log2 " "format (e.g 4 means align on 16B boundaries)."), cl::init(0), cl::Hidden)
static cl::opt< unsigned > TailDupPlacementThreshold("tail-dup-placement-threshold", cl::desc("Instruction cutoff for tail duplication during layout. " "Tail merging during layout is forced to have a threshold " "that won't conflict."), cl::init(2), cl::Hidden)
static cl::opt< unsigned > JumpInstCost("jump-inst-cost", cl::desc("Cost of jump instructions."), cl::init(1), cl::Hidden)
static bool greaterWithBias(BlockFrequency A, BlockFrequency B, BlockFrequency EntryFreq)
Compare 2 BlockFrequency's with a small penalty for A.
static cl::opt< unsigned > MisfetchCost("misfetch-cost", cl::desc("Cost that models the probabilistic risk of an instruction " "misfetch due to a jump comparing to falling through, whose cost " "is zero."), cl::init(1), cl::Hidden)
static cl::opt< unsigned > MaxBytesForAlignmentOverride("max-bytes-for-alignment", cl::desc("Forces the maximum bytes allowed to be emitted when padding for " "alignment"), cl::init(0), cl::Hidden)
static cl::opt< bool > BranchFoldPlacement("branch-fold-placement", cl::desc("Perform branch folding during placement. " "Reduces code size."), cl::init(true), cl::Hidden)
static cl::opt< unsigned > TailDupProfilePercentThreshold("tail-dup-profile-percent-threshold", cl::desc("If profile count information is used in tail duplication cost " "model, the gained fall through number from tail duplication " "should be at least this percent of hot count."), cl::init(50), cl::Hidden)
static cl::opt< unsigned > TriangleChainCount("triangle-chain-count", cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the " "triangle tail duplication heuristic to kick in. 0 to disable."), cl::init(2), cl::Hidden)
Branch Probability Basic Block static false std::string getBlockName(const MachineBasicBlock *BB)
Helper to print the name of a MBB.
static bool hasSameSuccessors(MachineBasicBlock &BB, SmallPtrSetImpl< const MachineBasicBlock * > &Successors)
Check if BB has exactly the successors in Successors.
static cl::opt< bool > TailDupPlacement("tail-dup-placement", cl::desc("Perform tail duplication during placement. " "Creates more fallthrough opportunites in " "outline branches."), cl::init(true), cl::Hidden)
static uint64_t countMBBInstruction(MachineBasicBlock *MBB)
static cl::opt< unsigned > LoopToColdBlockRatio("loop-to-cold-block-ratio", cl::desc("Outline loop blocks from loop chain if (frequency of loop) / " "(frequency of block) is greater than this ratio"), cl::init(5), cl::Hidden)
static cl::opt< bool > RenumberBlocksBeforeView("renumber-blocks-before-view", cl::desc("If true, basic blocks are re-numbered before MBP layout is printed " "into a dot graph. Only used when a function is being printed."), cl::init(false), cl::Hidden)
#define DEBUG_TYPE
static cl::opt< unsigned > TailDupPlacementAggressiveThreshold("tail-dup-placement-aggressive-threshold", cl::desc("Instruction cutoff for aggressive tail duplication during " "layout. Used at -O3. Tail merging during layout is forced to " "have a threshold that won't conflict."), cl::init(4), cl::Hidden)
static cl::opt< bool > ForcePreciseRotationCost("force-precise-rotation-cost", cl::desc("Force the use of precise cost " "loop rotation strategy."), cl::init(false), cl::Hidden)
#define P(N)
if(VerifyEach)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:59
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
This file describes how to lower LLVM code to machine code.
Target-Independent Code Generator Pass Configuration Options pass.
unify loop Fixup each natural loop to have a single exit block
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
void setPreservesAll()
Set by analyses that do not transform their input at all.
static BlockFrequency max()
Returns the maximum possible frequency, the saturation value.
uint64_t getFrequency() const
Returns the frequency as a fixpoint number scaled by the entry frequency.
static BranchProbability getOne()
uint32_t getNumerator() const
BranchProbability getCompl() const
static BranchProbability getZero()
A debug info location.
Definition: DebugLoc.h:33
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&... Args)
Definition: DenseMap.h:235
bool erase(const KeyT &Val)
Definition: DenseMap.h:329
unsigned size() const
Definition: DenseMap.h:99
bool empty() const
Definition: DenseMap.h:98
iterator begin()
Definition: DenseMap.h:75
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:151
iterator end()
Definition: DenseMap.h:84
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:145
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
void reserve(size_type NumEntries)
Grow the densemap so that it can contain at least NumEntries items before resizing again.
Definition: DenseMap.h:103
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:680
bool hasProfileData(bool IncludeSynthetic=false) const
Return true if the function is annotated with profile data.
Definition: Function.h:314
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
Remove the branching code at the end of the specific MBB.
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const override
Analyze the branching code at the end of MBB, returning true if it cannot be understood (e....
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
Reverses the branch condition of the specified condition list, returning false on success and true if...
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
Insert branch code into the end of the specified MachineBasicBlock.
Metadata node.
Definition: Metadata.h:1067
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1428
ArrayRef< MDOperand > operands() const
Definition: Metadata.h:1426
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1434
Tracking metadata reference owned by Metadata.
Definition: Metadata.h:889
A single uniqued string.
Definition: Metadata.h:720
StringRef getString() const
Definition: Metadata.cpp:607
unsigned pred_size() const
bool isEHPad() const
Returns true if the block is a landing pad.
instr_iterator instr_begin()
MachineBasicBlock * getFallThrough(bool JumpToFallThrough=true)
Return the fallthrough block if the block can implicitly transfer control to the block after it by fa...
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFuncti...
void updateTerminator(MachineBasicBlock *PreviousLayoutSuccessor)
Update the terminator instructions in block to account for changes to block layout which may have bee...
bool canFallThrough()
Return true if the block can implicitly transfer control to the block after it by falling off the end...
unsigned succ_size() const
void setAlignment(Align A)
Set alignment of the basic block.
bool isEntryBlock() const
Returns true if this is the entry block of the function.
succ_reverse_iterator succ_rbegin()
bool isLayoutSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB will be emitted immediately after this block, such that if this bloc...
instr_iterator instr_end()
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
DebugLoc findBranchDebugLoc()
Find and return the merged DebugLoc of the branch instructions of the block.
iterator_range< succ_iterator > successors()
bool isSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB is a successor of this block.
iterator_range< pred_iterator > predecessors()
StringRef getName() const
Return the name of the corresponding LLVM basic block, or an empty string.
MachineBlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate machine basic b...
BlockFrequency getBlockFreq(const MachineBasicBlock *MBB) const
getblockFreq - Return block frequency.
BranchProbability getEdgeProbability(const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
unsigned size() const
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Representation of each machine instruction.
Definition: MachineInstr.h:69
MachineLoop * getLoopFor(const MachineBasicBlock *BB) const
Return the innermost loop that BB lives in.
MachinePostDominatorTree - an analysis pass wrapper for DominatorTree used to compute the post-domina...
bool dominates(const MachineDomTreeNode *A, const MachineDomTreeNode *B) const
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:307
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
An analysis pass based on legacy pass manager to deliver ProfileSummaryInfo.
Analysis providing profile information.
uint64_t getOrCompHotCountThreshold() const
Returns HotCountThreshold if set.
size_type size() const
Definition: SmallPtrSet.h:94
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:321
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:360
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:427
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
typename SuperClass::const_iterator const_iterator
Definition: SmallVector.h:591
typename SuperClass::iterator iterator
Definition: SmallVector.h:590
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
A BumpPtrAllocator that allows only elements of a specific type to be allocated.
Definition: Allocator.h:382
T * Allocate(size_t num=1)
Allocate space for an array of objects without constructing them.
Definition: Allocator.h:432
void DestroyAll()
Call the destructor of each allocated object and deallocate all but the current slab and reset the cu...
Definition: Allocator.h:403
Utility class to perform tail duplication.
void initMF(MachineFunction &MF, bool PreRegAlloc, const MachineBranchProbabilityInfo *MBPI, MBFIWrapper *MBFI, ProfileSummaryInfo *PSI, bool LayoutMode, unsigned TailDupSize=0)
Prepare to run on a specific machine function.
bool tailDuplicateAndUpdate(bool IsSimple, MachineBasicBlock *MBB, MachineBasicBlock *ForcedLayoutPred, SmallVectorImpl< MachineBasicBlock * > *DuplicatedPreds=nullptr, function_ref< void(MachineBasicBlock *)> *RemovalCallback=nullptr, SmallVectorImpl< MachineBasicBlock * > *CandidatePtr=nullptr)
Tail duplicate a single basic block into its predecessors, and then clean up.
static bool isSimpleBB(MachineBasicBlock *TailBB)
True if this BB has only one unconditional jump.
bool canTailDuplicate(MachineBasicBlock *TailBB, MachineBasicBlock *PredBB)
Returns true if TailBB can successfully be duplicated into PredBB.
bool shouldTailDuplicate(bool IsSimple, MachineBasicBlock &TailBB)
Determine if it is profitable to duplicate this block.
TargetInstrInfo - Interface to description of machine instruction set.
This base class for TargetLowering contains the SelectionDAG-independent parts that can be used from ...
virtual unsigned getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const
Return the maximum amount of bytes allowed to be emitted when padding for alignment.
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
virtual bool alignLoopsWithOptSize() const
Should loops be aligned even when the function is marked OptSize (but not MinSize).
bool requiresStructuredCFG() const
Target-Independent Code Generator Pass Configuration Options.
bool getEnableTailMerge() const
CodeGenOptLevel getOptLevel() const
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
virtual const TargetInstrInfo * getInstrInfo() const
virtual const TargetLowering * getTargetLowering() const
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
Definition: ilist_node.h:109
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:660
#define UINT64_MAX
Definition: DataTypes.h:77
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
double calcExtTspScore(ArrayRef< uint64_t > Order, ArrayRef< uint64_t > NodeSizes, ArrayRef< uint64_t > NodeCounts, ArrayRef< EdgeCount > EdgeCounts)
Estimate the "quality" of a given node order in CFG.
std::vector< uint64_t > computeExtTspLayout(ArrayRef< uint64_t > NodeSizes, ArrayRef< uint64_t > NodeCounts, ArrayRef< EdgeCount > EdgeCounts)
Find a layout of nodes (basic blocks) of a given CFG optimizing jump locality and thus processor I-ca...
void append(SmallVectorImpl< char > &path, const Twine &a, const Twine &b="", const Twine &c="", const Twine &d="")
Append to path.
Definition: Path.cpp:457
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
void stable_sort(R &&Range)
Definition: STLExtras.h:2004
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1751
cl::opt< bool > ApplyExtTspWithoutProfile
bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
void initializeMachineBlockPlacementStatsPass(PassRegistry &)
cl::opt< unsigned > ProfileLikelyProb
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2068
cl::opt< std::string > ViewBlockFreqFuncName("view-bfi-func-name", cl::Hidden, cl::desc("The option to specify " "the name of the function " "whose CFG will be displayed."))
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:428
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
cl::opt< GVDAGType > ViewBlockLayoutWithBFI("view-block-layout-with-bfi", cl::Hidden, cl::desc("Pop up a window to show a dag displaying MBP layout and associated " "block frequencies of the CFG."), cl::values(clEnumValN(GVDT_None, "none", "do not display graphs."), clEnumValN(GVDT_Fraction, "fraction", "display a graph using the " "fractional block frequency representation."), clEnumValN(GVDT_Integer, "integer", "display a graph using the raw " "integer fractional block frequency representation."), clEnumValN(GVDT_Count, "count", "display a graph using the real " "profile count if available.")))
bool isFunctionInPrintList(StringRef FunctionName)
auto instructionsWithoutDebug(IterT It, IterT End, bool SkipPseudoOp=true)
Construct a range iterator which begins at It and moves forwards until End is reached,...
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
cl::opt< unsigned > StaticLikelyProb
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1923
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition: STLExtras.h:2060
Printable printBlockFreq(const BlockFrequencyInfo &BFI, BlockFrequency Freq)
Print the block frequency Freq relative to the current functions entry frequency.
char & MachineBlockPlacementID
MachineBlockPlacement - This pass places basic blocks based on branch probabilities.
cl::opt< bool > EnableExtTspBlockPlacement
char & MachineBlockPlacementStatsID
MachineBlockPlacementStats - This pass collects statistics about the basic block placement using bran...
void initializeMachineBlockPlacementPass(PassRegistry &)
Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39