LLVM 18.0.0git
MachineBlockPlacement.cpp
Go to the documentation of this file.
1//===- MachineBlockPlacement.cpp - Basic Block Code Layout optimization ---===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements basic block placement transformations using the CFG
10// structure and branch probability estimates.
11//
12// The pass strives to preserve the structure of the CFG (that is, retain
13// a topological ordering of basic blocks) in the absence of a *strong* signal
14// to the contrary from probabilities. However, within the CFG structure, it
15// attempts to choose an ordering which favors placing more likely sequences of
16// blocks adjacent to each other.
17//
18// The algorithm works from the inner-most loop within a function outward, and
19// at each stage walks through the basic blocks, trying to coalesce them into
20// sequential chains where allowed by the CFG (or demanded by heavy
21// probabilities). Finally, it walks the blocks in topological order, and the
22// first time it reaches a chain of basic blocks, it schedules them in the
23// function in-order.
24//
25//===----------------------------------------------------------------------===//
26
27#include "BranchFolding.h"
28#include "llvm/ADT/ArrayRef.h"
29#include "llvm/ADT/DenseMap.h"
30#include "llvm/ADT/STLExtras.h"
31#include "llvm/ADT/SetVector.h"
34#include "llvm/ADT/Statistic.h"
51#include "llvm/IR/DebugLoc.h"
52#include "llvm/IR/Function.h"
53#include "llvm/IR/PrintPasses.h"
55#include "llvm/Pass.h"
62#include "llvm/Support/Debug.h"
66#include <algorithm>
67#include <cassert>
68#include <cstdint>
69#include <iterator>
70#include <memory>
71#include <string>
72#include <tuple>
73#include <utility>
74#include <vector>
75
76using namespace llvm;
77
78#define DEBUG_TYPE "block-placement"
79
80STATISTIC(NumCondBranches, "Number of conditional branches");
81STATISTIC(NumUncondBranches, "Number of unconditional branches");
82STATISTIC(CondBranchTakenFreq,
83 "Potential frequency of taking conditional branches");
84STATISTIC(UncondBranchTakenFreq,
85 "Potential frequency of taking unconditional branches");
86
88 "align-all-blocks",
89 cl::desc("Force the alignment of all blocks in the function in log2 format "
90 "(e.g 4 means align on 16B boundaries)."),
92
94 "align-all-nofallthru-blocks",
95 cl::desc("Force the alignment of all blocks that have no fall-through "
96 "predecessors (i.e. don't add nops that are executed). In log2 "
97 "format (e.g 4 means align on 16B boundaries)."),
99
101 "max-bytes-for-alignment",
102 cl::desc("Forces the maximum bytes allowed to be emitted when padding for "
103 "alignment"),
104 cl::init(0), cl::Hidden);
105
106// FIXME: Find a good default for this flag and remove the flag.
108 "block-placement-exit-block-bias",
109 cl::desc("Block frequency percentage a loop exit block needs "
110 "over the original exit to be considered the new exit."),
111 cl::init(0), cl::Hidden);
112
113// Definition:
114// - Outlining: placement of a basic block outside the chain or hot path.
115
117 "loop-to-cold-block-ratio",
118 cl::desc("Outline loop blocks from loop chain if (frequency of loop) / "
119 "(frequency of block) is greater than this ratio"),
120 cl::init(5), cl::Hidden);
121
123 "force-loop-cold-block",
124 cl::desc("Force outlining cold blocks from loops."),
125 cl::init(false), cl::Hidden);
126
127static cl::opt<bool>
128 PreciseRotationCost("precise-rotation-cost",
129 cl::desc("Model the cost of loop rotation more "
130 "precisely by using profile data."),
131 cl::init(false), cl::Hidden);
132
133static cl::opt<bool>
134 ForcePreciseRotationCost("force-precise-rotation-cost",
135 cl::desc("Force the use of precise cost "
136 "loop rotation strategy."),
137 cl::init(false), cl::Hidden);
138
140 "misfetch-cost",
141 cl::desc("Cost that models the probabilistic risk of an instruction "
142 "misfetch due to a jump comparing to falling through, whose cost "
143 "is zero."),
144 cl::init(1), cl::Hidden);
145
146static cl::opt<unsigned> JumpInstCost("jump-inst-cost",
147 cl::desc("Cost of jump instructions."),
148 cl::init(1), cl::Hidden);
149static cl::opt<bool>
150TailDupPlacement("tail-dup-placement",
151 cl::desc("Perform tail duplication during placement. "
152 "Creates more fallthrough opportunites in "
153 "outline branches."),
154 cl::init(true), cl::Hidden);
155
156static cl::opt<bool>
157BranchFoldPlacement("branch-fold-placement",
158 cl::desc("Perform branch folding during placement. "
159 "Reduces code size."),
160 cl::init(true), cl::Hidden);
161
162// Heuristic for tail duplication.
164 "tail-dup-placement-threshold",
165 cl::desc("Instruction cutoff for tail duplication during layout. "
166 "Tail merging during layout is forced to have a threshold "
167 "that won't conflict."), cl::init(2),
168 cl::Hidden);
169
170// Heuristic for aggressive tail duplication.
172 "tail-dup-placement-aggressive-threshold",
173 cl::desc("Instruction cutoff for aggressive tail duplication during "
174 "layout. Used at -O3. Tail merging during layout is forced to "
175 "have a threshold that won't conflict."), cl::init(4),
176 cl::Hidden);
177
178// Heuristic for tail duplication.
180 "tail-dup-placement-penalty",
181 cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. "
182 "Copying can increase fallthrough, but it also increases icache "
183 "pressure. This parameter controls the penalty to account for that. "
184 "Percent as integer."),
185 cl::init(2),
186 cl::Hidden);
187
188// Heuristic for tail duplication if profile count is used in cost model.
190 "tail-dup-profile-percent-threshold",
191 cl::desc("If profile count information is used in tail duplication cost "
192 "model, the gained fall through number from tail duplication "
193 "should be at least this percent of hot count."),
194 cl::init(50), cl::Hidden);
195
196// Heuristic for triangle chains.
198 "triangle-chain-count",
199 cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the "
200 "triangle tail duplication heuristic to kick in. 0 to disable."),
201 cl::init(2),
202 cl::Hidden);
203
204// Use case: When block layout is visualized after MBP pass, the basic blocks
205// are labeled in layout order; meanwhile blocks could be numbered in a
206// different order. It's hard to map between the graph and pass output.
207// With this option on, the basic blocks are renumbered in function layout
208// order. For debugging only.
210 "renumber-blocks-before-view",
211 cl::desc(
212 "If true, basic blocks are re-numbered before MBP layout is printed "
213 "into a dot graph. Only used when a function is being printed."),
214 cl::init(false), cl::Hidden);
215
216namespace llvm {
221
222// Internal option used to control BFI display only after MBP pass.
223// Defined in CodeGen/MachineBlockFrequencyInfo.cpp:
224// -view-block-layout-with-bfi=
226
227// Command line option to specify the name of the function for CFG dump
228// Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name=
230} // namespace llvm
231
232namespace {
233
234class BlockChain;
235
236/// Type for our function-wide basic block -> block chain mapping.
237using BlockToChainMapType = DenseMap<const MachineBasicBlock *, BlockChain *>;
238
239/// A chain of blocks which will be laid out contiguously.
240///
241/// This is the datastructure representing a chain of consecutive blocks that
242/// are profitable to layout together in order to maximize fallthrough
243/// probabilities and code locality. We also can use a block chain to represent
244/// a sequence of basic blocks which have some external (correctness)
245/// requirement for sequential layout.
246///
247/// Chains can be built around a single basic block and can be merged to grow
248/// them. They participate in a block-to-chain mapping, which is updated
249/// automatically as chains are merged together.
250class BlockChain {
251 /// The sequence of blocks belonging to this chain.
252 ///
253 /// This is the sequence of blocks for a particular chain. These will be laid
254 /// out in-order within the function.
256
257 /// A handle to the function-wide basic block to block chain mapping.
258 ///
259 /// This is retained in each block chain to simplify the computation of child
260 /// block chains for SCC-formation and iteration. We store the edges to child
261 /// basic blocks, and map them back to their associated chains using this
262 /// structure.
263 BlockToChainMapType &BlockToChain;
264
265public:
266 /// Construct a new BlockChain.
267 ///
268 /// This builds a new block chain representing a single basic block in the
269 /// function. It also registers itself as the chain that block participates
270 /// in with the BlockToChain mapping.
271 BlockChain(BlockToChainMapType &BlockToChain, MachineBasicBlock *BB)
272 : Blocks(1, BB), BlockToChain(BlockToChain) {
273 assert(BB && "Cannot create a chain with a null basic block");
274 BlockToChain[BB] = this;
275 }
276
277 /// Iterator over blocks within the chain.
280
281 /// Beginning of blocks within the chain.
282 iterator begin() { return Blocks.begin(); }
283 const_iterator begin() const { return Blocks.begin(); }
284
285 /// End of blocks within the chain.
286 iterator end() { return Blocks.end(); }
287 const_iterator end() const { return Blocks.end(); }
288
289 bool remove(MachineBasicBlock* BB) {
290 for(iterator i = begin(); i != end(); ++i) {
291 if (*i == BB) {
292 Blocks.erase(i);
293 return true;
294 }
295 }
296 return false;
297 }
298
299 /// Merge a block chain into this one.
300 ///
301 /// This routine merges a block chain into this one. It takes care of forming
302 /// a contiguous sequence of basic blocks, updating the edge list, and
303 /// updating the block -> chain mapping. It does not free or tear down the
304 /// old chain, but the old chain's block list is no longer valid.
305 void merge(MachineBasicBlock *BB, BlockChain *Chain) {
306 assert(BB && "Can't merge a null block.");
307 assert(!Blocks.empty() && "Can't merge into an empty chain.");
308
309 // Fast path in case we don't have a chain already.
310 if (!Chain) {
311 assert(!BlockToChain[BB] &&
312 "Passed chain is null, but BB has entry in BlockToChain.");
313 Blocks.push_back(BB);
314 BlockToChain[BB] = this;
315 return;
316 }
317
318 assert(BB == *Chain->begin() && "Passed BB is not head of Chain.");
319 assert(Chain->begin() != Chain->end());
320
321 // Update the incoming blocks to point to this chain, and add them to the
322 // chain structure.
323 for (MachineBasicBlock *ChainBB : *Chain) {
324 Blocks.push_back(ChainBB);
325 assert(BlockToChain[ChainBB] == Chain && "Incoming blocks not in chain.");
326 BlockToChain[ChainBB] = this;
327 }
328 }
329
330#ifndef NDEBUG
331 /// Dump the blocks in this chain.
332 LLVM_DUMP_METHOD void dump() {
333 for (MachineBasicBlock *MBB : *this)
334 MBB->dump();
335 }
336#endif // NDEBUG
337
338 /// Count of predecessors of any block within the chain which have not
339 /// yet been scheduled. In general, we will delay scheduling this chain
340 /// until those predecessors are scheduled (or we find a sufficiently good
341 /// reason to override this heuristic.) Note that when forming loop chains,
342 /// blocks outside the loop are ignored and treated as if they were already
343 /// scheduled.
344 ///
345 /// Note: This field is reinitialized multiple times - once for each loop,
346 /// and then once for the function as a whole.
347 unsigned UnscheduledPredecessors = 0;
348};
349
350class MachineBlockPlacement : public MachineFunctionPass {
351 /// A type for a block filter set.
353
354 /// Pair struct containing basic block and taildup profitability
355 struct BlockAndTailDupResult {
356 MachineBasicBlock *BB = nullptr;
357 bool ShouldTailDup;
358 };
359
360 /// Triple struct containing edge weight and the edge.
361 struct WeightedEdge {
362 BlockFrequency Weight;
363 MachineBasicBlock *Src = nullptr;
364 MachineBasicBlock *Dest = nullptr;
365 };
366
367 /// work lists of blocks that are ready to be laid out
370
371 /// Edges that have already been computed as optimal.
373
374 /// Machine Function
375 MachineFunction *F = nullptr;
376
377 /// A handle to the branch probability pass.
378 const MachineBranchProbabilityInfo *MBPI = nullptr;
379
380 /// A handle to the function-wide block frequency pass.
381 std::unique_ptr<MBFIWrapper> MBFI;
382
383 /// A handle to the loop info.
384 MachineLoopInfo *MLI = nullptr;
385
386 /// Preferred loop exit.
387 /// Member variable for convenience. It may be removed by duplication deep
388 /// in the call stack.
389 MachineBasicBlock *PreferredLoopExit = nullptr;
390
391 /// A handle to the target's instruction info.
392 const TargetInstrInfo *TII = nullptr;
393
394 /// A handle to the target's lowering info.
395 const TargetLoweringBase *TLI = nullptr;
396
397 /// A handle to the post dominator tree.
398 MachinePostDominatorTree *MPDT = nullptr;
399
400 ProfileSummaryInfo *PSI = nullptr;
401
402 /// Duplicator used to duplicate tails during placement.
403 ///
404 /// Placement decisions can open up new tail duplication opportunities, but
405 /// since tail duplication affects placement decisions of later blocks, it
406 /// must be done inline.
407 TailDuplicator TailDup;
408
409 /// Partial tail duplication threshold.
410 BlockFrequency DupThreshold;
411
412 /// True: use block profile count to compute tail duplication cost.
413 /// False: use block frequency to compute tail duplication cost.
414 bool UseProfileCount = false;
415
416 /// Allocator and owner of BlockChain structures.
417 ///
418 /// We build BlockChains lazily while processing the loop structure of
419 /// a function. To reduce malloc traffic, we allocate them using this
420 /// slab-like allocator, and destroy them after the pass completes. An
421 /// important guarantee is that this allocator produces stable pointers to
422 /// the chains.
424
425 /// Function wide BasicBlock to BlockChain mapping.
426 ///
427 /// This mapping allows efficiently moving from any given basic block to the
428 /// BlockChain it participates in, if any. We use it to, among other things,
429 /// allow implicitly defining edges between chains as the existing edges
430 /// between basic blocks.
432
433#ifndef NDEBUG
434 /// The set of basic blocks that have terminators that cannot be fully
435 /// analyzed. These basic blocks cannot be re-ordered safely by
436 /// MachineBlockPlacement, and we must preserve physical layout of these
437 /// blocks and their successors through the pass.
438 SmallPtrSet<MachineBasicBlock *, 4> BlocksWithUnanalyzableExits;
439#endif
440
441 /// Get block profile count or frequency according to UseProfileCount.
442 /// The return value is used to model tail duplication cost.
443 BlockFrequency getBlockCountOrFrequency(const MachineBasicBlock *BB) {
444 if (UseProfileCount) {
445 auto Count = MBFI->getBlockProfileCount(BB);
446 if (Count)
447 return *Count;
448 else
449 return 0;
450 } else
451 return MBFI->getBlockFreq(BB);
452 }
453
454 /// Scale the DupThreshold according to basic block size.
455 BlockFrequency scaleThreshold(MachineBasicBlock *BB);
456 void initDupThreshold();
457
458 /// Decrease the UnscheduledPredecessors count for all blocks in chain, and
459 /// if the count goes to 0, add them to the appropriate work list.
460 void markChainSuccessors(
461 const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
462 const BlockFilterSet *BlockFilter = nullptr);
463
464 /// Decrease the UnscheduledPredecessors count for a single block, and
465 /// if the count goes to 0, add them to the appropriate work list.
466 void markBlockSuccessors(
467 const BlockChain &Chain, const MachineBasicBlock *BB,
468 const MachineBasicBlock *LoopHeaderBB,
469 const BlockFilterSet *BlockFilter = nullptr);
470
472 collectViableSuccessors(
473 const MachineBasicBlock *BB, const BlockChain &Chain,
474 const BlockFilterSet *BlockFilter,
476 bool isBestSuccessor(MachineBasicBlock *BB, MachineBasicBlock *Pred,
477 BlockFilterSet *BlockFilter);
478 void findDuplicateCandidates(SmallVectorImpl<MachineBasicBlock *> &Candidates,
480 BlockFilterSet *BlockFilter);
481 bool repeatedlyTailDuplicateBlock(
483 const MachineBasicBlock *LoopHeaderBB,
484 BlockChain &Chain, BlockFilterSet *BlockFilter,
485 MachineFunction::iterator &PrevUnplacedBlockIt);
486 bool maybeTailDuplicateBlock(
488 BlockChain &Chain, BlockFilterSet *BlockFilter,
489 MachineFunction::iterator &PrevUnplacedBlockIt,
490 bool &DuplicatedToLPred);
491 bool hasBetterLayoutPredecessor(
492 const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
493 const BlockChain &SuccChain, BranchProbability SuccProb,
494 BranchProbability RealSuccProb, const BlockChain &Chain,
495 const BlockFilterSet *BlockFilter);
496 BlockAndTailDupResult selectBestSuccessor(
497 const MachineBasicBlock *BB, const BlockChain &Chain,
498 const BlockFilterSet *BlockFilter);
499 MachineBasicBlock *selectBestCandidateBlock(
500 const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList);
501 MachineBasicBlock *getFirstUnplacedBlock(
502 const BlockChain &PlacedChain,
503 MachineFunction::iterator &PrevUnplacedBlockIt,
504 const BlockFilterSet *BlockFilter);
505
506 /// Add a basic block to the work list if it is appropriate.
507 ///
508 /// If the optional parameter BlockFilter is provided, only MBB
509 /// present in the set will be added to the worklist. If nullptr
510 /// is provided, no filtering occurs.
511 void fillWorkLists(const MachineBasicBlock *MBB,
512 SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
513 const BlockFilterSet *BlockFilter);
514
515 void buildChain(const MachineBasicBlock *BB, BlockChain &Chain,
516 BlockFilterSet *BlockFilter = nullptr);
517 bool canMoveBottomBlockToTop(const MachineBasicBlock *BottomBlock,
518 const MachineBasicBlock *OldTop);
519 bool hasViableTopFallthrough(const MachineBasicBlock *Top,
520 const BlockFilterSet &LoopBlockSet);
521 BlockFrequency TopFallThroughFreq(const MachineBasicBlock *Top,
522 const BlockFilterSet &LoopBlockSet);
523 BlockFrequency FallThroughGains(const MachineBasicBlock *NewTop,
524 const MachineBasicBlock *OldTop,
525 const MachineBasicBlock *ExitBB,
526 const BlockFilterSet &LoopBlockSet);
527 MachineBasicBlock *findBestLoopTopHelper(MachineBasicBlock *OldTop,
528 const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
529 MachineBasicBlock *findBestLoopTop(
530 const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
531 MachineBasicBlock *findBestLoopExit(
532 const MachineLoop &L, const BlockFilterSet &LoopBlockSet,
533 BlockFrequency &ExitFreq);
534 BlockFilterSet collectLoopBlockSet(const MachineLoop &L);
535 void buildLoopChains(const MachineLoop &L);
536 void rotateLoop(
537 BlockChain &LoopChain, const MachineBasicBlock *ExitingBB,
538 BlockFrequency ExitFreq, const BlockFilterSet &LoopBlockSet);
539 void rotateLoopWithProfile(
540 BlockChain &LoopChain, const MachineLoop &L,
541 const BlockFilterSet &LoopBlockSet);
542 void buildCFGChains();
543 void optimizeBranches();
544 void alignBlocks();
545 /// Returns true if a block should be tail-duplicated to increase fallthrough
546 /// opportunities.
547 bool shouldTailDuplicate(MachineBasicBlock *BB);
548 /// Check the edge frequencies to see if tail duplication will increase
549 /// fallthroughs.
550 bool isProfitableToTailDup(
551 const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
552 BranchProbability QProb,
553 const BlockChain &Chain, const BlockFilterSet *BlockFilter);
554
555 /// Check for a trellis layout.
556 bool isTrellis(const MachineBasicBlock *BB,
557 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
558 const BlockChain &Chain, const BlockFilterSet *BlockFilter);
559
560 /// Get the best successor given a trellis layout.
561 BlockAndTailDupResult getBestTrellisSuccessor(
562 const MachineBasicBlock *BB,
563 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
564 BranchProbability AdjustedSumProb, const BlockChain &Chain,
565 const BlockFilterSet *BlockFilter);
566
567 /// Get the best pair of non-conflicting edges.
568 static std::pair<WeightedEdge, WeightedEdge> getBestNonConflictingEdges(
569 const MachineBasicBlock *BB,
571
572 /// Returns true if a block can tail duplicate into all unplaced
573 /// predecessors. Filters based on loop.
574 bool canTailDuplicateUnplacedPreds(
575 const MachineBasicBlock *BB, MachineBasicBlock *Succ,
576 const BlockChain &Chain, const BlockFilterSet *BlockFilter);
577
578 /// Find chains of triangles to tail-duplicate where a global analysis works,
579 /// but a local analysis would not find them.
580 void precomputeTriangleChains();
581
582 /// Apply a post-processing step optimizing block placement.
583 void applyExtTsp();
584
585 /// Modify the existing block placement in the function and adjust all jumps.
586 void assignBlockOrder(const std::vector<const MachineBasicBlock *> &NewOrder);
587
588 /// Create a single CFG chain from the current block order.
589 void createCFGChainExtTsp();
590
591public:
592 static char ID; // Pass identification, replacement for typeid
593
594 MachineBlockPlacement() : MachineFunctionPass(ID) {
596 }
597
598 bool runOnMachineFunction(MachineFunction &F) override;
599
600 bool allowTailDupPlacement() const {
601 assert(F);
602 return TailDupPlacement && !F->getTarget().requiresStructuredCFG();
603 }
604
605 void getAnalysisUsage(AnalysisUsage &AU) const override {
614 }
615};
616
617} // end anonymous namespace
618
619char MachineBlockPlacement::ID = 0;
620
621char &llvm::MachineBlockPlacementID = MachineBlockPlacement::ID;
622
623INITIALIZE_PASS_BEGIN(MachineBlockPlacement, DEBUG_TYPE,
624 "Branch Probability Basic Block Placement", false, false)
630INITIALIZE_PASS_END(MachineBlockPlacement, DEBUG_TYPE,
631 "Branch Probability Basic Block Placement", false, false)
632
633#ifndef NDEBUG
634/// Helper to print the name of a MBB.
635///
636/// Only used by debug logging.
637static std::string getBlockName(const MachineBasicBlock *BB) {
638 std::string Result;
639 raw_string_ostream OS(Result);
640 OS << printMBBReference(*BB);
641 OS << " ('" << BB->getName() << "')";
642 OS.flush();
643 return Result;
644}
645#endif
646
647/// Mark a chain's successors as having one fewer preds.
648///
649/// When a chain is being merged into the "placed" chain, this routine will
650/// quickly walk the successors of each block in the chain and mark them as
651/// having one fewer active predecessor. It also adds any successors of this
652/// chain which reach the zero-predecessor state to the appropriate worklist.
653void MachineBlockPlacement::markChainSuccessors(
654 const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
655 const BlockFilterSet *BlockFilter) {
656 // Walk all the blocks in this chain, marking their successors as having
657 // a predecessor placed.
658 for (MachineBasicBlock *MBB : Chain) {
659 markBlockSuccessors(Chain, MBB, LoopHeaderBB, BlockFilter);
660 }
661}
662
663/// Mark a single block's successors as having one fewer preds.
664///
665/// Under normal circumstances, this is only called by markChainSuccessors,
666/// but if a block that was to be placed is completely tail-duplicated away,
667/// and was duplicated into the chain end, we need to redo markBlockSuccessors
668/// for just that block.
669void MachineBlockPlacement::markBlockSuccessors(
670 const BlockChain &Chain, const MachineBasicBlock *MBB,
671 const MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter) {
672 // Add any successors for which this is the only un-placed in-loop
673 // predecessor to the worklist as a viable candidate for CFG-neutral
674 // placement. No subsequent placement of this block will violate the CFG
675 // shape, so we get to use heuristics to choose a favorable placement.
676 for (MachineBasicBlock *Succ : MBB->successors()) {
677 if (BlockFilter && !BlockFilter->count(Succ))
678 continue;
679 BlockChain &SuccChain = *BlockToChain[Succ];
680 // Disregard edges within a fixed chain, or edges to the loop header.
681 if (&Chain == &SuccChain || Succ == LoopHeaderBB)
682 continue;
683
684 // This is a cross-chain edge that is within the loop, so decrement the
685 // loop predecessor count of the destination chain.
686 if (SuccChain.UnscheduledPredecessors == 0 ||
687 --SuccChain.UnscheduledPredecessors > 0)
688 continue;
689
690 auto *NewBB = *SuccChain.begin();
691 if (NewBB->isEHPad())
692 EHPadWorkList.push_back(NewBB);
693 else
694 BlockWorkList.push_back(NewBB);
695 }
696}
697
698/// This helper function collects the set of successors of block
699/// \p BB that are allowed to be its layout successors, and return
700/// the total branch probability of edges from \p BB to those
701/// blocks.
702BranchProbability MachineBlockPlacement::collectViableSuccessors(
703 const MachineBasicBlock *BB, const BlockChain &Chain,
704 const BlockFilterSet *BlockFilter,
706 // Adjust edge probabilities by excluding edges pointing to blocks that is
707 // either not in BlockFilter or is already in the current chain. Consider the
708 // following CFG:
709 //
710 // --->A
711 // | / \
712 // | B C
713 // | \ / \
714 // ----D E
715 //
716 // Assume A->C is very hot (>90%), and C->D has a 50% probability, then after
717 // A->C is chosen as a fall-through, D won't be selected as a successor of C
718 // due to CFG constraint (the probability of C->D is not greater than
719 // HotProb to break topo-order). If we exclude E that is not in BlockFilter
720 // when calculating the probability of C->D, D will be selected and we
721 // will get A C D B as the layout of this loop.
722 auto AdjustedSumProb = BranchProbability::getOne();
723 for (MachineBasicBlock *Succ : BB->successors()) {
724 bool SkipSucc = false;
725 if (Succ->isEHPad() || (BlockFilter && !BlockFilter->count(Succ))) {
726 SkipSucc = true;
727 } else {
728 BlockChain *SuccChain = BlockToChain[Succ];
729 if (SuccChain == &Chain) {
730 SkipSucc = true;
731 } else if (Succ != *SuccChain->begin()) {
732 LLVM_DEBUG(dbgs() << " " << getBlockName(Succ)
733 << " -> Mid chain!\n");
734 continue;
735 }
736 }
737 if (SkipSucc)
738 AdjustedSumProb -= MBPI->getEdgeProbability(BB, Succ);
739 else
740 Successors.push_back(Succ);
741 }
742
743 return AdjustedSumProb;
744}
745
746/// The helper function returns the branch probability that is adjusted
747/// or normalized over the new total \p AdjustedSumProb.
750 BranchProbability AdjustedSumProb) {
751 BranchProbability SuccProb;
752 uint32_t SuccProbN = OrigProb.getNumerator();
753 uint32_t SuccProbD = AdjustedSumProb.getNumerator();
754 if (SuccProbN >= SuccProbD)
755 SuccProb = BranchProbability::getOne();
756 else
757 SuccProb = BranchProbability(SuccProbN, SuccProbD);
758
759 return SuccProb;
760}
761
762/// Check if \p BB has exactly the successors in \p Successors.
763static bool
766 if (BB.succ_size() != Successors.size())
767 return false;
768 // We don't want to count self-loops
769 if (Successors.count(&BB))
770 return false;
771 for (MachineBasicBlock *Succ : BB.successors())
772 if (!Successors.count(Succ))
773 return false;
774 return true;
775}
776
777/// Check if a block should be tail duplicated to increase fallthrough
778/// opportunities.
779/// \p BB Block to check.
780bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
781 // Blocks with single successors don't create additional fallthrough
782 // opportunities. Don't duplicate them. TODO: When conditional exits are
783 // analyzable, allow them to be duplicated.
784 bool IsSimple = TailDup.isSimpleBB(BB);
785
786 if (BB->succ_size() == 1)
787 return false;
788 return TailDup.shouldTailDuplicate(IsSimple, *BB);
789}
790
791/// Compare 2 BlockFrequency's with a small penalty for \p A.
792/// In order to be conservative, we apply a X% penalty to account for
793/// increased icache pressure and static heuristics. For small frequencies
794/// we use only the numerators to improve accuracy. For simplicity, we assume the
795/// penalty is less than 100%
796/// TODO(iteratee): Use 64-bit fixed point edge frequencies everywhere.
798 uint64_t EntryFreq) {
799 BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
800 BlockFrequency Gain = A - B;
801 return (Gain / ThresholdProb).getFrequency() >= EntryFreq;
802}
803
804/// Check the edge frequencies to see if tail duplication will increase
805/// fallthroughs. It only makes sense to call this function when
806/// \p Succ would not be chosen otherwise. Tail duplication of \p Succ is
807/// always locally profitable if we would have picked \p Succ without
808/// considering duplication.
809bool MachineBlockPlacement::isProfitableToTailDup(
810 const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
811 BranchProbability QProb,
812 const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
813 // We need to do a probability calculation to make sure this is profitable.
814 // First: does succ have a successor that post-dominates? This affects the
815 // calculation. The 2 relevant cases are:
816 // BB BB
817 // | \Qout | \Qout
818 // P| C |P C
819 // = C' = C'
820 // | /Qin | /Qin
821 // | / | /
822 // Succ Succ
823 // / \ | \ V
824 // U/ =V |U \
825 // / \ = D
826 // D E | /
827 // | /
828 // |/
829 // PDom
830 // '=' : Branch taken for that CFG edge
831 // In the second case, Placing Succ while duplicating it into C prevents the
832 // fallthrough of Succ into either D or PDom, because they now have C as an
833 // unplaced predecessor
834
835 // Start by figuring out which case we fall into
836 MachineBasicBlock *PDom = nullptr;
838 // Only scan the relevant successors
839 auto AdjustedSuccSumProb =
840 collectViableSuccessors(Succ, Chain, BlockFilter, SuccSuccs);
841 BranchProbability PProb = MBPI->getEdgeProbability(BB, Succ);
842 auto BBFreq = MBFI->getBlockFreq(BB);
843 auto SuccFreq = MBFI->getBlockFreq(Succ);
844 BlockFrequency P = BBFreq * PProb;
845 BlockFrequency Qout = BBFreq * QProb;
846 uint64_t EntryFreq = MBFI->getEntryFreq();
847 // If there are no more successors, it is profitable to copy, as it strictly
848 // increases fallthrough.
849 if (SuccSuccs.size() == 0)
850 return greaterWithBias(P, Qout, EntryFreq);
851
852 auto BestSuccSucc = BranchProbability::getZero();
853 // Find the PDom or the best Succ if no PDom exists.
854 for (MachineBasicBlock *SuccSucc : SuccSuccs) {
855 auto Prob = MBPI->getEdgeProbability(Succ, SuccSucc);
856 if (Prob > BestSuccSucc)
857 BestSuccSucc = Prob;
858 if (PDom == nullptr)
859 if (MPDT->dominates(SuccSucc, Succ)) {
860 PDom = SuccSucc;
861 break;
862 }
863 }
864 // For the comparisons, we need to know Succ's best incoming edge that isn't
865 // from BB.
866 auto SuccBestPred = BlockFrequency(0);
867 for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
868 if (SuccPred == Succ || SuccPred == BB
869 || BlockToChain[SuccPred] == &Chain
870 || (BlockFilter && !BlockFilter->count(SuccPred)))
871 continue;
872 auto Freq = MBFI->getBlockFreq(SuccPred)
873 * MBPI->getEdgeProbability(SuccPred, Succ);
874 if (Freq > SuccBestPred)
875 SuccBestPred = Freq;
876 }
877 // Qin is Succ's best unplaced incoming edge that isn't BB
878 BlockFrequency Qin = SuccBestPred;
879 // If it doesn't have a post-dominating successor, here is the calculation:
880 // BB BB
881 // | \Qout | \
882 // P| C | =
883 // = C' | C
884 // | /Qin | |
885 // | / | C' (+Succ)
886 // Succ Succ /|
887 // / \ | \/ |
888 // U/ =V | == |
889 // / \ | / \|
890 // D E D E
891 // '=' : Branch taken for that CFG edge
892 // Cost in the first case is: P + V
893 // For this calculation, we always assume P > Qout. If Qout > P
894 // The result of this function will be ignored at the caller.
895 // Let F = SuccFreq - Qin
896 // Cost in the second case is: Qout + min(Qin, F) * U + max(Qin, F) * V
897
898 if (PDom == nullptr || !Succ->isSuccessor(PDom)) {
899 BranchProbability UProb = BestSuccSucc;
900 BranchProbability VProb = AdjustedSuccSumProb - UProb;
901 BlockFrequency F = SuccFreq - Qin;
902 BlockFrequency V = SuccFreq * VProb;
903 BlockFrequency QinU = std::min(Qin, F) * UProb;
904 BlockFrequency BaseCost = P + V;
905 BlockFrequency DupCost = Qout + QinU + std::max(Qin, F) * VProb;
906 return greaterWithBias(BaseCost, DupCost, EntryFreq);
907 }
908 BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom);
909 BranchProbability VProb = AdjustedSuccSumProb - UProb;
910 BlockFrequency U = SuccFreq * UProb;
911 BlockFrequency V = SuccFreq * VProb;
912 BlockFrequency F = SuccFreq - Qin;
913 // If there is a post-dominating successor, here is the calculation:
914 // BB BB BB BB
915 // | \Qout | \ | \Qout | \
916 // |P C | = |P C | =
917 // = C' |P C = C' |P C
918 // | /Qin | | | /Qin | |
919 // | / | C' (+Succ) | / | C' (+Succ)
920 // Succ Succ /| Succ Succ /|
921 // | \ V | \/ | | \ V | \/ |
922 // |U \ |U /\ =? |U = |U /\ |
923 // = D = = =?| | D | = =|
924 // | / |/ D | / |/ D
925 // | / | / | = | /
926 // |/ | / |/ | =
927 // Dom Dom Dom Dom
928 // '=' : Branch taken for that CFG edge
929 // The cost for taken branches in the first case is P + U
930 // Let F = SuccFreq - Qin
931 // The cost in the second case (assuming independence), given the layout:
932 // BB, Succ, (C+Succ), D, Dom or the layout:
933 // BB, Succ, D, Dom, (C+Succ)
934 // is Qout + max(F, Qin) * U + min(F, Qin)
935 // compare P + U vs Qout + P * U + Qin.
936 //
937 // The 3rd and 4th cases cover when Dom would be chosen to follow Succ.
938 //
939 // For the 3rd case, the cost is P + 2 * V
940 // For the 4th case, the cost is Qout + min(Qin, F) * U + max(Qin, F) * V + V
941 // We choose 4 over 3 when (P + V) > Qout + min(Qin, F) * U + max(Qin, F) * V
942 if (UProb > AdjustedSuccSumProb / 2 &&
943 !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom], UProb, UProb,
944 Chain, BlockFilter))
945 // Cases 3 & 4
946 return greaterWithBias(
947 (P + V), (Qout + std::max(Qin, F) * VProb + std::min(Qin, F) * UProb),
948 EntryFreq);
949 // Cases 1 & 2
950 return greaterWithBias((P + U),
951 (Qout + std::min(Qin, F) * AdjustedSuccSumProb +
952 std::max(Qin, F) * UProb),
953 EntryFreq);
954}
955
956/// Check for a trellis layout. \p BB is the upper part of a trellis if its
957/// successors form the lower part of a trellis. A successor set S forms the
958/// lower part of a trellis if all of the predecessors of S are either in S or
959/// have all of S as successors. We ignore trellises where BB doesn't have 2
960/// successors because for fewer than 2, it's trivial, and for 3 or greater they
961/// are very uncommon and complex to compute optimally. Allowing edges within S
962/// is not strictly a trellis, but the same algorithm works, so we allow it.
963bool MachineBlockPlacement::isTrellis(
964 const MachineBasicBlock *BB,
965 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
966 const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
967 // Technically BB could form a trellis with branching factor higher than 2.
968 // But that's extremely uncommon.
969 if (BB->succ_size() != 2 || ViableSuccs.size() != 2)
970 return false;
971
973 BB->succ_end());
974 // To avoid reviewing the same predecessors twice.
976
977 for (MachineBasicBlock *Succ : ViableSuccs) {
978 int PredCount = 0;
979 for (auto *SuccPred : Succ->predecessors()) {
980 // Allow triangle successors, but don't count them.
981 if (Successors.count(SuccPred)) {
982 // Make sure that it is actually a triangle.
983 for (MachineBasicBlock *CheckSucc : SuccPred->successors())
984 if (!Successors.count(CheckSucc))
985 return false;
986 continue;
987 }
988 const BlockChain *PredChain = BlockToChain[SuccPred];
989 if (SuccPred == BB || (BlockFilter && !BlockFilter->count(SuccPred)) ||
990 PredChain == &Chain || PredChain == BlockToChain[Succ])
991 continue;
992 ++PredCount;
993 // Perform the successor check only once.
994 if (!SeenPreds.insert(SuccPred).second)
995 continue;
996 if (!hasSameSuccessors(*SuccPred, Successors))
997 return false;
998 }
999 // If one of the successors has only BB as a predecessor, it is not a
1000 // trellis.
1001 if (PredCount < 1)
1002 return false;
1003 }
1004 return true;
1005}
1006
1007/// Pick the highest total weight pair of edges that can both be laid out.
1008/// The edges in \p Edges[0] are assumed to have a different destination than
1009/// the edges in \p Edges[1]. Simple counting shows that the best pair is either
1010/// the individual highest weight edges to the 2 different destinations, or in
1011/// case of a conflict, one of them should be replaced with a 2nd best edge.
1012std::pair<MachineBlockPlacement::WeightedEdge,
1013 MachineBlockPlacement::WeightedEdge>
1014MachineBlockPlacement::getBestNonConflictingEdges(
1015 const MachineBasicBlock *BB,
1017 Edges) {
1018 // Sort the edges, and then for each successor, find the best incoming
1019 // predecessor. If the best incoming predecessors aren't the same,
1020 // then that is clearly the best layout. If there is a conflict, one of the
1021 // successors will have to fallthrough from the second best predecessor. We
1022 // compare which combination is better overall.
1023
1024 // Sort for highest frequency.
1025 auto Cmp = [](WeightedEdge A, WeightedEdge B) { return A.Weight > B.Weight; };
1026
1027 llvm::stable_sort(Edges[0], Cmp);
1028 llvm::stable_sort(Edges[1], Cmp);
1029 auto BestA = Edges[0].begin();
1030 auto BestB = Edges[1].begin();
1031 // Arrange for the correct answer to be in BestA and BestB
1032 // If the 2 best edges don't conflict, the answer is already there.
1033 if (BestA->Src == BestB->Src) {
1034 // Compare the total fallthrough of (Best + Second Best) for both pairs
1035 auto SecondBestA = std::next(BestA);
1036 auto SecondBestB = std::next(BestB);
1037 BlockFrequency BestAScore = BestA->Weight + SecondBestB->Weight;
1038 BlockFrequency BestBScore = BestB->Weight + SecondBestA->Weight;
1039 if (BestAScore < BestBScore)
1040 BestA = SecondBestA;
1041 else
1042 BestB = SecondBestB;
1043 }
1044 // Arrange for the BB edge to be in BestA if it exists.
1045 if (BestB->Src == BB)
1046 std::swap(BestA, BestB);
1047 return std::make_pair(*BestA, *BestB);
1048}
1049
1050/// Get the best successor from \p BB based on \p BB being part of a trellis.
1051/// We only handle trellises with 2 successors, so the algorithm is
1052/// straightforward: Find the best pair of edges that don't conflict. We find
1053/// the best incoming edge for each successor in the trellis. If those conflict,
1054/// we consider which of them should be replaced with the second best.
1055/// Upon return the two best edges will be in \p BestEdges. If one of the edges
1056/// comes from \p BB, it will be in \p BestEdges[0]
1057MachineBlockPlacement::BlockAndTailDupResult
1058MachineBlockPlacement::getBestTrellisSuccessor(
1059 const MachineBasicBlock *BB,
1060 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
1061 BranchProbability AdjustedSumProb, const BlockChain &Chain,
1062 const BlockFilterSet *BlockFilter) {
1063
1064 BlockAndTailDupResult Result = {nullptr, false};
1066 BB->succ_end());
1067
1068 // We assume size 2 because it's common. For general n, we would have to do
1069 // the Hungarian algorithm, but it's not worth the complexity because more
1070 // than 2 successors is fairly uncommon, and a trellis even more so.
1071 if (Successors.size() != 2 || ViableSuccs.size() != 2)
1072 return Result;
1073
1074 // Collect the edge frequencies of all edges that form the trellis.
1076 int SuccIndex = 0;
1077 for (auto *Succ : ViableSuccs) {
1078 for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
1079 // Skip any placed predecessors that are not BB
1080 if (SuccPred != BB)
1081 if ((BlockFilter && !BlockFilter->count(SuccPred)) ||
1082 BlockToChain[SuccPred] == &Chain ||
1083 BlockToChain[SuccPred] == BlockToChain[Succ])
1084 continue;
1085 BlockFrequency EdgeFreq = MBFI->getBlockFreq(SuccPred) *
1086 MBPI->getEdgeProbability(SuccPred, Succ);
1087 Edges[SuccIndex].push_back({EdgeFreq, SuccPred, Succ});
1088 }
1089 ++SuccIndex;
1090 }
1091
1092 // Pick the best combination of 2 edges from all the edges in the trellis.
1093 WeightedEdge BestA, BestB;
1094 std::tie(BestA, BestB) = getBestNonConflictingEdges(BB, Edges);
1095
1096 if (BestA.Src != BB) {
1097 // If we have a trellis, and BB doesn't have the best fallthrough edges,
1098 // we shouldn't choose any successor. We've already looked and there's a
1099 // better fallthrough edge for all the successors.
1100 LLVM_DEBUG(dbgs() << "Trellis, but not one of the chosen edges.\n");
1101 return Result;
1102 }
1103
1104 // Did we pick the triangle edge? If tail-duplication is profitable, do
1105 // that instead. Otherwise merge the triangle edge now while we know it is
1106 // optimal.
1107 if (BestA.Dest == BestB.Src) {
1108 // The edges are BB->Succ1->Succ2, and we're looking to see if BB->Succ2
1109 // would be better.
1110 MachineBasicBlock *Succ1 = BestA.Dest;
1111 MachineBasicBlock *Succ2 = BestB.Dest;
1112 // Check to see if tail-duplication would be profitable.
1113 if (allowTailDupPlacement() && shouldTailDuplicate(Succ2) &&
1114 canTailDuplicateUnplacedPreds(BB, Succ2, Chain, BlockFilter) &&
1115 isProfitableToTailDup(BB, Succ2, MBPI->getEdgeProbability(BB, Succ1),
1116 Chain, BlockFilter)) {
1118 MBPI->getEdgeProbability(BB, Succ2), AdjustedSumProb);
1119 dbgs() << " Selected: " << getBlockName(Succ2)
1120 << ", probability: " << Succ2Prob
1121 << " (Tail Duplicate)\n");
1122 Result.BB = Succ2;
1123 Result.ShouldTailDup = true;
1124 return Result;
1125 }
1126 }
1127 // We have already computed the optimal edge for the other side of the
1128 // trellis.
1129 ComputedEdges[BestB.Src] = { BestB.Dest, false };
1130
1131 auto TrellisSucc = BestA.Dest;
1133 MBPI->getEdgeProbability(BB, TrellisSucc), AdjustedSumProb);
1134 dbgs() << " Selected: " << getBlockName(TrellisSucc)
1135 << ", probability: " << SuccProb << " (Trellis)\n");
1136 Result.BB = TrellisSucc;
1137 return Result;
1138}
1139
1140/// When the option allowTailDupPlacement() is on, this method checks if the
1141/// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated
1142/// into all of its unplaced, unfiltered predecessors, that are not BB.
1143bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
1144 const MachineBasicBlock *BB, MachineBasicBlock *Succ,
1145 const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
1146 if (!shouldTailDuplicate(Succ))
1147 return false;
1148
1149 // The result of canTailDuplicate.
1150 bool Duplicate = true;
1151 // Number of possible duplication.
1152 unsigned int NumDup = 0;
1153
1154 // For CFG checking.
1156 BB->succ_end());
1157 for (MachineBasicBlock *Pred : Succ->predecessors()) {
1158 // Make sure all unplaced and unfiltered predecessors can be
1159 // tail-duplicated into.
1160 // Skip any blocks that are already placed or not in this loop.
1161 if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred))
1162 || (BlockToChain[Pred] == &Chain && !Succ->succ_empty()))
1163 continue;
1164 if (!TailDup.canTailDuplicate(Succ, Pred)) {
1165 if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors))
1166 // This will result in a trellis after tail duplication, so we don't
1167 // need to copy Succ into this predecessor. In the presence
1168 // of a trellis tail duplication can continue to be profitable.
1169 // For example:
1170 // A A
1171 // |\ |\
1172 // | \ | \
1173 // | C | C+BB
1174 // | / | |
1175 // |/ | |
1176 // BB => BB |
1177 // |\ |\/|
1178 // | \ |/\|
1179 // | D | D
1180 // | / | /
1181 // |/ |/
1182 // Succ Succ
1183 //
1184 // After BB was duplicated into C, the layout looks like the one on the
1185 // right. BB and C now have the same successors. When considering
1186 // whether Succ can be duplicated into all its unplaced predecessors, we
1187 // ignore C.
1188 // We can do this because C already has a profitable fallthrough, namely
1189 // D. TODO(iteratee): ignore sufficiently cold predecessors for
1190 // duplication and for this test.
1191 //
1192 // This allows trellises to be laid out in 2 separate chains
1193 // (A,B,Succ,...) and later (C,D,...) This is a reasonable heuristic
1194 // because it allows the creation of 2 fallthrough paths with links
1195 // between them, and we correctly identify the best layout for these
1196 // CFGs. We want to extend trellises that the user created in addition
1197 // to trellises created by tail-duplication, so we just look for the
1198 // CFG.
1199 continue;
1200 Duplicate = false;
1201 continue;
1202 }
1203 NumDup++;
1204 }
1205
1206 // No possible duplication in current filter set.
1207 if (NumDup == 0)
1208 return false;
1209
1210 // If profile information is available, findDuplicateCandidates can do more
1211 // precise benefit analysis.
1212 if (F->getFunction().hasProfileData())
1213 return true;
1214
1215 // This is mainly for function exit BB.
1216 // The integrated tail duplication is really designed for increasing
1217 // fallthrough from predecessors from Succ to its successors. We may need
1218 // other machanism to handle different cases.
1219 if (Succ->succ_empty())
1220 return true;
1221
1222 // Plus the already placed predecessor.
1223 NumDup++;
1224
1225 // If the duplication candidate has more unplaced predecessors than
1226 // successors, the extra duplication can't bring more fallthrough.
1227 //
1228 // Pred1 Pred2 Pred3
1229 // \ | /
1230 // \ | /
1231 // \ | /
1232 // Dup
1233 // / \
1234 // / \
1235 // Succ1 Succ2
1236 //
1237 // In this example Dup has 2 successors and 3 predecessors, duplication of Dup
1238 // can increase the fallthrough from Pred1 to Succ1 and from Pred2 to Succ2,
1239 // but the duplication into Pred3 can't increase fallthrough.
1240 //
1241 // A small number of extra duplication may not hurt too much. We need a better
1242 // heuristic to handle it.
1243 if ((NumDup > Succ->succ_size()) || !Duplicate)
1244 return false;
1245
1246 return true;
1247}
1248
1249/// Find chains of triangles where we believe it would be profitable to
1250/// tail-duplicate them all, but a local analysis would not find them.
1251/// There are 3 ways this can be profitable:
1252/// 1) The post-dominators marked 50% are actually taken 55% (This shrinks with
1253/// longer chains)
1254/// 2) The chains are statically correlated. Branch probabilities have a very
1255/// U-shaped distribution.
1256/// [http://nrs.harvard.edu/urn-3:HUL.InstRepos:24015805]
1257/// If the branches in a chain are likely to be from the same side of the
1258/// distribution as their predecessor, but are independent at runtime, this
1259/// transformation is profitable. (Because the cost of being wrong is a small
1260/// fixed cost, unlike the standard triangle layout where the cost of being
1261/// wrong scales with the # of triangles.)
1262/// 3) The chains are dynamically correlated. If the probability that a previous
1263/// branch was taken positively influences whether the next branch will be
1264/// taken
1265/// We believe that 2 and 3 are common enough to justify the small margin in 1.
1266void MachineBlockPlacement::precomputeTriangleChains() {
1267 struct TriangleChain {
1268 std::vector<MachineBasicBlock *> Edges;
1269
1270 TriangleChain(MachineBasicBlock *src, MachineBasicBlock *dst)
1271 : Edges({src, dst}) {}
1272
1273 void append(MachineBasicBlock *dst) {
1274 assert(getKey()->isSuccessor(dst) &&
1275 "Attempting to append a block that is not a successor.");
1276 Edges.push_back(dst);
1277 }
1278
1279 unsigned count() const { return Edges.size() - 1; }
1280
1281 MachineBasicBlock *getKey() const {
1282 return Edges.back();
1283 }
1284 };
1285
1286 if (TriangleChainCount == 0)
1287 return;
1288
1289 LLVM_DEBUG(dbgs() << "Pre-computing triangle chains.\n");
1290 // Map from last block to the chain that contains it. This allows us to extend
1291 // chains as we find new triangles.
1293 for (MachineBasicBlock &BB : *F) {
1294 // If BB doesn't have 2 successors, it doesn't start a triangle.
1295 if (BB.succ_size() != 2)
1296 continue;
1297 MachineBasicBlock *PDom = nullptr;
1298 for (MachineBasicBlock *Succ : BB.successors()) {
1299 if (!MPDT->dominates(Succ, &BB))
1300 continue;
1301 PDom = Succ;
1302 break;
1303 }
1304 // If BB doesn't have a post-dominating successor, it doesn't form a
1305 // triangle.
1306 if (PDom == nullptr)
1307 continue;
1308 // If PDom has a hint that it is low probability, skip this triangle.
1309 if (MBPI->getEdgeProbability(&BB, PDom) < BranchProbability(50, 100))
1310 continue;
1311 // If PDom isn't eligible for duplication, this isn't the kind of triangle
1312 // we're looking for.
1313 if (!shouldTailDuplicate(PDom))
1314 continue;
1315 bool CanTailDuplicate = true;
1316 // If PDom can't tail-duplicate into it's non-BB predecessors, then this
1317 // isn't the kind of triangle we're looking for.
1318 for (MachineBasicBlock* Pred : PDom->predecessors()) {
1319 if (Pred == &BB)
1320 continue;
1321 if (!TailDup.canTailDuplicate(PDom, Pred)) {
1322 CanTailDuplicate = false;
1323 break;
1324 }
1325 }
1326 // If we can't tail-duplicate PDom to its predecessors, then skip this
1327 // triangle.
1328 if (!CanTailDuplicate)
1329 continue;
1330
1331 // Now we have an interesting triangle. Insert it if it's not part of an
1332 // existing chain.
1333 // Note: This cannot be replaced with a call insert() or emplace() because
1334 // the find key is BB, but the insert/emplace key is PDom.
1335 auto Found = TriangleChainMap.find(&BB);
1336 // If it is, remove the chain from the map, grow it, and put it back in the
1337 // map with the end as the new key.
1338 if (Found != TriangleChainMap.end()) {
1339 TriangleChain Chain = std::move(Found->second);
1340 TriangleChainMap.erase(Found);
1341 Chain.append(PDom);
1342 TriangleChainMap.insert(std::make_pair(Chain.getKey(), std::move(Chain)));
1343 } else {
1344 auto InsertResult = TriangleChainMap.try_emplace(PDom, &BB, PDom);
1345 assert(InsertResult.second && "Block seen twice.");
1346 (void)InsertResult;
1347 }
1348 }
1349
1350 // Iterating over a DenseMap is safe here, because the only thing in the body
1351 // of the loop is inserting into another DenseMap (ComputedEdges).
1352 // ComputedEdges is never iterated, so this doesn't lead to non-determinism.
1353 for (auto &ChainPair : TriangleChainMap) {
1354 TriangleChain &Chain = ChainPair.second;
1355 // Benchmarking has shown that due to branch correlation duplicating 2 or
1356 // more triangles is profitable, despite the calculations assuming
1357 // independence.
1358 if (Chain.count() < TriangleChainCount)
1359 continue;
1360 MachineBasicBlock *dst = Chain.Edges.back();
1361 Chain.Edges.pop_back();
1362 for (MachineBasicBlock *src : reverse(Chain.Edges)) {
1363 LLVM_DEBUG(dbgs() << "Marking edge: " << getBlockName(src) << "->"
1364 << getBlockName(dst)
1365 << " as pre-computed based on triangles.\n");
1366
1367 auto InsertResult = ComputedEdges.insert({src, {dst, true}});
1368 assert(InsertResult.second && "Block seen twice.");
1369 (void)InsertResult;
1370
1371 dst = src;
1372 }
1373 }
1374}
1375
1376// When profile is not present, return the StaticLikelyProb.
1377// When profile is available, we need to handle the triangle-shape CFG.
1379 const MachineBasicBlock *BB) {
1380 if (!BB->getParent()->getFunction().hasProfileData())
1382 if (BB->succ_size() == 2) {
1383 const MachineBasicBlock *Succ1 = *BB->succ_begin();
1384 const MachineBasicBlock *Succ2 = *(BB->succ_begin() + 1);
1385 if (Succ1->isSuccessor(Succ2) || Succ2->isSuccessor(Succ1)) {
1386 /* See case 1 below for the cost analysis. For BB->Succ to
1387 * be taken with smaller cost, the following needs to hold:
1388 * Prob(BB->Succ) > 2 * Prob(BB->Pred)
1389 * So the threshold T in the calculation below
1390 * (1-T) * Prob(BB->Succ) > T * Prob(BB->Pred)
1391 * So T / (1 - T) = 2, Yielding T = 2/3
1392 * Also adding user specified branch bias, we have
1393 * T = (2/3)*(ProfileLikelyProb/50)
1394 * = (2*ProfileLikelyProb)/150)
1395 */
1396 return BranchProbability(2 * ProfileLikelyProb, 150);
1397 }
1398 }
1400}
1401
1402/// Checks to see if the layout candidate block \p Succ has a better layout
1403/// predecessor than \c BB. If yes, returns true.
1404/// \p SuccProb: The probability adjusted for only remaining blocks.
1405/// Only used for logging
1406/// \p RealSuccProb: The un-adjusted probability.
1407/// \p Chain: The chain that BB belongs to and Succ is being considered for.
1408/// \p BlockFilter: if non-null, the set of blocks that make up the loop being
1409/// considered
1410bool MachineBlockPlacement::hasBetterLayoutPredecessor(
1411 const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
1412 const BlockChain &SuccChain, BranchProbability SuccProb,
1413 BranchProbability RealSuccProb, const BlockChain &Chain,
1414 const BlockFilterSet *BlockFilter) {
1415
1416 // There isn't a better layout when there are no unscheduled predecessors.
1417 if (SuccChain.UnscheduledPredecessors == 0)
1418 return false;
1419
1420 // There are two basic scenarios here:
1421 // -------------------------------------
1422 // Case 1: triangular shape CFG (if-then):
1423 // BB
1424 // | \
1425 // | \
1426 // | Pred
1427 // | /
1428 // Succ
1429 // In this case, we are evaluating whether to select edge -> Succ, e.g.
1430 // set Succ as the layout successor of BB. Picking Succ as BB's
1431 // successor breaks the CFG constraints (FIXME: define these constraints).
1432 // With this layout, Pred BB
1433 // is forced to be outlined, so the overall cost will be cost of the
1434 // branch taken from BB to Pred, plus the cost of back taken branch
1435 // from Pred to Succ, as well as the additional cost associated
1436 // with the needed unconditional jump instruction from Pred To Succ.
1437
1438 // The cost of the topological order layout is the taken branch cost
1439 // from BB to Succ, so to make BB->Succ a viable candidate, the following
1440 // must hold:
1441 // 2 * freq(BB->Pred) * taken_branch_cost + unconditional_jump_cost
1442 // < freq(BB->Succ) * taken_branch_cost.
1443 // Ignoring unconditional jump cost, we get
1444 // freq(BB->Succ) > 2 * freq(BB->Pred), i.e.,
1445 // prob(BB->Succ) > 2 * prob(BB->Pred)
1446 //
1447 // When real profile data is available, we can precisely compute the
1448 // probability threshold that is needed for edge BB->Succ to be considered.
1449 // Without profile data, the heuristic requires the branch bias to be
1450 // a lot larger to make sure the signal is very strong (e.g. 80% default).
1451 // -----------------------------------------------------------------
1452 // Case 2: diamond like CFG (if-then-else):
1453 // S
1454 // / \
1455 // | \
1456 // BB Pred
1457 // \ /
1458 // Succ
1459 // ..
1460 //
1461 // The current block is BB and edge BB->Succ is now being evaluated.
1462 // Note that edge S->BB was previously already selected because
1463 // prob(S->BB) > prob(S->Pred).
1464 // At this point, 2 blocks can be placed after BB: Pred or Succ. If we
1465 // choose Pred, we will have a topological ordering as shown on the left
1466 // in the picture below. If we choose Succ, we have the solution as shown
1467 // on the right:
1468 //
1469 // topo-order:
1470 //
1471 // S----- ---S
1472 // | | | |
1473 // ---BB | | BB
1474 // | | | |
1475 // | Pred-- | Succ--
1476 // | | | |
1477 // ---Succ ---Pred--
1478 //
1479 // cost = freq(S->Pred) + freq(BB->Succ) cost = 2 * freq (S->Pred)
1480 // = freq(S->Pred) + freq(S->BB)
1481 //
1482 // If we have profile data (i.e, branch probabilities can be trusted), the
1483 // cost (number of taken branches) with layout S->BB->Succ->Pred is 2 *
1484 // freq(S->Pred) while the cost of topo order is freq(S->Pred) + freq(S->BB).
1485 // We know Prob(S->BB) > Prob(S->Pred), so freq(S->BB) > freq(S->Pred), which
1486 // means the cost of topological order is greater.
1487 // When profile data is not available, however, we need to be more
1488 // conservative. If the branch prediction is wrong, breaking the topo-order
1489 // will actually yield a layout with large cost. For this reason, we need
1490 // strong biased branch at block S with Prob(S->BB) in order to select
1491 // BB->Succ. This is equivalent to looking the CFG backward with backward
1492 // edge: Prob(Succ->BB) needs to >= HotProb in order to be selected (without
1493 // profile data).
1494 // --------------------------------------------------------------------------
1495 // Case 3: forked diamond
1496 // S
1497 // / \
1498 // / \
1499 // BB Pred
1500 // | \ / |
1501 // | \ / |
1502 // | X |
1503 // | / \ |
1504 // | / \ |
1505 // S1 S2
1506 //
1507 // The current block is BB and edge BB->S1 is now being evaluated.
1508 // As above S->BB was already selected because
1509 // prob(S->BB) > prob(S->Pred). Assume that prob(BB->S1) >= prob(BB->S2).
1510 //
1511 // topo-order:
1512 //
1513 // S-------| ---S
1514 // | | | |
1515 // ---BB | | BB
1516 // | | | |
1517 // | Pred----| | S1----
1518 // | | | |
1519 // --(S1 or S2) ---Pred--
1520 // |
1521 // S2
1522 //
1523 // topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2)
1524 // + min(freq(Pred->S1), freq(Pred->S2))
1525 // Non-topo-order cost:
1526 // non-topo-cost = 2 * freq(S->Pred) + freq(BB->S2).
1527 // To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2))
1528 // is 0. Then the non topo layout is better when
1529 // freq(S->Pred) < freq(BB->S1).
1530 // This is exactly what is checked below.
1531 // Note there are other shapes that apply (Pred may not be a single block,
1532 // but they all fit this general pattern.)
1534
1535 // Make sure that a hot successor doesn't have a globally more
1536 // important predecessor.
1537 BlockFrequency CandidateEdgeFreq = MBFI->getBlockFreq(BB) * RealSuccProb;
1538 bool BadCFGConflict = false;
1539
1540 for (MachineBasicBlock *Pred : Succ->predecessors()) {
1541 BlockChain *PredChain = BlockToChain[Pred];
1542 if (Pred == Succ || PredChain == &SuccChain ||
1543 (BlockFilter && !BlockFilter->count(Pred)) ||
1544 PredChain == &Chain || Pred != *std::prev(PredChain->end()) ||
1545 // This check is redundant except for look ahead. This function is
1546 // called for lookahead by isProfitableToTailDup when BB hasn't been
1547 // placed yet.
1548 (Pred == BB))
1549 continue;
1550 // Do backward checking.
1551 // For all cases above, we need a backward checking to filter out edges that
1552 // are not 'strongly' biased.
1553 // BB Pred
1554 // \ /
1555 // Succ
1556 // We select edge BB->Succ if
1557 // freq(BB->Succ) > freq(Succ) * HotProb
1558 // i.e. freq(BB->Succ) > freq(BB->Succ) * HotProb + freq(Pred->Succ) *
1559 // HotProb
1560 // i.e. freq((BB->Succ) * (1 - HotProb) > freq(Pred->Succ) * HotProb
1561 // Case 1 is covered too, because the first equation reduces to:
1562 // prob(BB->Succ) > HotProb. (freq(Succ) = freq(BB) for a triangle)
1563 BlockFrequency PredEdgeFreq =
1564 MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Succ);
1565 if (PredEdgeFreq * HotProb >= CandidateEdgeFreq * HotProb.getCompl()) {
1566 BadCFGConflict = true;
1567 break;
1568 }
1569 }
1570
1571 if (BadCFGConflict) {
1572 LLVM_DEBUG(dbgs() << " Not a candidate: " << getBlockName(Succ) << " -> "
1573 << SuccProb << " (prob) (non-cold CFG conflict)\n");
1574 return true;
1575 }
1576
1577 return false;
1578}
1579
1580/// Select the best successor for a block.
1581///
1582/// This looks across all successors of a particular block and attempts to
1583/// select the "best" one to be the layout successor. It only considers direct
1584/// successors which also pass the block filter. It will attempt to avoid
1585/// breaking CFG structure, but cave and break such structures in the case of
1586/// very hot successor edges.
1587///
1588/// \returns The best successor block found, or null if none are viable, along
1589/// with a boolean indicating if tail duplication is necessary.
1590MachineBlockPlacement::BlockAndTailDupResult
1591MachineBlockPlacement::selectBestSuccessor(
1592 const MachineBasicBlock *BB, const BlockChain &Chain,
1593 const BlockFilterSet *BlockFilter) {
1594 const BranchProbability HotProb(StaticLikelyProb, 100);
1595
1596 BlockAndTailDupResult BestSucc = { nullptr, false };
1597 auto BestProb = BranchProbability::getZero();
1598
1600 auto AdjustedSumProb =
1601 collectViableSuccessors(BB, Chain, BlockFilter, Successors);
1602
1603 LLVM_DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB)
1604 << "\n");
1605
1606 // if we already precomputed the best successor for BB, return that if still
1607 // applicable.
1608 auto FoundEdge = ComputedEdges.find(BB);
1609 if (FoundEdge != ComputedEdges.end()) {
1610 MachineBasicBlock *Succ = FoundEdge->second.BB;
1611 ComputedEdges.erase(FoundEdge);
1612 BlockChain *SuccChain = BlockToChain[Succ];
1613 if (BB->isSuccessor(Succ) && (!BlockFilter || BlockFilter->count(Succ)) &&
1614 SuccChain != &Chain && Succ == *SuccChain->begin())
1615 return FoundEdge->second;
1616 }
1617
1618 // if BB is part of a trellis, Use the trellis to determine the optimal
1619 // fallthrough edges
1620 if (isTrellis(BB, Successors, Chain, BlockFilter))
1621 return getBestTrellisSuccessor(BB, Successors, AdjustedSumProb, Chain,
1622 BlockFilter);
1623
1624 // For blocks with CFG violations, we may be able to lay them out anyway with
1625 // tail-duplication. We keep this vector so we can perform the probability
1626 // calculations the minimum number of times.
1628 DupCandidates;
1629 for (MachineBasicBlock *Succ : Successors) {
1630 auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);
1631 BranchProbability SuccProb =
1632 getAdjustedProbability(RealSuccProb, AdjustedSumProb);
1633
1634 BlockChain &SuccChain = *BlockToChain[Succ];
1635 // Skip the edge \c BB->Succ if block \c Succ has a better layout
1636 // predecessor that yields lower global cost.
1637 if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb,
1638 Chain, BlockFilter)) {
1639 // If tail duplication would make Succ profitable, place it.
1640 if (allowTailDupPlacement() && shouldTailDuplicate(Succ))
1641 DupCandidates.emplace_back(SuccProb, Succ);
1642 continue;
1643 }
1644
1645 LLVM_DEBUG(
1646 dbgs() << " Candidate: " << getBlockName(Succ)
1647 << ", probability: " << SuccProb
1648 << (SuccChain.UnscheduledPredecessors != 0 ? " (CFG break)" : "")
1649 << "\n");
1650
1651 if (BestSucc.BB && BestProb >= SuccProb) {
1652 LLVM_DEBUG(dbgs() << " Not the best candidate, continuing\n");
1653 continue;
1654 }
1655
1656 LLVM_DEBUG(dbgs() << " Setting it as best candidate\n");
1657 BestSucc.BB = Succ;
1658 BestProb = SuccProb;
1659 }
1660 // Handle the tail duplication candidates in order of decreasing probability.
1661 // Stop at the first one that is profitable. Also stop if they are less
1662 // profitable than BestSucc. Position is important because we preserve it and
1663 // prefer first best match. Here we aren't comparing in order, so we capture
1664 // the position instead.
1665 llvm::stable_sort(DupCandidates,
1666 [](std::tuple<BranchProbability, MachineBasicBlock *> L,
1667 std::tuple<BranchProbability, MachineBasicBlock *> R) {
1668 return std::get<0>(L) > std::get<0>(R);
1669 });
1670 for (auto &Tup : DupCandidates) {
1671 BranchProbability DupProb;
1672 MachineBasicBlock *Succ;
1673 std::tie(DupProb, Succ) = Tup;
1674 if (DupProb < BestProb)
1675 break;
1676 if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter)
1677 && (isProfitableToTailDup(BB, Succ, BestProb, Chain, BlockFilter))) {
1678 LLVM_DEBUG(dbgs() << " Candidate: " << getBlockName(Succ)
1679 << ", probability: " << DupProb
1680 << " (Tail Duplicate)\n");
1681 BestSucc.BB = Succ;
1682 BestSucc.ShouldTailDup = true;
1683 break;
1684 }
1685 }
1686
1687 if (BestSucc.BB)
1688 LLVM_DEBUG(dbgs() << " Selected: " << getBlockName(BestSucc.BB) << "\n");
1689
1690 return BestSucc;
1691}
1692
1693/// Select the best block from a worklist.
1694///
1695/// This looks through the provided worklist as a list of candidate basic
1696/// blocks and select the most profitable one to place. The definition of
1697/// profitable only really makes sense in the context of a loop. This returns
1698/// the most frequently visited block in the worklist, which in the case of
1699/// a loop, is the one most desirable to be physically close to the rest of the
1700/// loop body in order to improve i-cache behavior.
1701///
1702/// \returns The best block found, or null if none are viable.
1703MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
1704 const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) {
1705 // Once we need to walk the worklist looking for a candidate, cleanup the
1706 // worklist of already placed entries.
1707 // FIXME: If this shows up on profiles, it could be folded (at the cost of
1708 // some code complexity) into the loop below.
1709 llvm::erase_if(WorkList, [&](MachineBasicBlock *BB) {
1710 return BlockToChain.lookup(BB) == &Chain;
1711 });
1712
1713 if (WorkList.empty())
1714 return nullptr;
1715
1716 bool IsEHPad = WorkList[0]->isEHPad();
1717
1718 MachineBasicBlock *BestBlock = nullptr;
1719 BlockFrequency BestFreq;
1720 for (MachineBasicBlock *MBB : WorkList) {
1721 assert(MBB->isEHPad() == IsEHPad &&
1722 "EHPad mismatch between block and work list.");
1723
1724 BlockChain &SuccChain = *BlockToChain[MBB];
1725 if (&SuccChain == &Chain)
1726 continue;
1727
1728 assert(SuccChain.UnscheduledPredecessors == 0 &&
1729 "Found CFG-violating block");
1730
1731 BlockFrequency CandidateFreq = MBFI->getBlockFreq(MBB);
1732 LLVM_DEBUG(dbgs() << " " << getBlockName(MBB) << " -> ";
1733 MBFI->printBlockFreq(dbgs(), CandidateFreq) << " (freq)\n");
1734
1735 // For ehpad, we layout the least probable first as to avoid jumping back
1736 // from least probable landingpads to more probable ones.
1737 //
1738 // FIXME: Using probability is probably (!) not the best way to achieve
1739 // this. We should probably have a more principled approach to layout
1740 // cleanup code.
1741 //
1742 // The goal is to get:
1743 //
1744 // +--------------------------+
1745 // | V
1746 // InnerLp -> InnerCleanup OuterLp -> OuterCleanup -> Resume
1747 //
1748 // Rather than:
1749 //
1750 // +-------------------------------------+
1751 // V |
1752 // OuterLp -> OuterCleanup -> Resume InnerLp -> InnerCleanup
1753 if (BestBlock && (IsEHPad ^ (BestFreq >= CandidateFreq)))
1754 continue;
1755
1756 BestBlock = MBB;
1757 BestFreq = CandidateFreq;
1758 }
1759
1760 return BestBlock;
1761}
1762
1763/// Retrieve the first unplaced basic block.
1764///
1765/// This routine is called when we are unable to use the CFG to walk through
1766/// all of the basic blocks and form a chain due to unnatural loops in the CFG.
1767/// We walk through the function's blocks in order, starting from the
1768/// LastUnplacedBlockIt. We update this iterator on each call to avoid
1769/// re-scanning the entire sequence on repeated calls to this routine.
1770MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
1771 const BlockChain &PlacedChain,
1772 MachineFunction::iterator &PrevUnplacedBlockIt,
1773 const BlockFilterSet *BlockFilter) {
1774 for (MachineFunction::iterator I = PrevUnplacedBlockIt, E = F->end(); I != E;
1775 ++I) {
1776 if (BlockFilter && !BlockFilter->count(&*I))
1777 continue;
1778 if (BlockToChain[&*I] != &PlacedChain) {
1779 PrevUnplacedBlockIt = I;
1780 // Now select the head of the chain to which the unplaced block belongs
1781 // as the block to place. This will force the entire chain to be placed,
1782 // and satisfies the requirements of merging chains.
1783 return *BlockToChain[&*I]->begin();
1784 }
1785 }
1786 return nullptr;
1787}
1788
1789void MachineBlockPlacement::fillWorkLists(
1790 const MachineBasicBlock *MBB,
1791 SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
1792 const BlockFilterSet *BlockFilter = nullptr) {
1793 BlockChain &Chain = *BlockToChain[MBB];
1794 if (!UpdatedPreds.insert(&Chain).second)
1795 return;
1796
1797 assert(
1798 Chain.UnscheduledPredecessors == 0 &&
1799 "Attempting to place block with unscheduled predecessors in worklist.");
1800 for (MachineBasicBlock *ChainBB : Chain) {
1801 assert(BlockToChain[ChainBB] == &Chain &&
1802 "Block in chain doesn't match BlockToChain map.");
1803 for (MachineBasicBlock *Pred : ChainBB->predecessors()) {
1804 if (BlockFilter && !BlockFilter->count(Pred))
1805 continue;
1806 if (BlockToChain[Pred] == &Chain)
1807 continue;
1808 ++Chain.UnscheduledPredecessors;
1809 }
1810 }
1811
1812 if (Chain.UnscheduledPredecessors != 0)
1813 return;
1814
1815 MachineBasicBlock *BB = *Chain.begin();
1816 if (BB->isEHPad())
1817 EHPadWorkList.push_back(BB);
1818 else
1819 BlockWorkList.push_back(BB);
1820}
1821
1822void MachineBlockPlacement::buildChain(
1823 const MachineBasicBlock *HeadBB, BlockChain &Chain,
1824 BlockFilterSet *BlockFilter) {
1825 assert(HeadBB && "BB must not be null.\n");
1826 assert(BlockToChain[HeadBB] == &Chain && "BlockToChainMap mis-match.\n");
1827 MachineFunction::iterator PrevUnplacedBlockIt = F->begin();
1828
1829 const MachineBasicBlock *LoopHeaderBB = HeadBB;
1830 markChainSuccessors(Chain, LoopHeaderBB, BlockFilter);
1831 MachineBasicBlock *BB = *std::prev(Chain.end());
1832 while (true) {
1833 assert(BB && "null block found at end of chain in loop.");
1834 assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match in loop.");
1835 assert(*std::prev(Chain.end()) == BB && "BB Not found at end of chain.");
1836
1837
1838 // Look for the best viable successor if there is one to place immediately
1839 // after this block.
1840 auto Result = selectBestSuccessor(BB, Chain, BlockFilter);
1841 MachineBasicBlock* BestSucc = Result.BB;
1842 bool ShouldTailDup = Result.ShouldTailDup;
1843 if (allowTailDupPlacement())
1844 ShouldTailDup |= (BestSucc && canTailDuplicateUnplacedPreds(BB, BestSucc,
1845 Chain,
1846 BlockFilter));
1847
1848 // If an immediate successor isn't available, look for the best viable
1849 // block among those we've identified as not violating the loop's CFG at
1850 // this point. This won't be a fallthrough, but it will increase locality.
1851 if (!BestSucc)
1852 BestSucc = selectBestCandidateBlock(Chain, BlockWorkList);
1853 if (!BestSucc)
1854 BestSucc = selectBestCandidateBlock(Chain, EHPadWorkList);
1855
1856 if (!BestSucc) {
1857 BestSucc = getFirstUnplacedBlock(Chain, PrevUnplacedBlockIt, BlockFilter);
1858 if (!BestSucc)
1859 break;
1860
1861 LLVM_DEBUG(dbgs() << "Unnatural loop CFG detected, forcibly merging the "
1862 "layout successor until the CFG reduces\n");
1863 }
1864
1865 // Placement may have changed tail duplication opportunities.
1866 // Check for that now.
1867 if (allowTailDupPlacement() && BestSucc && ShouldTailDup) {
1868 repeatedlyTailDuplicateBlock(BestSucc, BB, LoopHeaderBB, Chain,
1869 BlockFilter, PrevUnplacedBlockIt);
1870 // If the chosen successor was duplicated into BB, don't bother laying
1871 // it out, just go round the loop again with BB as the chain end.
1872 if (!BB->isSuccessor(BestSucc))
1873 continue;
1874 }
1875
1876 // Place this block, updating the datastructures to reflect its placement.
1877 BlockChain &SuccChain = *BlockToChain[BestSucc];
1878 // Zero out UnscheduledPredecessors for the successor we're about to merge in case
1879 // we selected a successor that didn't fit naturally into the CFG.
1880 SuccChain.UnscheduledPredecessors = 0;
1881 LLVM_DEBUG(dbgs() << "Merging from " << getBlockName(BB) << " to "
1882 << getBlockName(BestSucc) << "\n");
1883 markChainSuccessors(SuccChain, LoopHeaderBB, BlockFilter);
1884 Chain.merge(BestSucc, &SuccChain);
1885 BB = *std::prev(Chain.end());
1886 }
1887
1888 LLVM_DEBUG(dbgs() << "Finished forming chain for header block "
1889 << getBlockName(*Chain.begin()) << "\n");
1890}
1891
1892// If bottom of block BB has only one successor OldTop, in most cases it is
1893// profitable to move it before OldTop, except the following case:
1894//
1895// -->OldTop<-
1896// | . |
1897// | . |
1898// | . |
1899// ---Pred |
1900// | |
1901// BB-----
1902//
1903// If BB is moved before OldTop, Pred needs a taken branch to BB, and it can't
1904// layout the other successor below it, so it can't reduce taken branch.
1905// In this case we keep its original layout.
1906bool
1907MachineBlockPlacement::canMoveBottomBlockToTop(
1908 const MachineBasicBlock *BottomBlock,
1909 const MachineBasicBlock *OldTop) {
1910 if (BottomBlock->pred_size() != 1)
1911 return true;
1912 MachineBasicBlock *Pred = *BottomBlock->pred_begin();
1913 if (Pred->succ_size() != 2)
1914 return true;
1915
1916 MachineBasicBlock *OtherBB = *Pred->succ_begin();
1917 if (OtherBB == BottomBlock)
1918 OtherBB = *Pred->succ_rbegin();
1919 if (OtherBB == OldTop)
1920 return false;
1921
1922 return true;
1923}
1924
1925// Find out the possible fall through frequence to the top of a loop.
1927MachineBlockPlacement::TopFallThroughFreq(
1928 const MachineBasicBlock *Top,
1929 const BlockFilterSet &LoopBlockSet) {
1930 BlockFrequency MaxFreq = 0;
1931 for (MachineBasicBlock *Pred : Top->predecessors()) {
1932 BlockChain *PredChain = BlockToChain[Pred];
1933 if (!LoopBlockSet.count(Pred) &&
1934 (!PredChain || Pred == *std::prev(PredChain->end()))) {
1935 // Found a Pred block can be placed before Top.
1936 // Check if Top is the best successor of Pred.
1937 auto TopProb = MBPI->getEdgeProbability(Pred, Top);
1938 bool TopOK = true;
1939 for (MachineBasicBlock *Succ : Pred->successors()) {
1940 auto SuccProb = MBPI->getEdgeProbability(Pred, Succ);
1941 BlockChain *SuccChain = BlockToChain[Succ];
1942 // Check if Succ can be placed after Pred.
1943 // Succ should not be in any chain, or it is the head of some chain.
1944 if (!LoopBlockSet.count(Succ) && (SuccProb > TopProb) &&
1945 (!SuccChain || Succ == *SuccChain->begin())) {
1946 TopOK = false;
1947 break;
1948 }
1949 }
1950 if (TopOK) {
1951 BlockFrequency EdgeFreq = MBFI->getBlockFreq(Pred) *
1952 MBPI->getEdgeProbability(Pred, Top);
1953 if (EdgeFreq > MaxFreq)
1954 MaxFreq = EdgeFreq;
1955 }
1956 }
1957 }
1958 return MaxFreq;
1959}
1960
1961// Compute the fall through gains when move NewTop before OldTop.
1962//
1963// In following diagram, edges marked as "-" are reduced fallthrough, edges
1964// marked as "+" are increased fallthrough, this function computes
1965//
1966// SUM(increased fallthrough) - SUM(decreased fallthrough)
1967//
1968// |
1969// | -
1970// V
1971// --->OldTop
1972// | .
1973// | .
1974// +| . +
1975// | Pred --->
1976// | |-
1977// | V
1978// --- NewTop <---
1979// |-
1980// V
1981//
1983MachineBlockPlacement::FallThroughGains(
1984 const MachineBasicBlock *NewTop,
1985 const MachineBasicBlock *OldTop,
1986 const MachineBasicBlock *ExitBB,
1987 const BlockFilterSet &LoopBlockSet) {
1988 BlockFrequency FallThrough2Top = TopFallThroughFreq(OldTop, LoopBlockSet);
1989 BlockFrequency FallThrough2Exit = 0;
1990 if (ExitBB)
1991 FallThrough2Exit = MBFI->getBlockFreq(NewTop) *
1992 MBPI->getEdgeProbability(NewTop, ExitBB);
1993 BlockFrequency BackEdgeFreq = MBFI->getBlockFreq(NewTop) *
1994 MBPI->getEdgeProbability(NewTop, OldTop);
1995
1996 // Find the best Pred of NewTop.
1997 MachineBasicBlock *BestPred = nullptr;
1998 BlockFrequency FallThroughFromPred = 0;
1999 for (MachineBasicBlock *Pred : NewTop->predecessors()) {
2000 if (!LoopBlockSet.count(Pred))
2001 continue;
2002 BlockChain *PredChain = BlockToChain[Pred];
2003 if (!PredChain || Pred == *std::prev(PredChain->end())) {
2004 BlockFrequency EdgeFreq = MBFI->getBlockFreq(Pred) *
2005 MBPI->getEdgeProbability(Pred, NewTop);
2006 if (EdgeFreq > FallThroughFromPred) {
2007 FallThroughFromPred = EdgeFreq;
2008 BestPred = Pred;
2009 }
2010 }
2011 }
2012
2013 // If NewTop is not placed after Pred, another successor can be placed
2014 // after Pred.
2015 BlockFrequency NewFreq = 0;
2016 if (BestPred) {
2017 for (MachineBasicBlock *Succ : BestPred->successors()) {
2018 if ((Succ == NewTop) || (Succ == BestPred) || !LoopBlockSet.count(Succ))
2019 continue;
2020 if (ComputedEdges.contains(Succ))
2021 continue;
2022 BlockChain *SuccChain = BlockToChain[Succ];
2023 if ((SuccChain && (Succ != *SuccChain->begin())) ||
2024 (SuccChain == BlockToChain[BestPred]))
2025 continue;
2026 BlockFrequency EdgeFreq = MBFI->getBlockFreq(BestPred) *
2027 MBPI->getEdgeProbability(BestPred, Succ);
2028 if (EdgeFreq > NewFreq)
2029 NewFreq = EdgeFreq;
2030 }
2031 BlockFrequency OrigEdgeFreq = MBFI->getBlockFreq(BestPred) *
2032 MBPI->getEdgeProbability(BestPred, NewTop);
2033 if (NewFreq > OrigEdgeFreq) {
2034 // If NewTop is not the best successor of Pred, then Pred doesn't
2035 // fallthrough to NewTop. So there is no FallThroughFromPred and
2036 // NewFreq.
2037 NewFreq = 0;
2038 FallThroughFromPred = 0;
2039 }
2040 }
2041
2043 BlockFrequency Gains = BackEdgeFreq + NewFreq;
2044 BlockFrequency Lost = FallThrough2Top + FallThrough2Exit +
2045 FallThroughFromPred;
2046 if (Gains > Lost)
2047 Result = Gains - Lost;
2048 return Result;
2049}
2050
2051/// Helper function of findBestLoopTop. Find the best loop top block
2052/// from predecessors of old top.
2053///
2054/// Look for a block which is strictly better than the old top for laying
2055/// out before the old top of the loop. This looks for only two patterns:
2056///
2057/// 1. a block has only one successor, the old loop top
2058///
2059/// Because such a block will always result in an unconditional jump,
2060/// rotating it in front of the old top is always profitable.
2061///
2062/// 2. a block has two successors, one is old top, another is exit
2063/// and it has more than one predecessors
2064///
2065/// If it is below one of its predecessors P, only P can fall through to
2066/// it, all other predecessors need a jump to it, and another conditional
2067/// jump to loop header. If it is moved before loop header, all its
2068/// predecessors jump to it, then fall through to loop header. So all its
2069/// predecessors except P can reduce one taken branch.
2070/// At the same time, move it before old top increases the taken branch
2071/// to loop exit block, so the reduced taken branch will be compared with
2072/// the increased taken branch to the loop exit block.
2074MachineBlockPlacement::findBestLoopTopHelper(
2075 MachineBasicBlock *OldTop,
2076 const MachineLoop &L,
2077 const BlockFilterSet &LoopBlockSet) {
2078 // Check that the header hasn't been fused with a preheader block due to
2079 // crazy branches. If it has, we need to start with the header at the top to
2080 // prevent pulling the preheader into the loop body.
2081 BlockChain &HeaderChain = *BlockToChain[OldTop];
2082 if (!LoopBlockSet.count(*HeaderChain.begin()))
2083 return OldTop;
2084 if (OldTop != *HeaderChain.begin())
2085 return OldTop;
2086
2087 LLVM_DEBUG(dbgs() << "Finding best loop top for: " << getBlockName(OldTop)
2088 << "\n");
2089
2090 BlockFrequency BestGains = 0;
2091 MachineBasicBlock *BestPred = nullptr;
2092 for (MachineBasicBlock *Pred : OldTop->predecessors()) {
2093 if (!LoopBlockSet.count(Pred))
2094 continue;
2095 if (Pred == L.getHeader())
2096 continue;
2097 LLVM_DEBUG(dbgs() << " old top pred: " << getBlockName(Pred) << ", has "
2098 << Pred->succ_size() << " successors, ";
2099 MBFI->printBlockFreq(dbgs(), Pred) << " freq\n");
2100 if (Pred->succ_size() > 2)
2101 continue;
2102
2103 MachineBasicBlock *OtherBB = nullptr;
2104 if (Pred->succ_size() == 2) {
2105 OtherBB = *Pred->succ_begin();
2106 if (OtherBB == OldTop)
2107 OtherBB = *Pred->succ_rbegin();
2108 }
2109
2110 if (!canMoveBottomBlockToTop(Pred, OldTop))
2111 continue;
2112
2113 BlockFrequency Gains = FallThroughGains(Pred, OldTop, OtherBB,
2114 LoopBlockSet);
2115 if ((Gains > 0) && (Gains > BestGains ||
2116 ((Gains == BestGains) && Pred->isLayoutSuccessor(OldTop)))) {
2117 BestPred = Pred;
2118 BestGains = Gains;
2119 }
2120 }
2121
2122 // If no direct predecessor is fine, just use the loop header.
2123 if (!BestPred) {
2124 LLVM_DEBUG(dbgs() << " final top unchanged\n");
2125 return OldTop;
2126 }
2127
2128 // Walk backwards through any straight line of predecessors.
2129 while (BestPred->pred_size() == 1 &&
2130 (*BestPred->pred_begin())->succ_size() == 1 &&
2131 *BestPred->pred_begin() != L.getHeader())
2132 BestPred = *BestPred->pred_begin();
2133
2134 LLVM_DEBUG(dbgs() << " final top: " << getBlockName(BestPred) << "\n");
2135 return BestPred;
2136}
2137
2138/// Find the best loop top block for layout.
2139///
2140/// This function iteratively calls findBestLoopTopHelper, until no new better
2141/// BB can be found.
2143MachineBlockPlacement::findBestLoopTop(const MachineLoop &L,
2144 const BlockFilterSet &LoopBlockSet) {
2145 // Placing the latch block before the header may introduce an extra branch
2146 // that skips this block the first time the loop is executed, which we want
2147 // to avoid when optimising for size.
2148 // FIXME: in theory there is a case that does not introduce a new branch,
2149 // i.e. when the layout predecessor does not fallthrough to the loop header.
2150 // In practice this never happens though: there always seems to be a preheader
2151 // that can fallthrough and that is also placed before the header.
2152 bool OptForSize = F->getFunction().hasOptSize() ||
2153 llvm::shouldOptimizeForSize(L.getHeader(), PSI, MBFI.get());
2154 if (OptForSize)
2155 return L.getHeader();
2156
2157 MachineBasicBlock *OldTop = nullptr;
2158 MachineBasicBlock *NewTop = L.getHeader();
2159 while (NewTop != OldTop) {
2160 OldTop = NewTop;
2161 NewTop = findBestLoopTopHelper(OldTop, L, LoopBlockSet);
2162 if (NewTop != OldTop)
2163 ComputedEdges[NewTop] = { OldTop, false };
2164 }
2165 return NewTop;
2166}
2167
2168/// Find the best loop exiting block for layout.
2169///
2170/// This routine implements the logic to analyze the loop looking for the best
2171/// block to layout at the top of the loop. Typically this is done to maximize
2172/// fallthrough opportunities.
2174MachineBlockPlacement::findBestLoopExit(const MachineLoop &L,
2175 const BlockFilterSet &LoopBlockSet,
2176 BlockFrequency &ExitFreq) {
2177 // We don't want to layout the loop linearly in all cases. If the loop header
2178 // is just a normal basic block in the loop, we want to look for what block
2179 // within the loop is the best one to layout at the top. However, if the loop
2180 // header has be pre-merged into a chain due to predecessors not having
2181 // analyzable branches, *and* the predecessor it is merged with is *not* part
2182 // of the loop, rotating the header into the middle of the loop will create
2183 // a non-contiguous range of blocks which is Very Bad. So start with the
2184 // header and only rotate if safe.
2185 BlockChain &HeaderChain = *BlockToChain[L.getHeader()];
2186 if (!LoopBlockSet.count(*HeaderChain.begin()))
2187 return nullptr;
2188
2189 BlockFrequency BestExitEdgeFreq;
2190 unsigned BestExitLoopDepth = 0;
2191 MachineBasicBlock *ExitingBB = nullptr;
2192 // If there are exits to outer loops, loop rotation can severely limit
2193 // fallthrough opportunities unless it selects such an exit. Keep a set of
2194 // blocks where rotating to exit with that block will reach an outer loop.
2195 SmallPtrSet<MachineBasicBlock *, 4> BlocksExitingToOuterLoop;
2196
2197 LLVM_DEBUG(dbgs() << "Finding best loop exit for: "
2198 << getBlockName(L.getHeader()) << "\n");
2199 for (MachineBasicBlock *MBB : L.getBlocks()) {
2200 BlockChain &Chain = *BlockToChain[MBB];
2201 // Ensure that this block is at the end of a chain; otherwise it could be
2202 // mid-way through an inner loop or a successor of an unanalyzable branch.
2203 if (MBB != *std::prev(Chain.end()))
2204 continue;
2205
2206 // Now walk the successors. We need to establish whether this has a viable
2207 // exiting successor and whether it has a viable non-exiting successor.
2208 // We store the old exiting state and restore it if a viable looping
2209 // successor isn't found.
2210 MachineBasicBlock *OldExitingBB = ExitingBB;
2211 BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq;
2212 bool HasLoopingSucc = false;
2213 for (MachineBasicBlock *Succ : MBB->successors()) {
2214 if (Succ->isEHPad())
2215 continue;
2216 if (Succ == MBB)
2217 continue;
2218 BlockChain &SuccChain = *BlockToChain[Succ];
2219 // Don't split chains, either this chain or the successor's chain.
2220 if (&Chain == &SuccChain) {
2221 LLVM_DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
2222 << getBlockName(Succ) << " (chain conflict)\n");
2223 continue;
2224 }
2225
2226 auto SuccProb = MBPI->getEdgeProbability(MBB, Succ);
2227 if (LoopBlockSet.count(Succ)) {
2228 LLVM_DEBUG(dbgs() << " looping: " << getBlockName(MBB) << " -> "
2229 << getBlockName(Succ) << " (" << SuccProb << ")\n");
2230 HasLoopingSucc = true;
2231 continue;
2232 }
2233
2234 unsigned SuccLoopDepth = 0;
2235 if (MachineLoop *ExitLoop = MLI->getLoopFor(Succ)) {
2236 SuccLoopDepth = ExitLoop->getLoopDepth();
2237 if (ExitLoop->contains(&L))
2238 BlocksExitingToOuterLoop.insert(MBB);
2239 }
2240
2241 BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(MBB) * SuccProb;
2242 LLVM_DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
2243 << getBlockName(Succ) << " [L:" << SuccLoopDepth
2244 << "] (";
2245 MBFI->printBlockFreq(dbgs(), ExitEdgeFreq) << ")\n");
2246 // Note that we bias this toward an existing layout successor to retain
2247 // incoming order in the absence of better information. The exit must have
2248 // a frequency higher than the current exit before we consider breaking
2249 // the layout.
2250 BranchProbability Bias(100 - ExitBlockBias, 100);
2251 if (!ExitingBB || SuccLoopDepth > BestExitLoopDepth ||
2252 ExitEdgeFreq > BestExitEdgeFreq ||
2253 (MBB->isLayoutSuccessor(Succ) &&
2254 !(ExitEdgeFreq < BestExitEdgeFreq * Bias))) {
2255 BestExitEdgeFreq = ExitEdgeFreq;
2256 ExitingBB = MBB;
2257 }
2258 }
2259
2260 if (!HasLoopingSucc) {
2261 // Restore the old exiting state, no viable looping successor was found.
2262 ExitingBB = OldExitingBB;
2263 BestExitEdgeFreq = OldBestExitEdgeFreq;
2264 }
2265 }
2266 // Without a candidate exiting block or with only a single block in the
2267 // loop, just use the loop header to layout the loop.
2268 if (!ExitingBB) {
2269 LLVM_DEBUG(
2270 dbgs() << " No other candidate exit blocks, using loop header\n");
2271 return nullptr;
2272 }
2273 if (L.getNumBlocks() == 1) {
2274 LLVM_DEBUG(dbgs() << " Loop has 1 block, using loop header as exit\n");
2275 return nullptr;
2276 }
2277
2278 // Also, if we have exit blocks which lead to outer loops but didn't select
2279 // one of them as the exiting block we are rotating toward, disable loop
2280 // rotation altogether.
2281 if (!BlocksExitingToOuterLoop.empty() &&
2282 !BlocksExitingToOuterLoop.count(ExitingBB))
2283 return nullptr;
2284
2285 LLVM_DEBUG(dbgs() << " Best exiting block: " << getBlockName(ExitingBB)
2286 << "\n");
2287 ExitFreq = BestExitEdgeFreq;
2288 return ExitingBB;
2289}
2290
2291/// Check if there is a fallthrough to loop header Top.
2292///
2293/// 1. Look for a Pred that can be layout before Top.
2294/// 2. Check if Top is the most possible successor of Pred.
2295bool
2296MachineBlockPlacement::hasViableTopFallthrough(
2297 const MachineBasicBlock *Top,
2298 const BlockFilterSet &LoopBlockSet) {
2299 for (MachineBasicBlock *Pred : Top->predecessors()) {
2300 BlockChain *PredChain = BlockToChain[Pred];
2301 if (!LoopBlockSet.count(Pred) &&
2302 (!PredChain || Pred == *std::prev(PredChain->end()))) {
2303 // Found a Pred block can be placed before Top.
2304 // Check if Top is the best successor of Pred.
2305 auto TopProb = MBPI->getEdgeProbability(Pred, Top);
2306 bool TopOK = true;
2307 for (MachineBasicBlock *Succ : Pred->successors()) {
2308 auto SuccProb = MBPI->getEdgeProbability(Pred, Succ);
2309 BlockChain *SuccChain = BlockToChain[Succ];
2310 // Check if Succ can be placed after Pred.
2311 // Succ should not be in any chain, or it is the head of some chain.
2312 if ((!SuccChain || Succ == *SuccChain->begin()) && SuccProb > TopProb) {
2313 TopOK = false;
2314 break;
2315 }
2316 }
2317 if (TopOK)
2318 return true;
2319 }
2320 }
2321 return false;
2322}
2323
2324/// Attempt to rotate an exiting block to the bottom of the loop.
2325///
2326/// Once we have built a chain, try to rotate it to line up the hot exit block
2327/// with fallthrough out of the loop if doing so doesn't introduce unnecessary
2328/// branches. For example, if the loop has fallthrough into its header and out
2329/// of its bottom already, don't rotate it.
2330void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
2331 const MachineBasicBlock *ExitingBB,
2332 BlockFrequency ExitFreq,
2333 const BlockFilterSet &LoopBlockSet) {
2334 if (!ExitingBB)
2335 return;
2336
2337 MachineBasicBlock *Top = *LoopChain.begin();
2338 MachineBasicBlock *Bottom = *std::prev(LoopChain.end());
2339
2340 // If ExitingBB is already the last one in a chain then nothing to do.
2341 if (Bottom == ExitingBB)
2342 return;
2343
2344 // The entry block should always be the first BB in a function.
2345 if (Top->isEntryBlock())
2346 return;
2347
2348 bool ViableTopFallthrough = hasViableTopFallthrough(Top, LoopBlockSet);
2349
2350 // If the header has viable fallthrough, check whether the current loop
2351 // bottom is a viable exiting block. If so, bail out as rotating will
2352 // introduce an unnecessary branch.
2353 if (ViableTopFallthrough) {
2354 for (MachineBasicBlock *Succ : Bottom->successors()) {
2355 BlockChain *SuccChain = BlockToChain[Succ];
2356 if (!LoopBlockSet.count(Succ) &&
2357 (!SuccChain || Succ == *SuccChain->begin()))
2358 return;
2359 }
2360
2361 // Rotate will destroy the top fallthrough, we need to ensure the new exit
2362 // frequency is larger than top fallthrough.
2363 BlockFrequency FallThrough2Top = TopFallThroughFreq(Top, LoopBlockSet);
2364 if (FallThrough2Top >= ExitFreq)
2365 return;
2366 }
2367
2368 BlockChain::iterator ExitIt = llvm::find(LoopChain, ExitingBB);
2369 if (ExitIt == LoopChain.end())
2370 return;
2371
2372 // Rotating a loop exit to the bottom when there is a fallthrough to top
2373 // trades the entry fallthrough for an exit fallthrough.
2374 // If there is no bottom->top edge, but the chosen exit block does have
2375 // a fallthrough, we break that fallthrough for nothing in return.
2376
2377 // Let's consider an example. We have a built chain of basic blocks
2378 // B1, B2, ..., Bn, where Bk is a ExitingBB - chosen exit block.
2379 // By doing a rotation we get
2380 // Bk+1, ..., Bn, B1, ..., Bk
2381 // Break of fallthrough to B1 is compensated by a fallthrough from Bk.
2382 // If we had a fallthrough Bk -> Bk+1 it is broken now.
2383 // It might be compensated by fallthrough Bn -> B1.
2384 // So we have a condition to avoid creation of extra branch by loop rotation.
2385 // All below must be true to avoid loop rotation:
2386 // If there is a fallthrough to top (B1)
2387 // There was fallthrough from chosen exit block (Bk) to next one (Bk+1)
2388 // There is no fallthrough from bottom (Bn) to top (B1).
2389 // Please note that there is no exit fallthrough from Bn because we checked it
2390 // above.
2391 if (ViableTopFallthrough) {
2392 assert(std::next(ExitIt) != LoopChain.end() &&
2393 "Exit should not be last BB");
2394 MachineBasicBlock *NextBlockInChain = *std::next(ExitIt);
2395 if (ExitingBB->isSuccessor(NextBlockInChain))
2396 if (!Bottom->isSuccessor(Top))
2397 return;
2398 }
2399
2400 LLVM_DEBUG(dbgs() << "Rotating loop to put exit " << getBlockName(ExitingBB)
2401 << " at bottom\n");
2402 std::rotate(LoopChain.begin(), std::next(ExitIt), LoopChain.end());
2403}
2404
2405/// Attempt to rotate a loop based on profile data to reduce branch cost.
2406///
2407/// With profile data, we can determine the cost in terms of missed fall through
2408/// opportunities when rotating a loop chain and select the best rotation.
2409/// Basically, there are three kinds of cost to consider for each rotation:
2410/// 1. The possibly missed fall through edge (if it exists) from BB out of
2411/// the loop to the loop header.
2412/// 2. The possibly missed fall through edges (if they exist) from the loop
2413/// exits to BB out of the loop.
2414/// 3. The missed fall through edge (if it exists) from the last BB to the
2415/// first BB in the loop chain.
2416/// Therefore, the cost for a given rotation is the sum of costs listed above.
2417/// We select the best rotation with the smallest cost.
2418void MachineBlockPlacement::rotateLoopWithProfile(
2419 BlockChain &LoopChain, const MachineLoop &L,
2420 const BlockFilterSet &LoopBlockSet) {
2421 auto RotationPos = LoopChain.end();
2422 MachineBasicBlock *ChainHeaderBB = *LoopChain.begin();
2423
2424 // The entry block should always be the first BB in a function.
2425 if (ChainHeaderBB->isEntryBlock())
2426 return;
2427
2428 BlockFrequency SmallestRotationCost = BlockFrequency::getMaxFrequency();
2429
2430 // A utility lambda that scales up a block frequency by dividing it by a
2431 // branch probability which is the reciprocal of the scale.
2432 auto ScaleBlockFrequency = [](BlockFrequency Freq,
2433 unsigned Scale) -> BlockFrequency {
2434 if (Scale == 0)
2435 return 0;
2436 // Use operator / between BlockFrequency and BranchProbability to implement
2437 // saturating multiplication.
2438 return Freq / BranchProbability(1, Scale);
2439 };
2440
2441 // Compute the cost of the missed fall-through edge to the loop header if the
2442 // chain head is not the loop header. As we only consider natural loops with
2443 // single header, this computation can be done only once.
2444 BlockFrequency HeaderFallThroughCost(0);
2445 for (auto *Pred : ChainHeaderBB->predecessors()) {
2446 BlockChain *PredChain = BlockToChain[Pred];
2447 if (!LoopBlockSet.count(Pred) &&
2448 (!PredChain || Pred == *std::prev(PredChain->end()))) {
2449 auto EdgeFreq = MBFI->getBlockFreq(Pred) *
2450 MBPI->getEdgeProbability(Pred, ChainHeaderBB);
2451 auto FallThruCost = ScaleBlockFrequency(EdgeFreq, MisfetchCost);
2452 // If the predecessor has only an unconditional jump to the header, we
2453 // need to consider the cost of this jump.
2454 if (Pred->succ_size() == 1)
2455 FallThruCost += ScaleBlockFrequency(EdgeFreq, JumpInstCost);
2456 HeaderFallThroughCost = std::max(HeaderFallThroughCost, FallThruCost);
2457 }
2458 }
2459
2460 // Here we collect all exit blocks in the loop, and for each exit we find out
2461 // its hottest exit edge. For each loop rotation, we define the loop exit cost
2462 // as the sum of frequencies of exit edges we collect here, excluding the exit
2463 // edge from the tail of the loop chain.
2465 for (auto *BB : LoopChain) {
2466 auto LargestExitEdgeProb = BranchProbability::getZero();
2467 for (auto *Succ : BB->successors()) {
2468 BlockChain *SuccChain = BlockToChain[Succ];
2469 if (!LoopBlockSet.count(Succ) &&
2470 (!SuccChain || Succ == *SuccChain->begin())) {
2471 auto SuccProb = MBPI->getEdgeProbability(BB, Succ);
2472 LargestExitEdgeProb = std::max(LargestExitEdgeProb, SuccProb);
2473 }
2474 }
2475 if (LargestExitEdgeProb > BranchProbability::getZero()) {
2476 auto ExitFreq = MBFI->getBlockFreq(BB) * LargestExitEdgeProb;
2477 ExitsWithFreq.emplace_back(BB, ExitFreq);
2478 }
2479 }
2480
2481 // In this loop we iterate every block in the loop chain and calculate the
2482 // cost assuming the block is the head of the loop chain. When the loop ends,
2483 // we should have found the best candidate as the loop chain's head.
2484 for (auto Iter = LoopChain.begin(), TailIter = std::prev(LoopChain.end()),
2485 EndIter = LoopChain.end();
2486 Iter != EndIter; Iter++, TailIter++) {
2487 // TailIter is used to track the tail of the loop chain if the block we are
2488 // checking (pointed by Iter) is the head of the chain.
2489 if (TailIter == LoopChain.end())
2490 TailIter = LoopChain.begin();
2491
2492 auto TailBB = *TailIter;
2493
2494 // Calculate the cost by putting this BB to the top.
2495 BlockFrequency Cost = 0;
2496
2497 // If the current BB is the loop header, we need to take into account the
2498 // cost of the missed fall through edge from outside of the loop to the
2499 // header.
2500 if (Iter != LoopChain.begin())
2501 Cost += HeaderFallThroughCost;
2502
2503 // Collect the loop exit cost by summing up frequencies of all exit edges
2504 // except the one from the chain tail.
2505 for (auto &ExitWithFreq : ExitsWithFreq)
2506 if (TailBB != ExitWithFreq.first)
2507 Cost += ExitWithFreq.second;
2508
2509 // The cost of breaking the once fall-through edge from the tail to the top
2510 // of the loop chain. Here we need to consider three cases:
2511 // 1. If the tail node has only one successor, then we will get an
2512 // additional jmp instruction. So the cost here is (MisfetchCost +
2513 // JumpInstCost) * tail node frequency.
2514 // 2. If the tail node has two successors, then we may still get an
2515 // additional jmp instruction if the layout successor after the loop
2516 // chain is not its CFG successor. Note that the more frequently executed
2517 // jmp instruction will be put ahead of the other one. Assume the
2518 // frequency of those two branches are x and y, where x is the frequency
2519 // of the edge to the chain head, then the cost will be
2520 // (x * MisfetechCost + min(x, y) * JumpInstCost) * tail node frequency.
2521 // 3. If the tail node has more than two successors (this rarely happens),
2522 // we won't consider any additional cost.
2523 if (TailBB->isSuccessor(*Iter)) {
2524 auto TailBBFreq = MBFI->getBlockFreq(TailBB);
2525 if (TailBB->succ_size() == 1)
2526 Cost += ScaleBlockFrequency(TailBBFreq.getFrequency(),
2528 else if (TailBB->succ_size() == 2) {
2529 auto TailToHeadProb = MBPI->getEdgeProbability(TailBB, *Iter);
2530 auto TailToHeadFreq = TailBBFreq * TailToHeadProb;
2531 auto ColderEdgeFreq = TailToHeadProb > BranchProbability(1, 2)
2532 ? TailBBFreq * TailToHeadProb.getCompl()
2533 : TailToHeadFreq;
2534 Cost += ScaleBlockFrequency(TailToHeadFreq, MisfetchCost) +
2535 ScaleBlockFrequency(ColderEdgeFreq, JumpInstCost);
2536 }
2537 }
2538
2539 LLVM_DEBUG(dbgs() << "The cost of loop rotation by making "
2540 << getBlockName(*Iter)
2541 << " to the top: " << Cost.getFrequency() << "\n");
2542
2543 if (Cost < SmallestRotationCost) {
2544 SmallestRotationCost = Cost;
2545 RotationPos = Iter;
2546 }
2547 }
2548
2549 if (RotationPos != LoopChain.end()) {
2550 LLVM_DEBUG(dbgs() << "Rotate loop by making " << getBlockName(*RotationPos)
2551 << " to the top\n");
2552 std::rotate(LoopChain.begin(), RotationPos, LoopChain.end());
2553 }
2554}
2555
2556/// Collect blocks in the given loop that are to be placed.
2557///
2558/// When profile data is available, exclude cold blocks from the returned set;
2559/// otherwise, collect all blocks in the loop.
2561MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) {
2562 BlockFilterSet LoopBlockSet;
2563
2564 // Filter cold blocks off from LoopBlockSet when profile data is available.
2565 // Collect the sum of frequencies of incoming edges to the loop header from
2566 // outside. If we treat the loop as a super block, this is the frequency of
2567 // the loop. Then for each block in the loop, we calculate the ratio between
2568 // its frequency and the frequency of the loop block. When it is too small,
2569 // don't add it to the loop chain. If there are outer loops, then this block
2570 // will be merged into the first outer loop chain for which this block is not
2571 // cold anymore. This needs precise profile data and we only do this when
2572 // profile data is available.
2573 if (F->getFunction().hasProfileData() || ForceLoopColdBlock) {
2574 BlockFrequency LoopFreq(0);
2575 for (auto *LoopPred : L.getHeader()->predecessors())
2576 if (!L.contains(LoopPred))
2577 LoopFreq += MBFI->getBlockFreq(LoopPred) *
2578 MBPI->getEdgeProbability(LoopPred, L.getHeader());
2579
2580 for (MachineBasicBlock *LoopBB : L.getBlocks()) {
2581 if (LoopBlockSet.count(LoopBB))
2582 continue;
2583 auto Freq = MBFI->getBlockFreq(LoopBB).getFrequency();
2584 if (Freq == 0 || LoopFreq.getFrequency() / Freq > LoopToColdBlockRatio)
2585 continue;
2586 BlockChain *Chain = BlockToChain[LoopBB];
2587 for (MachineBasicBlock *ChainBB : *Chain)
2588 LoopBlockSet.insert(ChainBB);
2589 }
2590 } else
2591 LoopBlockSet.insert(L.block_begin(), L.block_end());
2592
2593 return LoopBlockSet;
2594}
2595
2596/// Forms basic block chains from the natural loop structures.
2597///
2598/// These chains are designed to preserve the existing *structure* of the code
2599/// as much as possible. We can then stitch the chains together in a way which
2600/// both preserves the topological structure and minimizes taken conditional
2601/// branches.
2602void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {
2603 // First recurse through any nested loops, building chains for those inner
2604 // loops.
2605 for (const MachineLoop *InnerLoop : L)
2606 buildLoopChains(*InnerLoop);
2607
2608 assert(BlockWorkList.empty() &&
2609 "BlockWorkList not empty when starting to build loop chains.");
2610 assert(EHPadWorkList.empty() &&
2611 "EHPadWorkList not empty when starting to build loop chains.");
2612 BlockFilterSet LoopBlockSet = collectLoopBlockSet(L);
2613
2614 // Check if we have profile data for this function. If yes, we will rotate
2615 // this loop by modeling costs more precisely which requires the profile data
2616 // for better layout.
2617 bool RotateLoopWithProfile =
2619 (PreciseRotationCost && F->getFunction().hasProfileData());
2620
2621 // First check to see if there is an obviously preferable top block for the
2622 // loop. This will default to the header, but may end up as one of the
2623 // predecessors to the header if there is one which will result in strictly
2624 // fewer branches in the loop body.
2625 MachineBasicBlock *LoopTop = findBestLoopTop(L, LoopBlockSet);
2626
2627 // If we selected just the header for the loop top, look for a potentially
2628 // profitable exit block in the event that rotating the loop can eliminate
2629 // branches by placing an exit edge at the bottom.
2630 //
2631 // Loops are processed innermost to uttermost, make sure we clear
2632 // PreferredLoopExit before processing a new loop.
2633 PreferredLoopExit = nullptr;
2634 BlockFrequency ExitFreq;
2635 if (!RotateLoopWithProfile && LoopTop == L.getHeader())
2636 PreferredLoopExit = findBestLoopExit(L, LoopBlockSet, ExitFreq);
2637
2638 BlockChain &LoopChain = *BlockToChain[LoopTop];
2639
2640 // FIXME: This is a really lame way of walking the chains in the loop: we
2641 // walk the blocks, and use a set to prevent visiting a particular chain
2642 // twice.
2643 SmallPtrSet<BlockChain *, 4> UpdatedPreds;
2644 assert(LoopChain.UnscheduledPredecessors == 0 &&
2645 "LoopChain should not have unscheduled predecessors.");
2646 UpdatedPreds.insert(&LoopChain);
2647
2648 for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2649 fillWorkLists(LoopBB, UpdatedPreds, &LoopBlockSet);
2650
2651 buildChain(LoopTop, LoopChain, &LoopBlockSet);
2652
2653 if (RotateLoopWithProfile)
2654 rotateLoopWithProfile(LoopChain, L, LoopBlockSet);
2655 else
2656 rotateLoop(LoopChain, PreferredLoopExit, ExitFreq, LoopBlockSet);
2657
2658 LLVM_DEBUG({
2659 // Crash at the end so we get all of the debugging output first.
2660 bool BadLoop = false;
2661 if (LoopChain.UnscheduledPredecessors) {
2662 BadLoop = true;
2663 dbgs() << "Loop chain contains a block without its preds placed!\n"
2664 << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2665 << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n";
2666 }
2667 for (MachineBasicBlock *ChainBB : LoopChain) {
2668 dbgs() << " ... " << getBlockName(ChainBB) << "\n";
2669 if (!LoopBlockSet.remove(ChainBB)) {
2670 // We don't mark the loop as bad here because there are real situations
2671 // where this can occur. For example, with an unanalyzable fallthrough
2672 // from a loop block to a non-loop block or vice versa.
2673 dbgs() << "Loop chain contains a block not contained by the loop!\n"
2674 << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2675 << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2676 << " Bad block: " << getBlockName(ChainBB) << "\n";
2677 }
2678 }
2679
2680 if (!LoopBlockSet.empty()) {
2681 BadLoop = true;
2682 for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2683 dbgs() << "Loop contains blocks never placed into a chain!\n"
2684 << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2685 << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2686 << " Bad block: " << getBlockName(LoopBB) << "\n";
2687 }
2688 assert(!BadLoop && "Detected problems with the placement of this loop.");
2689 });
2690
2691 BlockWorkList.clear();
2692 EHPadWorkList.clear();
2693}
2694
2695void MachineBlockPlacement::buildCFGChains() {
2696 // Ensure that every BB in the function has an associated chain to simplify
2697 // the assumptions of the remaining algorithm.
2698 SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
2699 for (MachineFunction::iterator FI = F->begin(), FE = F->end(); FI != FE;
2700 ++FI) {
2701 MachineBasicBlock *BB = &*FI;
2702 BlockChain *Chain =
2703 new (ChainAllocator.Allocate()) BlockChain(BlockToChain, BB);
2704 // Also, merge any blocks which we cannot reason about and must preserve
2705 // the exact fallthrough behavior for.
2706 while (true) {
2707 Cond.clear();
2708 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2709 if (!TII->analyzeBranch(*BB, TBB, FBB, Cond) || !FI->canFallThrough())
2710 break;
2711
2712 MachineFunction::iterator NextFI = std::next(FI);
2713 MachineBasicBlock *NextBB = &*NextFI;
2714 // Ensure that the layout successor is a viable block, as we know that
2715 // fallthrough is a possibility.
2716 assert(NextFI != FE && "Can't fallthrough past the last block.");
2717 LLVM_DEBUG(dbgs() << "Pre-merging due to unanalyzable fallthrough: "
2718 << getBlockName(BB) << " -> " << getBlockName(NextBB)
2719 << "\n");
2720 Chain->merge(NextBB, nullptr);
2721#ifndef NDEBUG
2722 BlocksWithUnanalyzableExits.insert(&*BB);
2723#endif
2724 FI = NextFI;
2725 BB = NextBB;
2726 }
2727 }
2728
2729 // Build any loop-based chains.
2730 PreferredLoopExit = nullptr;
2731 for (MachineLoop *L : *MLI)
2732 buildLoopChains(*L);
2733
2734 assert(BlockWorkList.empty() &&
2735 "BlockWorkList should be empty before building final chain.");
2736 assert(EHPadWorkList.empty() &&
2737 "EHPadWorkList should be empty before building final chain.");
2738
2739 SmallPtrSet<BlockChain *, 4> UpdatedPreds;
2740 for (MachineBasicBlock &MBB : *F)
2741 fillWorkLists(&MBB, UpdatedPreds);
2742
2743 BlockChain &FunctionChain = *BlockToChain[&F->front()];
2744 buildChain(&F->front(), FunctionChain);
2745
2746#ifndef NDEBUG
2747 using FunctionBlockSetType = SmallPtrSet<MachineBasicBlock *, 16>;
2748#endif
2749 LLVM_DEBUG({
2750 // Crash at the end so we get all of the debugging output first.
2751 bool BadFunc = false;
2752 FunctionBlockSetType FunctionBlockSet;
2753 for (MachineBasicBlock &MBB : *F)
2754 FunctionBlockSet.insert(&MBB);
2755
2756 for (MachineBasicBlock *ChainBB : FunctionChain)
2757 if (!FunctionBlockSet.erase(ChainBB)) {
2758 BadFunc = true;
2759 dbgs() << "Function chain contains a block not in the function!\n"
2760 << " Bad block: " << getBlockName(ChainBB) << "\n";
2761 }
2762
2763 if (!FunctionBlockSet.empty()) {
2764 BadFunc = true;
2765 for (MachineBasicBlock *RemainingBB : FunctionBlockSet)
2766 dbgs() << "Function contains blocks never placed into a chain!\n"
2767 << " Bad block: " << getBlockName(RemainingBB) << "\n";
2768 }
2769 assert(!BadFunc && "Detected problems with the block placement.");
2770 });
2771
2772 // Remember original layout ordering, so we can update terminators after
2773 // reordering to point to the original layout successor.
2774 SmallVector<MachineBasicBlock *, 4> OriginalLayoutSuccessors(
2775 F->getNumBlockIDs());
2776 {
2777 MachineBasicBlock *LastMBB = nullptr;
2778 for (auto &MBB : *F) {
2779 if (LastMBB != nullptr)
2780 OriginalLayoutSuccessors[LastMBB->getNumber()] = &MBB;
2781 LastMBB = &MBB;
2782 }
2783 OriginalLayoutSuccessors[F->back().getNumber()] = nullptr;
2784 }
2785
2786 // Splice the blocks into place.
2787 MachineFunction::iterator InsertPos = F->begin();
2788 LLVM_DEBUG(dbgs() << "[MBP] Function: " << F->getName() << "\n");
2789 for (MachineBasicBlock *ChainBB : FunctionChain) {
2790 LLVM_DEBUG(dbgs() << (ChainBB == *FunctionChain.begin() ? "Placing chain "
2791 : " ... ")
2792 << getBlockName(ChainBB) << "\n");
2793 if (InsertPos != MachineFunction::iterator(ChainBB))
2794 F->splice(InsertPos, ChainBB);
2795 else
2796 ++InsertPos;
2797
2798 // Update the terminator of the previous block.
2799 if (ChainBB == *FunctionChain.begin())
2800 continue;
2801 MachineBasicBlock *PrevBB = &*std::prev(MachineFunction::iterator(ChainBB));
2802
2803 // FIXME: It would be awesome of updateTerminator would just return rather
2804 // than assert when the branch cannot be analyzed in order to remove this
2805 // boiler plate.
2806 Cond.clear();
2807 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2808
2809#ifndef NDEBUG
2810 if (!BlocksWithUnanalyzableExits.count(PrevBB)) {
2811 // Given the exact block placement we chose, we may actually not _need_ to
2812 // be able to edit PrevBB's terminator sequence, but not being _able_ to
2813 // do that at this point is a bug.
2814 assert((!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond) ||
2815 !PrevBB->canFallThrough()) &&
2816 "Unexpected block with un-analyzable fallthrough!");
2817 Cond.clear();
2818 TBB = FBB = nullptr;
2819 }
2820#endif
2821
2822 // The "PrevBB" is not yet updated to reflect current code layout, so,
2823 // o. it may fall-through to a block without explicit "goto" instruction
2824 // before layout, and no longer fall-through it after layout; or
2825 // o. just opposite.
2826 //
2827 // analyzeBranch() may return erroneous value for FBB when these two
2828 // situations take place. For the first scenario FBB is mistakenly set NULL;
2829 // for the 2nd scenario, the FBB, which is expected to be NULL, is
2830 // mistakenly pointing to "*BI".
2831 // Thus, if the future change needs to use FBB before the layout is set, it
2832 // has to correct FBB first by using the code similar to the following:
2833 //
2834 // if (!Cond.empty() && (!FBB || FBB == ChainBB)) {
2835 // PrevBB->updateTerminator();
2836 // Cond.clear();
2837 // TBB = FBB = nullptr;
2838 // if (TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) {
2839 // // FIXME: This should never take place.
2840 // TBB = FBB = nullptr;
2841 // }
2842 // }
2843 if (!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) {
2844 PrevBB->updateTerminator(OriginalLayoutSuccessors[PrevBB->getNumber()]);
2845 }
2846 }
2847
2848 // Fixup the last block.
2849 Cond.clear();
2850 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2851 if (!TII->analyzeBranch(F->back(), TBB, FBB, Cond)) {
2852 MachineBasicBlock *PrevBB = &F->back();
2853 PrevBB->updateTerminator(OriginalLayoutSuccessors[PrevBB->getNumber()]);
2854 }
2855
2856 BlockWorkList.clear();
2857 EHPadWorkList.clear();
2858}
2859
2860void MachineBlockPlacement::optimizeBranches() {
2861 BlockChain &FunctionChain = *BlockToChain[&F->front()];
2862 SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
2863
2864 // Now that all the basic blocks in the chain have the proper layout,
2865 // make a final call to analyzeBranch with AllowModify set.
2866 // Indeed, the target may be able to optimize the branches in a way we
2867 // cannot because all branches may not be analyzable.
2868 // E.g., the target may be able to remove an unconditional branch to
2869 // a fallthrough when it occurs after predicated terminators.
2870 for (MachineBasicBlock *ChainBB : FunctionChain) {
2871 Cond.clear();
2872 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2873 if (!TII->analyzeBranch(*ChainBB, TBB, FBB, Cond, /*AllowModify*/ true)) {
2874 // If PrevBB has a two-way branch, try to re-order the branches
2875 // such that we branch to the successor with higher probability first.
2876 if (TBB && !Cond.empty() && FBB &&
2877 MBPI->getEdgeProbability(ChainBB, FBB) >
2878 MBPI->getEdgeProbability(ChainBB, TBB) &&
2880 LLVM_DEBUG(dbgs() << "Reverse order of the two branches: "
2881 << getBlockName(ChainBB) << "\n");
2882 LLVM_DEBUG(dbgs() << " Edge probability: "
2883 << MBPI->getEdgeProbability(ChainBB, FBB) << " vs "
2884 << MBPI->getEdgeProbability(ChainBB, TBB) << "\n");
2885 DebugLoc dl; // FIXME: this is nowhere
2886 TII->removeBranch(*ChainBB);
2887 TII->insertBranch(*ChainBB, FBB, TBB, Cond, dl);
2888 }
2889 }
2890 }
2891}
2892
2893void MachineBlockPlacement::alignBlocks() {
2894 // Walk through the backedges of the function now that we have fully laid out
2895 // the basic blocks and align the destination of each backedge. We don't rely
2896 // exclusively on the loop info here so that we can align backedges in
2897 // unnatural CFGs and backedges that were introduced purely because of the
2898 // loop rotations done during this layout pass.
2899 if (F->getFunction().hasMinSize() ||
2900 (F->getFunction().hasOptSize() && !TLI->alignLoopsWithOptSize()))
2901 return;
2902 BlockChain &FunctionChain = *BlockToChain[&F->front()];
2903 if (FunctionChain.begin() == FunctionChain.end())
2904 return; // Empty chain.
2905
2906 const BranchProbability ColdProb(1, 5); // 20%
2907 BlockFrequency EntryFreq = MBFI->getBlockFreq(&F->front());
2908 BlockFrequency WeightedEntryFreq = EntryFreq * ColdProb;
2909 for (MachineBasicBlock *ChainBB : FunctionChain) {
2910 if (ChainBB == *FunctionChain.begin())
2911 continue;
2912
2913 // Don't align non-looping basic blocks. These are unlikely to execute
2914 // enough times to matter in practice. Note that we'll still handle
2915 // unnatural CFGs inside of a natural outer loop (the common case) and
2916 // rotated loops.
2917 MachineLoop *L = MLI->getLoopFor(ChainBB);
2918 if (!L)
2919 continue;
2920
2921 const Align Align = TLI->getPrefLoopAlignment(L);
2922 if (Align == 1)
2923 continue; // Don't care about loop alignment.
2924
2925 // If the block is cold relative to the function entry don't waste space
2926 // aligning it.
2927 BlockFrequency Freq = MBFI->getBlockFreq(ChainBB);
2928 if (Freq < WeightedEntryFreq)
2929 continue;
2930
2931 // If the block is cold relative to its loop header, don't align it
2932 // regardless of what edges into the block exist.
2933 MachineBasicBlock *LoopHeader = L->getHeader();
2934 BlockFrequency LoopHeaderFreq = MBFI->getBlockFreq(LoopHeader);
2935 if (Freq < (LoopHeaderFreq * ColdProb))
2936 continue;
2937
2938 // If the global profiles indicates so, don't align it.
2939 if (llvm::shouldOptimizeForSize(ChainBB, PSI, MBFI.get()) &&
2940 !TLI->alignLoopsWithOptSize())
2941 continue;
2942
2943 // Check for the existence of a non-layout predecessor which would benefit
2944 // from aligning this block.
2945 MachineBasicBlock *LayoutPred =
2946 &*std::prev(MachineFunction::iterator(ChainBB));
2947
2948 auto DetermineMaxAlignmentPadding = [&]() {
2949 // Set the maximum bytes allowed to be emitted for alignment.
2950 unsigned MaxBytes;
2951 if (MaxBytesForAlignmentOverride.getNumOccurrences() > 0)
2953 else
2954 MaxBytes = TLI->getMaxPermittedBytesForAlignment(ChainBB);
2955 ChainBB->setMaxBytesForAlignment(MaxBytes);
2956 };
2957
2958 // Force alignment if all the predecessors are jumps. We already checked
2959 // that the block isn't cold above.
2960 if (!LayoutPred->isSuccessor(ChainBB)) {
2961 ChainBB->setAlignment(Align);
2962 DetermineMaxAlignmentPadding();
2963 continue;
2964 }
2965
2966 // Align this block if the layout predecessor's edge into this block is
2967 // cold relative to the block. When this is true, other predecessors make up
2968 // all of the hot entries into the block and thus alignment is likely to be
2969 // important.
2970 BranchProbability LayoutProb =
2971 MBPI->getEdgeProbability(LayoutPred, ChainBB);
2972 BlockFrequency LayoutEdgeFreq = MBFI->getBlockFreq(LayoutPred) * LayoutProb;
2973 if (LayoutEdgeFreq <= (Freq * ColdProb)) {
2974 ChainBB->setAlignment(Align);
2975 DetermineMaxAlignmentPadding();
2976 }
2977 }
2978}
2979
2980/// Tail duplicate \p BB into (some) predecessors if profitable, repeating if
2981/// it was duplicated into its chain predecessor and removed.
2982/// \p BB - Basic block that may be duplicated.
2983///
2984/// \p LPred - Chosen layout predecessor of \p BB.
2985/// Updated to be the chain end if LPred is removed.
2986/// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
2987/// \p BlockFilter - Set of blocks that belong to the loop being laid out.
2988/// Used to identify which blocks to update predecessor
2989/// counts.
2990/// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
2991/// chosen in the given order due to unnatural CFG
2992/// only needed if \p BB is removed and
2993/// \p PrevUnplacedBlockIt pointed to \p BB.
2994/// @return true if \p BB was removed.
2995bool MachineBlockPlacement::repeatedlyTailDuplicateBlock(
2997 const MachineBasicBlock *LoopHeaderBB,
2998 BlockChain &Chain, BlockFilterSet *BlockFilter,
2999 MachineFunction::iterator &PrevUnplacedBlockIt) {
3000 bool Removed, DuplicatedToLPred;
3001 bool DuplicatedToOriginalLPred;
3002 Removed = maybeTailDuplicateBlock(BB, LPred, Chain, BlockFilter,
3003 PrevUnplacedBlockIt,
3004 DuplicatedToLPred);
3005 if (!Removed)
3006 return false;
3007 DuplicatedToOriginalLPred = DuplicatedToLPred;
3008 // Iteratively try to duplicate again. It can happen that a block that is
3009 // duplicated into is still small enough to be duplicated again.
3010 // No need to call markBlockSuccessors in this case, as the blocks being
3011 // duplicated from here on are already scheduled.
3012 while (DuplicatedToLPred && Removed) {
3013 MachineBasicBlock *DupBB, *DupPred;
3014 // The removal callback causes Chain.end() to be updated when a block is
3015 // removed. On the first pass through the loop, the chain end should be the
3016 // same as it was on function entry. On subsequent passes, because we are
3017 // duplicating the block at the end of the chain, if it is removed the
3018 // chain will have shrunk by one block.
3019 BlockChain::iterator ChainEnd = Chain.end();
3020 DupBB = *(--ChainEnd);
3021 // Now try to duplicate again.
3022 if (ChainEnd == Chain.begin())
3023 break;
3024 DupPred = *std::prev(ChainEnd);
3025 Removed = maybeTailDuplicateBlock(DupBB, DupPred, Chain, BlockFilter,
3026 PrevUnplacedBlockIt,
3027 DuplicatedToLPred);
3028 }
3029 // If BB was duplicated into LPred, it is now scheduled. But because it was
3030 // removed, markChainSuccessors won't be called for its chain. Instead we
3031 // call markBlockSuccessors for LPred to achieve the same effect. This must go
3032 // at the end because repeating the tail duplication can increase the number
3033 // of unscheduled predecessors.
3034 LPred = *std::prev(Chain.end());
3035 if (DuplicatedToOriginalLPred)
3036 markBlockSuccessors(Chain, LPred, LoopHeaderBB, BlockFilter);
3037 return true;
3038}
3039
3040/// Tail duplicate \p BB into (some) predecessors if profitable.
3041/// \p BB - Basic block that may be duplicated
3042/// \p LPred - Chosen layout predecessor of \p BB
3043/// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
3044/// \p BlockFilter - Set of blocks that belong to the loop being laid out.
3045/// Used to identify which blocks to update predecessor
3046/// counts.
3047/// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
3048/// chosen in the given order due to unnatural CFG
3049/// only needed if \p BB is removed and
3050/// \p PrevUnplacedBlockIt pointed to \p BB.
3051/// \p DuplicatedToLPred - True if the block was duplicated into LPred.
3052/// \return - True if the block was duplicated into all preds and removed.
3053bool MachineBlockPlacement::maybeTailDuplicateBlock(
3055 BlockChain &Chain, BlockFilterSet *BlockFilter,
3056 MachineFunction::iterator &PrevUnplacedBlockIt,
3057 bool &DuplicatedToLPred) {
3058 DuplicatedToLPred = false;
3059 if (!shouldTailDuplicate(BB))
3060 return false;
3061
3062 LLVM_DEBUG(dbgs() << "Redoing tail duplication for Succ#" << BB->getNumber()
3063 << "\n");
3064
3065 // This has to be a callback because none of it can be done after
3066 // BB is deleted.
3067 bool Removed = false;
3068 auto RemovalCallback =
3069 [&](MachineBasicBlock *RemBB) {
3070 // Signal to outer function
3071 Removed = true;
3072
3073 // Conservative default.
3074 bool InWorkList = true;
3075 // Remove from the Chain and Chain Map
3076 if (BlockToChain.count(RemBB)) {
3077 BlockChain *Chain = BlockToChain[RemBB];
3078 InWorkList = Chain->UnscheduledPredecessors == 0;
3079 Chain->remove(RemBB);
3080 BlockToChain.erase(RemBB);
3081 }
3082
3083 // Handle the unplaced block iterator
3084 if (&(*PrevUnplacedBlockIt) == RemBB) {
3085 PrevUnplacedBlockIt++;
3086 }
3087
3088 // Handle the Work Lists
3089 if (InWorkList) {
3090 SmallVectorImpl<MachineBasicBlock *> &RemoveList = BlockWorkList;
3091 if (RemBB->isEHPad())
3092 RemoveList = EHPadWorkList;
3093 llvm::erase_value(RemoveList, RemBB);
3094 }
3095
3096 // Handle the filter set
3097 if (BlockFilter) {
3098 BlockFilter->remove(RemBB);
3099 }
3100
3101 // Remove the block from loop info.
3102 MLI->removeBlock(RemBB);
3103 if (RemBB == PreferredLoopExit)
3104 PreferredLoopExit = nullptr;
3105
3106 LLVM_DEBUG(dbgs() << "TailDuplicator deleted block: "
3107 << getBlockName(RemBB) << "\n");
3108 };
3109 auto RemovalCallbackRef =
3111
3113 bool IsSimple = TailDup.isSimpleBB(BB);
3115 SmallVectorImpl<MachineBasicBlock *> *CandidatePtr = nullptr;
3116 if (F->getFunction().hasProfileData()) {
3117 // We can do partial duplication with precise profile information.
3118 findDuplicateCandidates(CandidatePreds, BB, BlockFilter);
3119 if (CandidatePreds.size() == 0)
3120 return false;
3121 if (CandidatePreds.size() < BB->pred_size())
3122 CandidatePtr = &CandidatePreds;
3123 }
3124 TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred, &DuplicatedPreds,
3125 &RemovalCallbackRef, CandidatePtr);
3126
3127 // Update UnscheduledPredecessors to reflect tail-duplication.
3128 DuplicatedToLPred = false;
3129 for (MachineBasicBlock *Pred : DuplicatedPreds) {
3130 // We're only looking for unscheduled predecessors that match the filter.
3131 BlockChain* PredChain = BlockToChain[Pred];
3132 if (Pred == LPred)
3133 DuplicatedToLPred = true;
3134 if (Pred == LPred || (BlockFilter && !BlockFilter->count(Pred))
3135 || PredChain == &Chain)
3136 continue;
3137 for (MachineBasicBlock *NewSucc : Pred->successors()) {
3138 if (BlockFilter && !BlockFilter->count(NewSucc))
3139 continue;
3140 BlockChain *NewChain = BlockToChain[NewSucc];
3141 if (NewChain != &Chain && NewChain != PredChain)
3142 NewChain->UnscheduledPredecessors++;
3143 }
3144 }
3145 return Removed;
3146}
3147
3148// Count the number of actual machine instructions.
3150 uint64_t InstrCount = 0;
3151 for (MachineInstr &MI : *MBB) {
3152 if (!MI.isPHI() && !MI.isMetaInstruction())
3153 InstrCount += 1;
3154 }
3155 return InstrCount;
3156}
3157
3158// The size cost of duplication is the instruction size of the duplicated block.
3159// So we should scale the threshold accordingly. But the instruction size is not
3160// available on all targets, so we use the number of instructions instead.
3161BlockFrequency MachineBlockPlacement::scaleThreshold(MachineBasicBlock *BB) {
3162 return DupThreshold.getFrequency() * countMBBInstruction(BB);
3163}
3164
3165// Returns true if BB is Pred's best successor.
3166bool MachineBlockPlacement::isBestSuccessor(MachineBasicBlock *BB,
3167 MachineBasicBlock *Pred,
3168 BlockFilterSet *BlockFilter) {
3169 if (BB == Pred)
3170 return false;
3171 if (BlockFilter && !BlockFilter->count(Pred))
3172 return false;
3173 BlockChain *PredChain = BlockToChain[Pred];
3174 if (PredChain && (Pred != *std::prev(PredChain->end())))
3175 return false;
3176
3177 // Find the successor with largest probability excluding BB.
3179 for (MachineBasicBlock *Succ : Pred->successors())
3180 if (Succ != BB) {
3181 if (BlockFilter && !BlockFilter->count(Succ))
3182 continue;
3183 BlockChain *SuccChain = BlockToChain[Succ];
3184 if (SuccChain && (Succ != *SuccChain->begin()))
3185 continue;
3186 BranchProbability SuccProb = MBPI->getEdgeProbability(Pred, Succ);
3187 if (SuccProb > BestProb)
3188 BestProb = SuccProb;
3189 }
3190
3191 BranchProbability BBProb = MBPI->getEdgeProbability(Pred, BB);
3192 if (BBProb <= BestProb)
3193 return false;
3194
3195 // Compute the number of reduced taken branches if Pred falls through to BB
3196 // instead of another successor. Then compare it with threshold.
3197 BlockFrequency PredFreq = getBlockCountOrFrequency(Pred);
3198 BlockFrequency Gain = PredFreq * (BBProb - BestProb);
3199 return Gain > scaleThreshold(BB);
3200}
3201
3202// Find out the predecessors of BB and BB can be beneficially duplicated into
3203// them.
3204void MachineBlockPlacement::findDuplicateCandidates(
3207 BlockFilterSet *BlockFilter) {
3208 MachineBasicBlock *Fallthrough = nullptr;
3209 BranchProbability DefaultBranchProb = BranchProbability::getZero();
3210 BlockFrequency BBDupThreshold(scaleThreshold(BB));
3213
3214 // Sort for highest frequency.
3215 auto CmpSucc = [&](MachineBasicBlock *A, MachineBasicBlock *B) {
3216 return MBPI->getEdgeProbability(BB, A) > MBPI->getEdgeProbability(BB, B);
3217 };
3218 auto CmpPred = [&](MachineBasicBlock *A, MachineBasicBlock *B) {
3219 return MBFI->getBlockFreq(A) > MBFI->getBlockFreq(B);
3220 };
3221 llvm::stable_sort(Succs, CmpSucc);
3222 llvm::stable_sort(Preds, CmpPred);
3223
3224 auto SuccIt = Succs.begin();
3225 if (SuccIt != Succs.end()) {
3226 DefaultBranchProb = MBPI->getEdgeProbability(BB, *SuccIt).getCompl();
3227 }
3228
3229 // For each predecessors of BB, compute the benefit of duplicating BB,
3230 // if it is larger than the threshold, add it into Candidates.
3231 //
3232 // If we have following control flow.
3233 //
3234 // PB1 PB2 PB3 PB4
3235 // \ | / /\
3236 // \ | / / \
3237 // \ |/ / \
3238 // BB----/ OB
3239 // /\
3240 // / \
3241 // SB1 SB2
3242 //
3243 // And it can be partially duplicated as
3244 //
3245 // PB2+BB
3246 // | PB1 PB3 PB4
3247 // | | / /\
3248 // | | / / \
3249 // | |/ / \
3250 // | BB----/ OB
3251 // |\ /|
3252 // | X |
3253 // |/ \|
3254 // SB2 SB1
3255 //
3256 // The benefit of duplicating into a predecessor is defined as
3257 // Orig_taken_branch - Duplicated_taken_branch
3258 //
3259 // The Orig_taken_branch is computed with the assumption that predecessor
3260 // jumps to BB and the most possible successor is laid out after BB.
3261 //
3262 // The Duplicated_taken_branch is computed with the assumption that BB is
3263 // duplicated into PB, and one successor is layout after it (SB1 for PB1 and
3264 // SB2 for PB2 in our case). If there is no available successor, the combined
3265 // block jumps to all BB's successor, like PB3 in this example.
3266 //
3267 // If a predecessor has multiple successors, so BB can't be duplicated into
3268 // it. But it can beneficially fall through to BB, and duplicate BB into other
3269 // predecessors.
3270 for (MachineBasicBlock *Pred : Preds) {
3271 BlockFrequency PredFreq = getBlockCountOrFrequency(Pred);
3272
3273 if (!TailDup.canTailDuplicate(BB, Pred)) {
3274 // BB can't be duplicated into Pred, but it is possible to be layout
3275 // below Pred.
3276 if (!Fallthrough && isBestSuccessor(BB, Pred, BlockFilter)) {
3277 Fallthrough = Pred;
3278 if (SuccIt != Succs.end())
3279 SuccIt++;
3280 }
3281 continue;
3282 }
3283
3284 BlockFrequency OrigCost = PredFreq + PredFreq * DefaultBranchProb;
3285 BlockFrequency DupCost;
3286 if (SuccIt == Succs.end()) {
3287 // Jump to all successors;
3288 if (Succs.size() > 0)
3289 DupCost += PredFreq;
3290 } else {
3291 // Fallthrough to *SuccIt, jump to all other successors;
3292 DupCost += PredFreq;
3293 DupCost -= PredFreq * MBPI->getEdgeProbability(BB, *SuccIt);
3294 }
3295
3296 assert(OrigCost >= DupCost);
3297 OrigCost -= DupCost;
3298 if (OrigCost > BBDupThreshold) {
3299 Candidates.push_back(Pred);
3300 if (SuccIt != Succs.end())
3301 SuccIt++;
3302 }
3303 }
3304
3305 // No predecessors can optimally fallthrough to BB.
3306 // So we can change one duplication into fallthrough.
3307 if (!Fallthrough) {
3308 if ((Candidates.size() < Preds.size()) && (Candidates.size() > 0)) {
3309 Candidates[0] = Candidates.back();
3310 Candidates.pop_back();
3311 }
3312 }
3313}
3314
3315void MachineBlockPlacement::initDupThreshold() {
3316 DupThreshold = 0;
3317 if (!F->getFunction().hasProfileData())
3318 return;
3319
3320 // We prefer to use prifile count.
3321 uint64_t HotThreshold = PSI->getOrCompHotCountThreshold();
3322 if (HotThreshold != UINT64_MAX) {
3323 UseProfileCount = true;
3324 DupThreshold = HotThreshold * TailDupProfilePercentThreshold / 100;
3325 return;
3326 }
3327
3328 // Profile count is not available, we can use block frequency instead.
3329 BlockFrequency MaxFreq = 0;
3330 for (MachineBasicBlock &MBB : *F) {
3331 BlockFrequency Freq = MBFI->getBlockFreq(&MBB);
3332 if (Freq > MaxFreq)
3333 MaxFreq = Freq;
3334 }
3335
3336 BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
3337 DupThreshold = MaxFreq * ThresholdProb;
3338 UseProfileCount = false;
3339}
3340
3341bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
3342 if (skipFunction(MF.getFunction()))
3343 return false;
3344
3345 // Check for single-block functions and skip them.
3346 if (std::next(MF.begin()) == MF.end())
3347 return false;
3348
3349 F = &MF;
3350 MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
3351 MBFI = std::make_unique<MBFIWrapper>(
3352 getAnalysis<MachineBlockFrequencyInfo>());
3353 MLI = &getAnalysis<MachineLoopInfo>();
3354 TII = MF.getSubtarget().getInstrInfo();
3355 TLI = MF.getSubtarget().getTargetLowering();
3356 MPDT = nullptr;
3357 PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
3358
3359 initDupThreshold();
3360
3361 // Initialize PreferredLoopExit to nullptr here since it may never be set if
3362 // there are no MachineLoops.
3363 PreferredLoopExit = nullptr;
3364
3365 assert(BlockToChain.empty() &&
3366 "BlockToChain map should be empty before starting placement.");
3367 assert(ComputedEdges.empty() &&
3368 "Computed Edge map should be empty before starting placement.");
3369
3370 unsigned TailDupSize = TailDupPlacementThreshold;
3371 // If only the aggressive threshold is explicitly set, use it.
3372 if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 &&
3373 TailDupPlacementThreshold.getNumOccurrences() == 0)
3375
3376 TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
3377 // For aggressive optimization, we can adjust some thresholds to be less
3378 // conservative.
3379 if (PassConfig->getOptLevel() >= CodeGenOptLevel::Aggressive) {
3380 // At O3 we should be more willing to copy blocks for tail duplication. This
3381 // increases size pressure, so we only do it at O3
3382 // Do this unless only the regular threshold is explicitly set.
3383 if (TailDupPlacementThreshold.getNumOccurrences() == 0 ||
3384 TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0)
3386 }
3387
3388 // If there's no threshold provided through options, query the target
3389 // information for a threshold instead.
3390 if (TailDupPlacementThreshold.getNumOccurrences() == 0 &&
3391 (PassConfig->getOptLevel() < CodeGenOptLevel::Aggressive ||
3392 TailDupPlacementAggressiveThreshold.getNumOccurrences() == 0))
3393 TailDupSize = TII->getTailDuplicateSize(PassConfig->getOptLevel());
3394
3395 if (allowTailDupPlacement()) {
3396 MPDT = &getAnalysis<MachinePostDominatorTree>();
3397 bool OptForSize = MF.getFunction().hasOptSize() ||
3398 llvm::shouldOptimizeForSize(&MF, PSI, &MBFI->getMBFI());
3399 if (OptForSize)
3400 TailDupSize = 1;
3401 bool PreRegAlloc = false;
3402 TailDup.initMF(MF, PreRegAlloc, MBPI, MBFI.get(), PSI,
3403 /* LayoutMode */ true, TailDupSize);
3404 precomputeTriangleChains();
3405 }
3406
3407 buildCFGChains();
3408
3409 // Changing the layout can create new tail merging opportunities.
3410 // TailMerge can create jump into if branches that make CFG irreducible for
3411 // HW that requires structured CFG.
3412 bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
3413 PassConfig->getEnableTailMerge() &&
3415 // No tail merging opportunities if the block number is less than four.
3416 if (MF.size() > 3 && EnableTailMerge) {
3417 unsigned TailMergeSize = TailDupSize + 1;
3418 BranchFolder BF(/*DefaultEnableTailMerge=*/true, /*CommonHoist=*/false,
3419 *MBFI, *MBPI, PSI, TailMergeSize);
3420
3421 if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(), MLI,
3422 /*AfterPlacement=*/true)) {
3423 // Redo the layout if tail merging creates/removes/moves blocks.
3424 BlockToChain.clear();
3425 ComputedEdges.clear();
3426 // Must redo the post-dominator tree if blocks were changed.
3427 if (MPDT)
3428 MPDT->runOnMachineFunction(MF);
3429 ChainAllocator.DestroyAll();
3430 buildCFGChains();
3431 }
3432 }
3433
3434 // Apply a post-processing optimizing block placement.
3435 if (MF.size() >= 3 && EnableExtTspBlockPlacement &&
3437 // Find a new placement and modify the layout of the blocks in the function.
3438 applyExtTsp();
3439
3440 // Re-create CFG chain so that we can optimizeBranches and alignBlocks.
3441 createCFGChainExtTsp();
3442 }
3443
3444 optimizeBranches();
3445 alignBlocks();
3446
3447 BlockToChain.clear();
3448 ComputedEdges.clear();
3449 ChainAllocator.DestroyAll();
3450
3451 bool HasMaxBytesOverride =
3452 MaxBytesForAlignmentOverride.getNumOccurrences() > 0;
3453
3454 if (AlignAllBlock)
3455 // Align all of the blocks in the function to a specific alignment.
3456 for (MachineBasicBlock &MBB : MF) {
3457 if (HasMaxBytesOverride)
3460 else
3462 }
3463 else if (AlignAllNonFallThruBlocks) {
3464 // Align all of the blocks that have no fall-through predecessors to a
3465 // specific alignment.
3466 for (auto MBI = std::next(MF.begin()), MBE = MF.end(); MBI != MBE; ++MBI) {
3467 auto LayoutPred = std::prev(MBI);
3468 if (!LayoutPred->isSuccessor(&*MBI)) {
3469 if (HasMaxBytesOverride)
3470 MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks),
3472 else
3473 MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks));
3474 }
3475 }
3476 }
3478 (ViewBlockFreqFuncName.empty() ||
3479 F->getFunction().getName().equals(ViewBlockFreqFuncName))) {
3481 MF.RenumberBlocks();
3482 MBFI->view("MBP." + MF.getName(), false);
3483 }
3484
3485 // We always return true as we have no way to track whether the final order
3486 // differs from the original order.
3487 return true;
3488}
3489
3490void MachineBlockPlacement::applyExtTsp() {
3491 // Prepare data; blocks are indexed by their index in the current ordering.
3493 BlockIndex.reserve(F->size());
3494 std::vector<const MachineBasicBlock *> CurrentBlockOrder;
3495 CurrentBlockOrder.reserve(F->size());
3496 size_t NumBlocks = 0;
3497 for (const MachineBasicBlock &MBB : *F) {
3498 BlockIndex[&MBB] = NumBlocks++;
3499 CurrentBlockOrder.push_back(&MBB);
3500 }
3501
3502 auto BlockSizes = std::vector<uint64_t>(F->size());
3503 auto BlockCounts = std::vector<uint64_t>(F->size());
3504 std::vector<codelayout::EdgeCount> JumpCounts;
3505 for (MachineBasicBlock &MBB : *F) {
3506 // Getting the block frequency.
3507 BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
3508 BlockCounts[BlockIndex[&MBB]] = BlockFreq.getFrequency();
3509 // Getting the block size:
3510 // - approximate the size of an instruction by 4 bytes, and
3511 // - ignore debug instructions.
3512 // Note: getting the exact size of each block is target-dependent and can be
3513 // done by extending the interface of MCCodeEmitter. Experimentally we do
3514 // not see a perf improvement with the exact block sizes.
3515 auto NonDbgInsts =
3517 int NumInsts = std::distance(NonDbgInsts.begin(), NonDbgInsts.end());
3518 BlockSizes[BlockIndex[&MBB]] = 4 * NumInsts;
3519 // Getting jump frequencies.
3520 for (MachineBasicBlock *Succ : MBB.successors()) {
3521 auto EP = MBPI->getEdgeProbability(&MBB, Succ);
3522 BlockFrequency JumpFreq = BlockFreq * EP;
3523 JumpCounts.push_back(
3524 {BlockIndex[&MBB], BlockIndex[Succ], JumpFreq.getFrequency()});
3525 }
3526 }
3527
3528 LLVM_DEBUG(dbgs() << "Applying ext-tsp layout for |V| = " << F->size()
3529 << " with profile = " << F->getFunction().hasProfileData()
3530 << " (" << F->getName().str() << ")"
3531 << "\n");
3532 LLVM_DEBUG(
3533 dbgs() << format(" original layout score: %0.2f\n",
3534 calcExtTspScore(BlockSizes, BlockCounts, JumpCounts)));
3535
3536 // Run the layout algorithm.
3537 auto NewOrder = computeExtTspLayout(BlockSizes, BlockCounts, JumpCounts);
3538 std::vector<const MachineBasicBlock *> NewBlockOrder;
3539 NewBlockOrder.reserve(F->size());
3540 for (uint64_t Node : NewOrder) {
3541 NewBlockOrder.push_back(CurrentBlockOrder[Node]);
3542 }
3543 LLVM_DEBUG(dbgs() << format(" optimized layout score: %0.2f\n",
3544 calcExtTspScore(NewOrder, BlockSizes, BlockCounts,
3545 JumpCounts)));
3546
3547 // Assign new block order.
3548 assignBlockOrder(NewBlockOrder);
3549}
3550
3551void MachineBlockPlacement::assignBlockOrder(
3552 const std::vector<const MachineBasicBlock *> &NewBlockOrder) {
3553 assert(F->size() == NewBlockOrder.size() && "Incorrect size of block order");
3554 F->RenumberBlocks();
3555
3556 bool HasChanges = false;
3557 for (size_t I = 0; I < NewBlockOrder.size(); I++) {
3558 if (NewBlockOrder[I] != F->getBlockNumbered(I)) {
3559 HasChanges = true;
3560 break;
3561 }
3562 }
3563 // Stop early if the new block order is identical to the existing one.
3564 if (!HasChanges)
3565 return;
3566
3567 SmallVector<MachineBasicBlock *, 4> PrevFallThroughs(F->getNumBlockIDs());
3568 for (auto &MBB : *F) {
3569 PrevFallThroughs[MBB.getNumber()] = MBB.getFallThrough();
3570 }
3571
3572 // Sort basic blocks in the function according to the computed order.
3574 for (const MachineBasicBlock *MBB : NewBlockOrder) {
3575 NewIndex[MBB] = NewIndex.size();
3576 }
3577 F->sort([&](MachineBasicBlock &L, MachineBasicBlock &R) {
3578 return NewIndex[&L] < NewIndex[&R];
3579 });
3580
3581 // Update basic block branches by inserting explicit fallthrough branches
3582 // when required and re-optimize branches when possible.
3583 const TargetInstrInfo *TII = F->getSubtarget().getInstrInfo();
3585 for (auto &MBB : *F) {
3586 MachineFunction::iterator NextMBB = std::next(MBB.getIterator());
3588 auto *FTMBB = PrevFallThroughs[MBB.getNumber()];
3589 // If this block had a fallthrough before we need an explicit unconditional
3590 // branch to that block if the fallthrough block is not adjacent to the
3591 // block in the new order.
3592 if (FTMBB && (NextMBB == EndIt || &*NextMBB != FTMBB)) {
3593 TII->insertUnconditionalBranch(MBB, FTMBB, MBB.findBranchDebugLoc());
3594 }
3595
3596 // It might be possible to optimize branches by flipping the condition.
3597 Cond.clear();
3598 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
3599 if (TII->analyzeBranch(MBB, TBB, FBB, Cond))
3600 continue;
3601 MBB.updateTerminator(FTMBB);
3602 }
3603
3604#ifndef NDEBUG
3605 // Make sure we correctly constructed all branches.
3606 F->verify(this, "After optimized block reordering");
3607#endif
3608}
3609
3610void MachineBlockPlacement::createCFGChainExtTsp() {
3611 BlockToChain.clear();
3612 ComputedEdges.clear();
3613 ChainAllocator.DestroyAll();
3614
3615 MachineBasicBlock *HeadBB = &F->front();
3616 BlockChain *FunctionChain =
3617 new (ChainAllocator.Allocate()) BlockChain(BlockToChain, HeadBB);
3618
3619 for (MachineBasicBlock &MBB : *F) {
3620 if (HeadBB == &MBB)
3621 continue; // Ignore head of the chain
3622 FunctionChain->merge(&MBB, nullptr);
3623 }
3624}
3625
3626namespace {
3627
3628/// A pass to compute block placement statistics.
3629///
3630/// A separate pass to compute interesting statistics for evaluating block
3631/// placement. This is separate from the actual placement pass so that they can
3632/// be computed in the absence of any placement transformations or when using
3633/// alternative placement strategies.
3634class MachineBlockPlacementStats : public MachineFunctionPass {
3635 /// A handle to the branch probability pass.
3636 const MachineBranchProbabilityInfo *MBPI;
3637
3638 /// A handle to the function-wide block frequency pass.
3639 const MachineBlockFrequencyInfo *MBFI;
3640
3641public:
3642 static char ID; // Pass identification, replacement for typeid
3643
3644 MachineBlockPlacementStats() : MachineFunctionPass(ID) {
3646 }
3647
3648 bool runOnMachineFunction(MachineFunction &F) override;
3649
3650 void getAnalysisUsage(AnalysisUsage &AU) const override {
3653 AU.setPreservesAll();
3655 }
3656};
3657
3658} // end anonymous namespace
3659
3660char MachineBlockPlacementStats::ID = 0;
3661
3662char &llvm::MachineBlockPlacementStatsID = MachineBlockPlacementStats::ID;
3663
3664INITIALIZE_PASS_BEGIN(MachineBlockPlacementStats, "block-placement-stats",
3665 "Basic Block Placement Stats", false, false)
3668INITIALIZE_PASS_END(MachineBlockPlacementStats, "block-placement-stats",
3670
3671bool MachineBlockPlacementStats::runOnMachineFunction(MachineFunction &F) {
3672 // Check for single-block functions and skip them.
3673 if (std::next(F.begin()) == F.end())
3674 return false;
3675
3676 if (!isFunctionInPrintList(F.getName()))
3677 return false;
3678
3679 MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
3680 MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
3681
3682 for (MachineBasicBlock &MBB : F) {
3683 BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
3684 Statistic &NumBranches =
3685 (MBB.succ_size() > 1) ? NumCondBranches : NumUncondBranches;
3686 Statistic &BranchTakenFreq =
3687 (MBB.succ_size() > 1) ? CondBranchTakenFreq : UncondBranchTakenFreq;
3688 for (MachineBasicBlock *Succ : MBB.successors()) {
3689 // Skip if this successor is a fallthrough.
3690 if (MBB.isLayoutSuccessor(Succ))
3691 continue;
3692
3693 BlockFrequency EdgeFreq =
3694 BlockFreq * MBPI->getEdgeProbability(&MBB, Succ);
3695 ++NumBranches;
3696 BranchTakenFreq += EdgeFreq.getFrequency();
3697 }
3698 }
3699
3700 return false;
3701}
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
MachineBasicBlock & MBB
This file defines the BumpPtrAllocator interface.
static cl::opt< unsigned > TailMergeSize("tail-merge-size", cl::desc("Min number of instructions to consider tail merging"), cl::init(3), cl::Hidden)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Declares methods and data structures for code layout algorithms.
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:510
static unsigned InstrCount
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
DenseMap< Block *, BlockRelaxAux > Blocks
Definition: ELF_riscv.cpp:496
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
block placement Basic Block Placement Stats
static BranchProbability getAdjustedProbability(BranchProbability OrigProb, BranchProbability AdjustedSumProb)
The helper function returns the branch probability that is adjusted or normalized over the new total ...
static cl::opt< bool > PreciseRotationCost("precise-rotation-cost", cl::desc("Model the cost of loop rotation more " "precisely by using profile data."), cl::init(false), cl::Hidden)
Branch Probability Basic Block Placement
static cl::opt< unsigned > AlignAllBlock("align-all-blocks", cl::desc("Force the alignment of all blocks in the function in log2 format " "(e.g 4 means align on 16B boundaries)."), cl::init(0), cl::Hidden)
static cl::opt< unsigned > TailDupPlacementPenalty("tail-dup-placement-penalty", cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. " "Copying can increase fallthrough, but it also increases icache " "pressure. This parameter controls the penalty to account for that. " "Percent as integer."), cl::init(2), cl::Hidden)
static BranchProbability getLayoutSuccessorProbThreshold(const MachineBasicBlock *BB)
static cl::opt< bool > ForceLoopColdBlock("force-loop-cold-block", cl::desc("Force outlining cold blocks from loops."), cl::init(false), cl::Hidden)
static cl::opt< unsigned > ExitBlockBias("block-placement-exit-block-bias", cl::desc("Block frequency percentage a loop exit block needs " "over the original exit to be considered the new exit."), cl::init(0), cl::Hidden)
static cl::opt< unsigned > AlignAllNonFallThruBlocks("align-all-nofallthru-blocks", cl::desc("Force the alignment of all blocks that have no fall-through " "predecessors (i.e. don't add nops that are executed). In log2 " "format (e.g 4 means align on 16B boundaries)."), cl::init(0), cl::Hidden)
static bool greaterWithBias(BlockFrequency A, BlockFrequency B, uint64_t EntryFreq)
Compare 2 BlockFrequency's with a small penalty for A.
static cl::opt< unsigned > TailDupPlacementThreshold("tail-dup-placement-threshold", cl::desc("Instruction cutoff for tail duplication during layout. " "Tail merging during layout is forced to have a threshold " "that won't conflict."), cl::init(2), cl::Hidden)
static cl::opt< unsigned > JumpInstCost("jump-inst-cost", cl::desc("Cost of jump instructions."), cl::init(1), cl::Hidden)
static cl::opt< unsigned > MisfetchCost("misfetch-cost", cl::desc("Cost that models the probabilistic risk of an instruction " "misfetch due to a jump comparing to falling through, whose cost " "is zero."), cl::init(1), cl::Hidden)
static cl::opt< unsigned > MaxBytesForAlignmentOverride("max-bytes-for-alignment", cl::desc("Forces the maximum bytes allowed to be emitted when padding for " "alignment"), cl::init(0), cl::Hidden)
static cl::opt< bool > BranchFoldPlacement("branch-fold-placement", cl::desc("Perform branch folding during placement. " "Reduces code size."), cl::init(true), cl::Hidden)
static cl::opt< unsigned > TailDupProfilePercentThreshold("tail-dup-profile-percent-threshold", cl::desc("If profile count information is used in tail duplication cost " "model, the gained fall through number from tail duplication " "should be at least this percent of hot count."), cl::init(50), cl::Hidden)
static cl::opt< unsigned > TriangleChainCount("triangle-chain-count", cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the " "triangle tail duplication heuristic to kick in. 0 to disable."), cl::init(2), cl::Hidden)
Branch Probability Basic Block static false std::string getBlockName(const MachineBasicBlock *BB)
Helper to print the name of a MBB.
static bool hasSameSuccessors(MachineBasicBlock &BB, SmallPtrSetImpl< const MachineBasicBlock * > &Successors)
Check if BB has exactly the successors in Successors.
static cl::opt< bool > TailDupPlacement("tail-dup-placement", cl::desc("Perform tail duplication during placement. " "Creates more fallthrough opportunites in " "outline branches."), cl::init(true), cl::Hidden)
static uint64_t countMBBInstruction(MachineBasicBlock *MBB)
static cl::opt< unsigned > LoopToColdBlockRatio("loop-to-cold-block-ratio", cl::desc("Outline loop blocks from loop chain if (frequency of loop) / " "(frequency of block) is greater than this ratio"), cl::init(5), cl::Hidden)
static cl::opt< bool > RenumberBlocksBeforeView("renumber-blocks-before-view", cl::desc("If true, basic blocks are re-numbered before MBP layout is printed " "into a dot graph. Only used when a function is being printed."), cl::init(false), cl::Hidden)
#define DEBUG_TYPE
static cl::opt< unsigned > TailDupPlacementAggressiveThreshold("tail-dup-placement-aggressive-threshold", cl::desc("Instruction cutoff for aggressive tail duplication during " "layout. Used at -O3. Tail merging during layout is forced to " "have a threshold that won't conflict."), cl::init(4), cl::Hidden)
static cl::opt< bool > ForcePreciseRotationCost("force-precise-rotation-cost", cl::desc("Force the use of precise cost " "loop rotation strategy."), cl::init(false), cl::Hidden)
#define P(N)
if(VerifyEach)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:59
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
This file describes how to lower LLVM code to machine code.
Target-Independent Code Generator Pass Configuration Options pass.
unify loop Fixup each natural loop to have a single exit block
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
void setPreservesAll()
Set by analyses that do not transform their input at all.
uint64_t getFrequency() const
Returns the frequency as a fixpoint number scaled by the entry frequency.
static uint64_t getMaxFrequency()
Returns the maximum possible frequency, the saturation value.
static BranchProbability getOne()
uint32_t getNumerator() const
BranchProbability getCompl() const
static BranchProbability getZero()
A debug info location.
Definition: DebugLoc.h:33
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&... Args)
Definition: DenseMap.h:235
bool erase(const KeyT &Val)
Definition: DenseMap.h:329
unsigned size() const
Definition: DenseMap.h:99
bool empty() const
Definition: DenseMap.h:98
iterator begin()
Definition: DenseMap.h:75
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:151
iterator end()
Definition: DenseMap.h:84
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:145
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
void reserve(size_type NumEntries)
Grow the densemap so that it can contain at least NumEntries items before resizing again.
Definition: DenseMap.h:103
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:647
bool hasProfileData(bool IncludeSynthetic=false) const
Return true if the function is annotated with profile data.
Definition: Function.h:291
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
Remove the branching code at the end of the specific MBB.
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const override
Analyze the branching code at the end of MBB, returning true if it cannot be understood (e....
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
Reverses the branch condition of the specified condition list, returning false on success and true if...
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
Insert branch code into the end of the specified MachineBasicBlock.
unsigned pred_size() const
bool isEHPad() const
Returns true if the block is a landing pad.
instr_iterator instr_begin()
MachineBasicBlock * getFallThrough(bool JumpToFallThrough=true)
Return the fallthrough block if the block can implicitly transfer control to the block after it by fa...
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFuncti...
void updateTerminator(MachineBasicBlock *PreviousLayoutSuccessor)
Update the terminator instructions in block to account for changes to block layout which may have bee...
bool canFallThrough()
Return true if the block can implicitly transfer control to the block after it by falling off the end...
unsigned succ_size() const
void setAlignment(Align A)
Set alignment of the basic block.
bool isEntryBlock() const
Returns true if this is the entry block of the function.
succ_reverse_iterator succ_rbegin()
bool isLayoutSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB will be emitted immediately after this block, such that if this bloc...
instr_iterator instr_end()
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
DebugLoc findBranchDebugLoc()
Find and return the merged DebugLoc of the branch instructions of the block.
iterator_range< succ_iterator > successors()
bool isSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB is a successor of this block.
iterator_range< pred_iterator > predecessors()
StringRef getName() const
Return the name of the corresponding LLVM basic block, or an empty string.
MachineBlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate machine basic b...
BlockFrequency getBlockFreq(const MachineBasicBlock *MBB) const
getblockFreq - Return block frequency.
BranchProbability getEdgeProbability(const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
unsigned size() const
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Representation of each machine instruction.
Definition: MachineInstr.h:68
MachineLoop * getLoopFor(const MachineBasicBlock *BB) const
Return the innermost loop that BB lives in.
MachinePostDominatorTree - an analysis pass wrapper for DominatorTree used to compute the post-domina...
bool dominates(const MachineDomTreeNode *A, const MachineDomTreeNode *B) const
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:307
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
An analysis pass based on legacy pass manager to deliver ProfileSummaryInfo.
Analysis providing profile information.
uint64_t getOrCompHotCountThreshold() const
Returns HotCountThreshold if set.
size_type size() const
Definition: SmallPtrSet.h:93
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:345
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:384
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:366
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:451
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:577
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:941
typename SuperClass::const_iterator const_iterator
Definition: SmallVector.h:582
typename SuperClass::iterator iterator
Definition: SmallVector.h:581
void push_back(const T &Elt)
Definition: SmallVector.h:416
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1200
A BumpPtrAllocator that allows only elements of a specific type to be allocated.
Definition: Allocator.h:382
T * Allocate(size_t num=1)
Allocate space for an array of objects without constructing them.
Definition: Allocator.h:432
void DestroyAll()
Call the destructor of each allocated object and deallocate all but the current slab and reset the cu...
Definition: Allocator.h:403
Utility class to perform tail duplication.
void initMF(MachineFunction &MF, bool PreRegAlloc, const MachineBranchProbabilityInfo *MBPI, MBFIWrapper *MBFI, ProfileSummaryInfo *PSI, bool LayoutMode, unsigned TailDupSize=0)
Prepare to run on a specific machine function.
bool tailDuplicateAndUpdate(bool IsSimple, MachineBasicBlock *MBB, MachineBasicBlock *ForcedLayoutPred, SmallVectorImpl< MachineBasicBlock * > *DuplicatedPreds=nullptr, function_ref< void(MachineBasicBlock *)> *RemovalCallback=nullptr, SmallVectorImpl< MachineBasicBlock * > *CandidatePtr=nullptr)
Tail duplicate a single basic block into its predecessors, and then clean up.
static bool isSimpleBB(MachineBasicBlock *TailBB)
True if this BB has only one unconditional jump.
bool canTailDuplicate(MachineBasicBlock *TailBB, MachineBasicBlock *PredBB)
Returns true if TailBB can successfully be duplicated into PredBB.
bool shouldTailDuplicate(bool IsSimple, MachineBasicBlock &TailBB)
Determine if it is profitable to duplicate this block.
TargetInstrInfo - Interface to description of machine instruction set.
This base class for TargetLowering contains the SelectionDAG-independent parts that can be used from ...
virtual unsigned getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const
Return the maximum amount of bytes allowed to be emitted when padding for alignment.
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
virtual bool alignLoopsWithOptSize() const
Should loops be aligned even when the function is marked OptSize (but not MinSize).
bool requiresStructuredCFG() const
Target-Independent Code Generator Pass Configuration Options.
bool getEnableTailMerge() const
CodeGenOptLevel getOptLevel() const
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
virtual const TargetInstrInfo * getInstrInfo() const
virtual const TargetLowering * getTargetLowering() const
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
Definition: ilist_node.h:82
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:642
#define UINT64_MAX
Definition: DataTypes.h:77
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:445
double calcExtTspScore(ArrayRef< uint64_t > Order, ArrayRef< uint64_t > NodeSizes, ArrayRef< uint64_t > NodeCounts, ArrayRef< EdgeCount > EdgeCounts)
Estimate the "quality" of a given node order in CFG.
std::vector< uint64_t > computeExtTspLayout(ArrayRef< uint64_t > NodeSizes, ArrayRef< uint64_t > NodeCounts, ArrayRef< EdgeCount > EdgeCounts)
Find a layout of nodes (basic blocks) of a given CFG optimizing jump locality and thus processor I-ca...
void append(SmallVectorImpl< char > &path, const Twine &a, const Twine &b="", const Twine &c="", const Twine &d="")
Append to path.
Definition: Path.cpp:458
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
void stable_sort(R &&Range)
Definition: STLExtras.h:1971
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1747
cl::opt< bool > ApplyExtTspWithoutProfile
bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
void initializeMachineBlockPlacementStatsPass(PassRegistry &)
cl::opt< unsigned > ProfileLikelyProb
cl::opt< std::string > ViewBlockFreqFuncName("view-bfi-func-name", cl::Hidden, cl::desc("The option to specify " "the name of the function " "whose CFG will be displayed."))
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:429
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
cl::opt< GVDAGType > ViewBlockLayoutWithBFI("view-block-layout-with-bfi", cl::Hidden, cl::desc("Pop up a window to show a dag displaying MBP layout and associated " "block frequencies of the CFG."), cl::values(clEnumValN(GVDT_None, "none", "do not display graphs."), clEnumValN(GVDT_Fraction, "fraction", "display a graph using the " "fractional block frequency representation."), clEnumValN(GVDT_Integer, "integer", "display a graph using the raw " "integer fractional block frequency representation."), clEnumValN(GVDT_Count, "count", "display a graph using the real " "profile count if available.")))
bool isFunctionInPrintList(StringRef FunctionName)
auto instructionsWithoutDebug(IterT It, IterT End, bool SkipPseudoOp=true)
Construct a range iterator which begins at It and moves forwards until End is reached,...
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
void erase_value(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2029
cl::opt< unsigned > StaticLikelyProb
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1919
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition: STLExtras.h:2021
char & MachineBlockPlacementID
MachineBlockPlacement - This pass places basic blocks based on branch probabilities.
cl::opt< bool > EnableExtTspBlockPlacement
char & MachineBlockPlacementStatsID
MachineBlockPlacementStats - This pass collects statistics about the basic block placement using bran...
void initializeMachineBlockPlacementPass(PassRegistry &)
Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39