LLVM 23.0.0git
MachineBlockPlacement.cpp
Go to the documentation of this file.
1//===- MachineBlockPlacement.cpp - Basic Block Code Layout optimization ---===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements basic block placement transformations using the CFG
10// structure and branch probability estimates.
11//
12// The pass strives to preserve the structure of the CFG (that is, retain
13// a topological ordering of basic blocks) in the absence of a *strong* signal
14// to the contrary from probabilities. However, within the CFG structure, it
15// attempts to choose an ordering which favors placing more likely sequences of
16// blocks adjacent to each other.
17//
18// The algorithm works from the inner-most loop within a function outward, and
19// at each stage walks through the basic blocks, trying to coalesce them into
20// sequential chains where allowed by the CFG (or demanded by heavy
21// probabilities). Finally, it walks the blocks in topological order, and the
22// first time it reaches a chain of basic blocks, it schedules them in the
23// function in-order.
24//
25//===----------------------------------------------------------------------===//
26
28#include "BranchFolding.h"
29#include "llvm/ADT/ArrayRef.h"
30#include "llvm/ADT/DenseMap.h"
31#include "llvm/ADT/STLExtras.h"
32#include "llvm/ADT/SetVector.h"
35#include "llvm/ADT/Statistic.h"
52#include "llvm/IR/DebugLoc.h"
53#include "llvm/IR/Function.h"
54#include "llvm/IR/PrintPasses.h"
56#include "llvm/Pass.h"
63#include "llvm/Support/Debug.h"
67#include <algorithm>
68#include <cassert>
69#include <cstdint>
70#include <iterator>
71#include <memory>
72#include <string>
73#include <tuple>
74#include <utility>
75#include <vector>
76
77using namespace llvm;
78
79#define DEBUG_TYPE "block-placement"
80
81STATISTIC(NumCondBranches, "Number of conditional branches");
82STATISTIC(NumUncondBranches, "Number of unconditional branches");
83STATISTIC(CondBranchTakenFreq,
84 "Potential frequency of taking conditional branches");
85STATISTIC(UncondBranchTakenFreq,
86 "Potential frequency of taking unconditional branches");
87
89 "align-all-blocks",
90 cl::desc("Force the alignment of all blocks in the function in log2 format "
91 "(e.g 4 means align on 16B boundaries)."),
93
95 "align-all-nofallthru-blocks",
96 cl::desc("Force the alignment of all blocks that have no fall-through "
97 "predecessors (i.e. don't add nops that are executed). In log2 "
98 "format (e.g 4 means align on 16B boundaries)."),
100
102 "max-bytes-for-alignment",
103 cl::desc("Forces the maximum bytes allowed to be emitted when padding for "
104 "alignment"),
105 cl::init(0), cl::Hidden);
106
108 "block-placement-predecessor-limit",
109 cl::desc("For blocks with more predecessors, certain layout optimizations"
110 "will be disabled to prevent quadratic compile time."),
111 cl::init(1000), cl::Hidden);
112
113// FIXME: Find a good default for this flag and remove the flag.
115 "block-placement-exit-block-bias",
116 cl::desc("Block frequency percentage a loop exit block needs "
117 "over the original exit to be considered the new exit."),
118 cl::init(0), cl::Hidden);
119
120// Definition:
121// - Outlining: placement of a basic block outside the chain or hot path.
122
124 "loop-to-cold-block-ratio",
125 cl::desc("Outline loop blocks from loop chain if (frequency of loop) / "
126 "(frequency of block) is greater than this ratio"),
127 cl::init(5), cl::Hidden);
128
129static cl::opt<bool>
130 ForceLoopColdBlock("force-loop-cold-block",
131 cl::desc("Force outlining cold blocks from loops."),
132 cl::init(false), cl::Hidden);
133
134static cl::opt<bool>
135 PreciseRotationCost("precise-rotation-cost",
136 cl::desc("Model the cost of loop rotation more "
137 "precisely by using profile data."),
138 cl::init(false), cl::Hidden);
139
140static cl::opt<bool>
141 ForcePreciseRotationCost("force-precise-rotation-cost",
142 cl::desc("Force the use of precise cost "
143 "loop rotation strategy."),
144 cl::init(false), cl::Hidden);
145
147 "misfetch-cost",
148 cl::desc("Cost that models the probabilistic risk of an instruction "
149 "misfetch due to a jump comparing to falling through, whose cost "
150 "is zero."),
151 cl::init(1), cl::Hidden);
152
153static cl::opt<unsigned> JumpInstCost("jump-inst-cost",
154 cl::desc("Cost of jump instructions."),
155 cl::init(1), cl::Hidden);
156static cl::opt<bool>
157 TailDupPlacement("tail-dup-placement",
158 cl::desc("Perform tail duplication during placement. "
159 "Creates more fallthrough opportunities in "
160 "outline branches."),
161 cl::init(true), cl::Hidden);
162
163static cl::opt<bool>
164 BranchFoldPlacement("branch-fold-placement",
165 cl::desc("Perform branch folding during placement. "
166 "Reduces code size."),
167 cl::init(true), cl::Hidden);
168
169// Heuristic for tail duplication.
171 "tail-dup-placement-threshold",
172 cl::desc("Instruction cutoff for tail duplication during layout. "
173 "Tail merging during layout is forced to have a threshold "
174 "that won't conflict."),
175 cl::init(2), cl::Hidden);
176
177// Heuristic for aggressive tail duplication.
179 "tail-dup-placement-aggressive-threshold",
180 cl::desc("Instruction cutoff for aggressive tail duplication during "
181 "layout. Used at -O3. Tail merging during layout is forced to "
182 "have a threshold that won't conflict."),
183 cl::init(4), cl::Hidden);
184
185// Heuristic for tail duplication.
187 "tail-dup-placement-penalty",
188 cl::desc(
189 "Cost penalty for blocks that can avoid breaking CFG by copying. "
190 "Copying can increase fallthrough, but it also increases icache "
191 "pressure. This parameter controls the penalty to account for that. "
192 "Percent as integer."),
193 cl::init(2), cl::Hidden);
194
195// Heuristic for tail duplication if profile count is used in cost model.
197 "tail-dup-profile-percent-threshold",
198 cl::desc("If profile count information is used in tail duplication cost "
199 "model, the gained fall through number from tail duplication "
200 "should be at least this percent of hot count."),
201 cl::init(50), cl::Hidden);
202
203// Heuristic for triangle chains.
205 "triangle-chain-count",
206 cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the "
207 "triangle tail duplication heuristic to kick in. 0 to disable."),
208 cl::init(2), cl::Hidden);
209
210// Use case: When block layout is visualized after MBP pass, the basic blocks
211// are labeled in layout order; meanwhile blocks could be numbered in a
212// different order. It's hard to map between the graph and pass output.
213// With this option on, the basic blocks are renumbered in function layout
214// order. For debugging only.
216 "renumber-blocks-before-view",
217 cl::desc(
218 "If true, basic blocks are re-numbered before MBP layout is printed "
219 "into a dot graph. Only used when a function is being printed."),
220 cl::init(false), cl::Hidden);
221
223 "ext-tsp-block-placement-max-blocks",
224 cl::desc("Maximum number of basic blocks in a function to run ext-TSP "
225 "block placement."),
226 cl::init(UINT_MAX), cl::Hidden);
227
228// Apply the ext-tsp algorithm minimizing the size of a binary.
229static cl::opt<bool>
230 ApplyExtTspForSize("apply-ext-tsp-for-size", cl::init(false), cl::Hidden,
231 cl::desc("Use ext-tsp for size-aware block placement."));
232
233namespace llvm {
238
239// Internal option used to control BFI display only after MBP pass.
240// Defined in CodeGen/MachineBlockFrequencyInfo.cpp:
241// -view-block-layout-with-bfi=
243
244// Command line option to specify the name of the function for CFG dump
245// Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name=
247} // namespace llvm
248
249namespace {
250
251class BlockChain;
252
253/// Type for our function-wide basic block -> block chain mapping.
254using BlockToChainMapType = DenseMap<const MachineBasicBlock *, BlockChain *>;
255
256/// A chain of blocks which will be laid out contiguously.
257///
258/// This is the datastructure representing a chain of consecutive blocks that
259/// are profitable to layout together in order to maximize fallthrough
260/// probabilities and code locality. We also can use a block chain to represent
261/// a sequence of basic blocks which have some external (correctness)
262/// requirement for sequential layout.
263///
264/// Chains can be built around a single basic block and can be merged to grow
265/// them. They participate in a block-to-chain mapping, which is updated
266/// automatically as chains are merged together.
267class BlockChain {
268 /// The sequence of blocks belonging to this chain.
269 ///
270 /// This is the sequence of blocks for a particular chain. These will be laid
271 /// out in-order within the function.
273
274 /// A handle to the function-wide basic block to block chain mapping.
275 ///
276 /// This is retained in each block chain to simplify the computation of child
277 /// block chains for SCC-formation and iteration. We store the edges to child
278 /// basic blocks, and map them back to their associated chains using this
279 /// structure.
280 BlockToChainMapType &BlockToChain;
281
282public:
283 /// Construct a new BlockChain.
284 ///
285 /// This builds a new block chain representing a single basic block in the
286 /// function. It also registers itself as the chain that block participates
287 /// in with the BlockToChain mapping.
288 BlockChain(BlockToChainMapType &BlockToChain, MachineBasicBlock *BB)
289 : Blocks(1, BB), BlockToChain(BlockToChain) {
290 assert(BB && "Cannot create a chain with a null basic block");
291 BlockToChain[BB] = this;
292 }
293
294 /// Iterator over blocks within the chain.
297
298 /// Beginning of blocks within the chain.
299 iterator begin() { return Blocks.begin(); }
300 const_iterator begin() const { return Blocks.begin(); }
301
302 /// End of blocks within the chain.
303 iterator end() { return Blocks.end(); }
304 const_iterator end() const { return Blocks.end(); }
305
306 bool remove(MachineBasicBlock *BB) {
307 for (iterator i = begin(); i != end(); ++i) {
308 if (*i == BB) {
309 Blocks.erase(i);
310 return true;
311 }
312 }
313 return false;
314 }
315
316 /// Merge a block chain into this one.
317 ///
318 /// This routine merges a block chain into this one. It takes care of forming
319 /// a contiguous sequence of basic blocks, updating the edge list, and
320 /// updating the block -> chain mapping. It does not free or tear down the
321 /// old chain, but the old chain's block list is no longer valid.
322 void merge(MachineBasicBlock *BB, BlockChain *Chain) {
323 assert(BB && "Can't merge a null block.");
324 assert(!Blocks.empty() && "Can't merge into an empty chain.");
325
326 // Fast path in case we don't have a chain already.
327 if (!Chain) {
328 assert(!BlockToChain[BB] &&
329 "Passed chain is null, but BB has entry in BlockToChain.");
330 Blocks.push_back(BB);
331 BlockToChain[BB] = this;
332 return;
333 }
334
335 assert(BB == *Chain->begin() && "Passed BB is not head of Chain.");
336 assert(Chain->begin() != Chain->end());
337
338 // Update the incoming blocks to point to this chain, and add them to the
339 // chain structure.
340 for (MachineBasicBlock *ChainBB : *Chain) {
341 Blocks.push_back(ChainBB);
342 assert(BlockToChain[ChainBB] == Chain && "Incoming blocks not in chain.");
343 BlockToChain[ChainBB] = this;
344 }
345 }
346
347#ifndef NDEBUG
348 /// Dump the blocks in this chain.
349 LLVM_DUMP_METHOD void dump() {
350 for (MachineBasicBlock *MBB : *this)
351 MBB->dump();
352 }
353#endif // NDEBUG
354
355 /// Count of predecessors of any block within the chain which have not
356 /// yet been scheduled. In general, we will delay scheduling this chain
357 /// until those predecessors are scheduled (or we find a sufficiently good
358 /// reason to override this heuristic.) Note that when forming loop chains,
359 /// blocks outside the loop are ignored and treated as if they were already
360 /// scheduled.
361 ///
362 /// Note: This field is reinitialized multiple times - once for each loop,
363 /// and then once for the function as a whole.
364 unsigned UnscheduledPredecessors = 0;
365};
366
367class MachineBlockPlacement {
368 /// A type for a block filter set.
369 using BlockFilterSet = SmallSetVector<const MachineBasicBlock *, 16>;
370
371 /// Pair struct containing basic block and taildup profitability
372 struct BlockAndTailDupResult {
373 MachineBasicBlock *BB = nullptr;
374 bool ShouldTailDup;
375 };
376
377 /// Triple struct containing edge weight and the edge.
378 struct WeightedEdge {
379 BlockFrequency Weight;
380 MachineBasicBlock *Src = nullptr;
381 MachineBasicBlock *Dest = nullptr;
382 };
383
384 /// work lists of blocks that are ready to be laid out
387
388 /// Edges that have already been computed as optimal.
389 DenseMap<const MachineBasicBlock *, BlockAndTailDupResult> ComputedEdges;
390
391 /// Machine Function
392 MachineFunction *F = nullptr;
393
394 /// A handle to the branch probability pass.
395 const MachineBranchProbabilityInfo *MBPI = nullptr;
396
397 /// A handle to the function-wide block frequency pass.
398 std::unique_ptr<MBFIWrapper> MBFI;
399
400 /// A handle to the loop info.
401 MachineLoopInfo *MLI = nullptr;
402
403 /// Preferred loop exit.
404 /// Member variable for convenience. It may be removed by duplication deep
405 /// in the call stack.
406 MachineBasicBlock *PreferredLoopExit = nullptr;
407
408 /// A handle to the target's instruction info.
409 const TargetInstrInfo *TII = nullptr;
410
411 /// A handle to the target's lowering info.
412 const TargetLoweringBase *TLI = nullptr;
413
414 /// A handle to the post dominator tree.
415 MachinePostDominatorTree *MPDT = nullptr;
416
417 ProfileSummaryInfo *PSI = nullptr;
418
419 // Tail merging is also determined based on
420 // whether structured CFG is required.
421 bool AllowTailMerge;
422
423 CodeGenOptLevel OptLevel;
424
425 /// Duplicator used to duplicate tails during placement.
426 ///
427 /// Placement decisions can open up new tail duplication opportunities, but
428 /// since tail duplication affects placement decisions of later blocks, it
429 /// must be done inline.
430 TailDuplicator TailDup;
431
432 /// Partial tail duplication threshold.
433 BlockFrequency DupThreshold;
434
435 unsigned TailDupSize;
436
437 /// True: use block profile count to compute tail duplication cost.
438 /// False: use block frequency to compute tail duplication cost.
439 bool UseProfileCount = false;
440
441 /// Allocator and owner of BlockChain structures.
442 ///
443 /// We build BlockChains lazily while processing the loop structure of
444 /// a function. To reduce malloc traffic, we allocate them using this
445 /// slab-like allocator, and destroy them after the pass completes. An
446 /// important guarantee is that this allocator produces stable pointers to
447 /// the chains.
448 SpecificBumpPtrAllocator<BlockChain> ChainAllocator;
449
450 /// Function wide BasicBlock to BlockChain mapping.
451 ///
452 /// This mapping allows efficiently moving from any given basic block to the
453 /// BlockChain it participates in, if any. We use it to, among other things,
454 /// allow implicitly defining edges between chains as the existing edges
455 /// between basic blocks.
456 DenseMap<const MachineBasicBlock *, BlockChain *> BlockToChain;
457
458#ifndef NDEBUG
459 /// The set of basic blocks that have terminators that cannot be fully
460 /// analyzed. These basic blocks cannot be re-ordered safely by
461 /// MachineBlockPlacement, and we must preserve physical layout of these
462 /// blocks and their successors through the pass.
463 SmallPtrSet<MachineBasicBlock *, 4> BlocksWithUnanalyzableExits;
464#endif
465
466 /// Get block profile count or frequency according to UseProfileCount.
467 /// The return value is used to model tail duplication cost.
468 BlockFrequency getBlockCountOrFrequency(const MachineBasicBlock *BB) {
469 if (UseProfileCount) {
470 auto Count = MBFI->getBlockProfileCount(BB);
471 if (Count)
472 return BlockFrequency(*Count);
473 else
474 return BlockFrequency(0);
475 } else
476 return MBFI->getBlockFreq(BB);
477 }
478
479 /// Scale the DupThreshold according to basic block size.
480 BlockFrequency scaleThreshold(MachineBasicBlock *BB);
481 void initTailDupThreshold();
482
483 /// Decrease the UnscheduledPredecessors count for all blocks in chain, and
484 /// if the count goes to 0, add them to the appropriate work list.
485 void markChainSuccessors(const BlockChain &Chain,
486 const MachineBasicBlock *LoopHeaderBB,
487 const BlockFilterSet *BlockFilter = nullptr);
488
489 /// Decrease the UnscheduledPredecessors count for a single block, and
490 /// if the count goes to 0, add them to the appropriate work list.
491 void markBlockSuccessors(const BlockChain &Chain, const MachineBasicBlock *BB,
492 const MachineBasicBlock *LoopHeaderBB,
493 const BlockFilterSet *BlockFilter = nullptr);
494
495 BranchProbability
496 collectViableSuccessors(const MachineBasicBlock *BB, const BlockChain &Chain,
497 const BlockFilterSet *BlockFilter,
498 SmallVector<MachineBasicBlock *, 4> &Successors);
499 bool isBestSuccessor(MachineBasicBlock *BB, MachineBasicBlock *Pred,
500 BlockFilterSet *BlockFilter);
501 void findDuplicateCandidates(SmallVectorImpl<MachineBasicBlock *> &Candidates,
502 MachineBasicBlock *BB,
503 BlockFilterSet *BlockFilter);
504 bool repeatedlyTailDuplicateBlock(
505 MachineBasicBlock *BB, MachineBasicBlock *&LPred,
506 const MachineBasicBlock *LoopHeaderBB, BlockChain &Chain,
507 BlockFilterSet *BlockFilter,
508 MachineFunction::iterator &PrevUnplacedBlockIt,
509 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt);
510 bool
511 maybeTailDuplicateBlock(MachineBasicBlock *BB, MachineBasicBlock *LPred,
512 BlockChain &Chain, BlockFilterSet *BlockFilter,
513 MachineFunction::iterator &PrevUnplacedBlockIt,
514 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt,
515 bool &DuplicatedToLPred);
516 bool hasBetterLayoutPredecessor(const MachineBasicBlock *BB,
517 const MachineBasicBlock *Succ,
518 const BlockChain &SuccChain,
519 BranchProbability SuccProb,
520 BranchProbability RealSuccProb,
521 const BlockChain &Chain,
522 const BlockFilterSet *BlockFilter);
523 BlockAndTailDupResult selectBestSuccessor(const MachineBasicBlock *BB,
524 const BlockChain &Chain,
525 const BlockFilterSet *BlockFilter);
526 MachineBasicBlock *
527 selectBestCandidateBlock(const BlockChain &Chain,
528 SmallVectorImpl<MachineBasicBlock *> &WorkList);
529 MachineBasicBlock *
530 getFirstUnplacedBlock(const BlockChain &PlacedChain,
531 MachineFunction::iterator &PrevUnplacedBlockIt);
532 MachineBasicBlock *
533 getFirstUnplacedBlock(const BlockChain &PlacedChain,
534 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt,
535 const BlockFilterSet *BlockFilter);
536
537 /// Add a basic block to the work list if it is appropriate.
538 ///
539 /// If the optional parameter BlockFilter is provided, only MBB
540 /// present in the set will be added to the worklist. If nullptr
541 /// is provided, no filtering occurs.
542 void fillWorkLists(const MachineBasicBlock *MBB,
543 SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
544 const BlockFilterSet *BlockFilter);
545
546 void buildChain(const MachineBasicBlock *BB, BlockChain &Chain,
547 BlockFilterSet *BlockFilter = nullptr);
548 bool canMoveBottomBlockToTop(const MachineBasicBlock *BottomBlock,
549 const MachineBasicBlock *OldTop);
550 bool hasViableTopFallthrough(const MachineBasicBlock *Top,
551 const BlockFilterSet &LoopBlockSet);
552 BlockFrequency TopFallThroughFreq(const MachineBasicBlock *Top,
553 const BlockFilterSet &LoopBlockSet);
554 BlockFrequency FallThroughGains(const MachineBasicBlock *NewTop,
555 const MachineBasicBlock *OldTop,
556 const MachineBasicBlock *ExitBB,
557 const BlockFilterSet &LoopBlockSet);
558 MachineBasicBlock *findBestLoopTopHelper(MachineBasicBlock *OldTop,
559 const MachineLoop &L,
560 const BlockFilterSet &LoopBlockSet);
561 MachineBasicBlock *findBestLoopTop(const MachineLoop &L,
562 const BlockFilterSet &LoopBlockSet);
563 MachineBasicBlock *findBestLoopExit(const MachineLoop &L,
564 const BlockFilterSet &LoopBlockSet,
565 BlockFrequency &ExitFreq);
566 BlockFilterSet collectLoopBlockSet(const MachineLoop &L);
567 void buildLoopChains(const MachineLoop &L);
568 void rotateLoop(BlockChain &LoopChain, const MachineBasicBlock *ExitingBB,
569 BlockFrequency ExitFreq, const BlockFilterSet &LoopBlockSet);
570 void rotateLoopWithProfile(BlockChain &LoopChain, const MachineLoop &L,
571 const BlockFilterSet &LoopBlockSet);
572 void buildCFGChains();
573 void optimizeBranches();
574 void alignBlocks();
575 /// Returns true if a block should be tail-duplicated to increase fallthrough
576 /// opportunities.
577 bool shouldTailDuplicate(MachineBasicBlock *BB);
578 /// Check the edge frequencies to see if tail duplication will increase
579 /// fallthroughs.
580 bool isProfitableToTailDup(const MachineBasicBlock *BB,
581 const MachineBasicBlock *Succ,
582 BranchProbability QProb, const BlockChain &Chain,
583 const BlockFilterSet *BlockFilter);
584
585 /// Check for a trellis layout.
586 bool isTrellis(const MachineBasicBlock *BB,
587 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
588 const BlockChain &Chain, const BlockFilterSet *BlockFilter);
589
590 /// Get the best successor given a trellis layout.
591 BlockAndTailDupResult getBestTrellisSuccessor(
592 const MachineBasicBlock *BB,
593 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
594 BranchProbability AdjustedSumProb, const BlockChain &Chain,
595 const BlockFilterSet *BlockFilter);
596
597 /// Get the best pair of non-conflicting edges.
598 static std::pair<WeightedEdge, WeightedEdge> getBestNonConflictingEdges(
599 const MachineBasicBlock *BB,
601
602 /// Returns true if a block can tail duplicate into all unplaced
603 /// predecessors. Filters based on loop.
604 bool canTailDuplicateUnplacedPreds(const MachineBasicBlock *BB,
605 MachineBasicBlock *Succ,
606 const BlockChain &Chain,
607 const BlockFilterSet *BlockFilter);
608
609 /// Find chains of triangles to tail-duplicate where a global analysis works,
610 /// but a local analysis would not find them.
611 void precomputeTriangleChains();
612
613 /// Apply a post-processing step optimizing block placement.
614 void applyExtTsp(bool OptForSize);
615
616 /// Modify the existing block placement in the function and adjust all jumps.
617 void assignBlockOrder(const std::vector<const MachineBasicBlock *> &NewOrder);
618
619 /// Create a single CFG chain from the current block order.
620 void createCFGChainExtTsp();
621
622public:
623 MachineBlockPlacement(const MachineBranchProbabilityInfo *MBPI,
624 MachineLoopInfo *MLI, ProfileSummaryInfo *PSI,
625 std::unique_ptr<MBFIWrapper> MBFI,
626 MachinePostDominatorTree *MPDT, bool AllowTailMerge)
627 : MBPI(MBPI), MBFI(std::move(MBFI)), MLI(MLI), MPDT(MPDT), PSI(PSI),
628 AllowTailMerge(AllowTailMerge) {};
629
630 bool run(MachineFunction &F);
631
632 static bool allowTailDupPlacement(MachineFunction &MF) {
634 }
635};
636
637class MachineBlockPlacementLegacy : public MachineFunctionPass {
638public:
639 static char ID; // Pass identification, replacement for typeid
640
641 MachineBlockPlacementLegacy() : MachineFunctionPass(ID) {}
642
643 bool runOnMachineFunction(MachineFunction &MF) override {
644 if (skipFunction(MF.getFunction()))
645 return false;
646
647 auto *MBPI =
648 &getAnalysis<MachineBranchProbabilityInfoWrapperPass>().getMBPI();
649 auto MBFI = std::make_unique<MBFIWrapper>(
650 getAnalysis<MachineBlockFrequencyInfoWrapperPass>().getMBFI());
651 auto *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
652 auto *MPDT = MachineBlockPlacement::allowTailDupPlacement(MF)
653 ? &getAnalysis<MachinePostDominatorTreeWrapperPass>()
654 .getPostDomTree()
655 : nullptr;
656 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
657 auto *PassConfig = &getAnalysis<TargetPassConfig>();
658 bool AllowTailMerge = PassConfig->getEnableTailMerge();
659 return MachineBlockPlacement(MBPI, MLI, PSI, std::move(MBFI), MPDT,
660 AllowTailMerge)
661 .run(MF);
662 }
663
664 void getAnalysisUsage(AnalysisUsage &AU) const override {
665 AU.addRequired<MachineBranchProbabilityInfoWrapperPass>();
666 AU.addRequired<MachineBlockFrequencyInfoWrapperPass>();
668 AU.addRequired<MachinePostDominatorTreeWrapperPass>();
669 AU.addRequired<MachineLoopInfoWrapperPass>();
670 AU.addRequired<ProfileSummaryInfoWrapperPass>();
671 AU.addRequired<TargetPassConfig>();
673 }
674};
675
676} // end anonymous namespace
677
678char MachineBlockPlacementLegacy::ID = 0;
679
680char &llvm::MachineBlockPlacementID = MachineBlockPlacementLegacy::ID;
681
682INITIALIZE_PASS_BEGIN(MachineBlockPlacementLegacy, DEBUG_TYPE,
683 "Branch Probability Basic Block Placement", false, false)
689INITIALIZE_PASS_END(MachineBlockPlacementLegacy, DEBUG_TYPE,
690 "Branch Probability Basic Block Placement", false, false)
691
692#ifndef NDEBUG
693/// Helper to print the name of a MBB.
694///
695/// Only used by debug logging.
696static std::string getBlockName(const MachineBasicBlock *BB) {
697 std::string Result;
698 raw_string_ostream OS(Result);
699 OS << printMBBReference(*BB);
700 OS << " ('" << BB->getName() << "')";
701 return Result;
702}
703#endif
704
705/// Mark a chain's successors as having one fewer preds.
706///
707/// When a chain is being merged into the "placed" chain, this routine will
708/// quickly walk the successors of each block in the chain and mark them as
709/// having one fewer active predecessor. It also adds any successors of this
710/// chain which reach the zero-predecessor state to the appropriate worklist.
711void MachineBlockPlacement::markChainSuccessors(
712 const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
713 const BlockFilterSet *BlockFilter) {
714 // Walk all the blocks in this chain, marking their successors as having
715 // a predecessor placed.
716 for (MachineBasicBlock *MBB : Chain) {
717 markBlockSuccessors(Chain, MBB, LoopHeaderBB, BlockFilter);
718 }
719}
720
721/// Mark a single block's successors as having one fewer preds.
722///
723/// Under normal circumstances, this is only called by markChainSuccessors,
724/// but if a block that was to be placed is completely tail-duplicated away,
725/// and was duplicated into the chain end, we need to redo markBlockSuccessors
726/// for just that block.
727void MachineBlockPlacement::markBlockSuccessors(
728 const BlockChain &Chain, const MachineBasicBlock *MBB,
729 const MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter) {
730 // Add any successors for which this is the only un-placed in-loop
731 // predecessor to the worklist as a viable candidate for CFG-neutral
732 // placement. No subsequent placement of this block will violate the CFG
733 // shape, so we get to use heuristics to choose a favorable placement.
734 for (MachineBasicBlock *Succ : MBB->successors()) {
735 if (BlockFilter && !BlockFilter->count(Succ))
736 continue;
737 BlockChain &SuccChain = *BlockToChain[Succ];
738 // Disregard edges within a fixed chain, or edges to the loop header.
739 if (&Chain == &SuccChain || Succ == LoopHeaderBB)
740 continue;
741
742 // This is a cross-chain edge that is within the loop, so decrement the
743 // loop predecessor count of the destination chain.
744 if (SuccChain.UnscheduledPredecessors == 0 ||
745 --SuccChain.UnscheduledPredecessors > 0)
746 continue;
747
748 auto *NewBB = *SuccChain.begin();
749 if (NewBB->isEHPad())
750 EHPadWorkList.push_back(NewBB);
751 else
752 BlockWorkList.push_back(NewBB);
753 }
754}
755
756/// This helper function collects the set of successors of block
757/// \p BB that are allowed to be its layout successors, and return
758/// the total branch probability of edges from \p BB to those
759/// blocks.
760BranchProbability MachineBlockPlacement::collectViableSuccessors(
761 const MachineBasicBlock *BB, const BlockChain &Chain,
762 const BlockFilterSet *BlockFilter,
763 SmallVector<MachineBasicBlock *, 4> &Successors) {
764 // Adjust edge probabilities by excluding edges pointing to blocks that is
765 // either not in BlockFilter or is already in the current chain. Consider the
766 // following CFG:
767 //
768 // --->A
769 // | / \
770 // | B C
771 // | \ / \
772 // ----D E
773 //
774 // Assume A->C is very hot (>90%), and C->D has a 50% probability, then after
775 // A->C is chosen as a fall-through, D won't be selected as a successor of C
776 // due to CFG constraint (the probability of C->D is not greater than
777 // HotProb to break topo-order). If we exclude E that is not in BlockFilter
778 // when calculating the probability of C->D, D will be selected and we
779 // will get A C D B as the layout of this loop.
780 auto AdjustedSumProb = BranchProbability::getOne();
781 for (MachineBasicBlock *Succ : BB->successors()) {
782 bool SkipSucc = false;
783 if (Succ->isEHPad() || (BlockFilter && !BlockFilter->count(Succ))) {
784 SkipSucc = true;
785 } else {
786 BlockChain *SuccChain = BlockToChain[Succ];
787 if (SuccChain == &Chain) {
788 SkipSucc = true;
789 } else if (Succ != *SuccChain->begin()) {
790 LLVM_DEBUG(dbgs() << " " << getBlockName(Succ)
791 << " -> Mid chain!\n");
792 continue;
793 }
794 }
795 if (SkipSucc)
796 AdjustedSumProb -= MBPI->getEdgeProbability(BB, Succ);
797 else
798 Successors.push_back(Succ);
799 }
800
801 return AdjustedSumProb;
802}
803
804/// The helper function returns the branch probability that is adjusted
805/// or normalized over the new total \p AdjustedSumProb.
806static BranchProbability
808 BranchProbability AdjustedSumProb) {
809 BranchProbability SuccProb;
810 uint32_t SuccProbN = OrigProb.getNumerator();
811 uint32_t SuccProbD = AdjustedSumProb.getNumerator();
812 if (SuccProbN >= SuccProbD)
813 SuccProb = BranchProbability::getOne();
814 else
815 SuccProb = BranchProbability(SuccProbN, SuccProbD);
816
817 return SuccProb;
818}
819
820/// Check if \p BB has exactly the successors in \p Successors.
821static bool
824 if (BB.succ_size() != Successors.size())
825 return false;
826 // We don't want to count self-loops
827 if (Successors.count(&BB))
828 return false;
829 for (MachineBasicBlock *Succ : BB.successors())
830 if (!Successors.count(Succ))
831 return false;
832 return true;
833}
834
835/// Check if a block should be tail duplicated to increase fallthrough
836/// opportunities.
837/// \p BB Block to check.
838bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
839 // Blocks with single successors don't create additional fallthrough
840 // opportunities. Don't duplicate them. TODO: When conditional exits are
841 // analyzable, allow them to be duplicated.
842 bool IsSimple = TailDup.isSimpleBB(BB);
843
844 if (BB->succ_size() == 1)
845 return false;
846 return TailDup.shouldTailDuplicate(IsSimple, *BB);
847}
848
849/// Compare 2 BlockFrequency's with a small penalty for \p A.
850/// In order to be conservative, we apply a X% penalty to account for
851/// increased icache pressure and static heuristics. For small frequencies
852/// we use only the numerators to improve accuracy. For simplicity, we assume
853/// the penalty is less than 100%
854/// TODO(iteratee): Use 64-bit fixed point edge frequencies everywhere.
856 BlockFrequency EntryFreq) {
857 BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
858 BlockFrequency Gain = A - B;
859 return (Gain / ThresholdProb) >= EntryFreq;
860}
861
862/// Check the edge frequencies to see if tail duplication will increase
863/// fallthroughs. It only makes sense to call this function when
864/// \p Succ would not be chosen otherwise. Tail duplication of \p Succ is
865/// always locally profitable if we would have picked \p Succ without
866/// considering duplication.
867bool MachineBlockPlacement::isProfitableToTailDup(
868 const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
869 BranchProbability QProb, const BlockChain &Chain,
870 const BlockFilterSet *BlockFilter) {
871 // We need to do a probability calculation to make sure this is profitable.
872 // First: does succ have a successor that post-dominates? This affects the
873 // calculation. The 2 relevant cases are:
874 // BB BB
875 // | \Qout | \Qout
876 // P| C |P C
877 // = C' = C'
878 // | /Qin | /Qin
879 // | / | /
880 // Succ Succ
881 // / \ | \ V
882 // U/ =V |U \
883 // / \ = D
884 // D E | /
885 // | /
886 // |/
887 // PDom
888 // '=' : Branch taken for that CFG edge
889 // In the second case, Placing Succ while duplicating it into C prevents the
890 // fallthrough of Succ into either D or PDom, because they now have C as an
891 // unplaced predecessor
892
893 // Start by figuring out which case we fall into
894 MachineBasicBlock *PDom = nullptr;
895 SmallVector<MachineBasicBlock *, 4> SuccSuccs;
896 // Only scan the relevant successors
897 auto AdjustedSuccSumProb =
898 collectViableSuccessors(Succ, Chain, BlockFilter, SuccSuccs);
899 BranchProbability PProb = MBPI->getEdgeProbability(BB, Succ);
900 auto BBFreq = MBFI->getBlockFreq(BB);
901 auto SuccFreq = MBFI->getBlockFreq(Succ);
902 BlockFrequency P = BBFreq * PProb;
903 BlockFrequency Qout = BBFreq * QProb;
904 BlockFrequency EntryFreq = MBFI->getEntryFreq();
905 // If there are no more successors, it is profitable to copy, as it strictly
906 // increases fallthrough.
907 if (SuccSuccs.size() == 0)
908 return greaterWithBias(P, Qout, EntryFreq);
909
910 auto BestSuccSucc = BranchProbability::getZero();
911 // Find the PDom or the best Succ if no PDom exists.
912 for (MachineBasicBlock *SuccSucc : SuccSuccs) {
913 auto Prob = MBPI->getEdgeProbability(Succ, SuccSucc);
914 if (Prob > BestSuccSucc)
915 BestSuccSucc = Prob;
916 if (PDom == nullptr)
917 if (MPDT->dominates(SuccSucc, Succ)) {
918 PDom = SuccSucc;
919 break;
920 }
921 }
922 // For the comparisons, we need to know Succ's best incoming edge that isn't
923 // from BB.
924 auto SuccBestPred = BlockFrequency(0);
925 for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
926 if (SuccPred == Succ || SuccPred == BB ||
927 BlockToChain[SuccPred] == &Chain ||
928 (BlockFilter && !BlockFilter->count(SuccPred)))
929 continue;
930 auto Freq =
931 MBFI->getBlockFreq(SuccPred) * MBPI->getEdgeProbability(SuccPred, Succ);
932 if (Freq > SuccBestPred)
933 SuccBestPred = Freq;
934 }
935 // Qin is Succ's best unplaced incoming edge that isn't BB
936 BlockFrequency Qin = SuccBestPred;
937 // If it doesn't have a post-dominating successor, here is the calculation:
938 // BB BB
939 // | \Qout | \
940 // P| C | =
941 // = C' | C
942 // | /Qin | |
943 // | / | C' (+Succ)
944 // Succ Succ /|
945 // / \ | \/ |
946 // U/ =V | == |
947 // / \ | / \|
948 // D E D E
949 // '=' : Branch taken for that CFG edge
950 // Cost in the first case is: P + V
951 // For this calculation, we always assume P > Qout. If Qout > P
952 // The result of this function will be ignored at the caller.
953 // Let F = SuccFreq - Qin
954 // Cost in the second case is: Qout + min(Qin, F) * U + max(Qin, F) * V
955
956 if (PDom == nullptr || !Succ->isSuccessor(PDom)) {
957 BranchProbability UProb = BestSuccSucc;
958 BranchProbability VProb = AdjustedSuccSumProb - UProb;
959 BlockFrequency F = SuccFreq - Qin;
960 BlockFrequency V = SuccFreq * VProb;
961 BlockFrequency QinU = std::min(Qin, F) * UProb;
962 BlockFrequency BaseCost = P + V;
963 BlockFrequency DupCost = Qout + QinU + std::max(Qin, F) * VProb;
964 return greaterWithBias(BaseCost, DupCost, EntryFreq);
965 }
966 BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom);
967 BranchProbability VProb = AdjustedSuccSumProb - UProb;
968 BlockFrequency U = SuccFreq * UProb;
969 BlockFrequency V = SuccFreq * VProb;
970 BlockFrequency F = SuccFreq - Qin;
971 // If there is a post-dominating successor, here is the calculation:
972 // BB BB BB BB
973 // | \Qout | \ | \Qout | \
974 // |P C | = |P C | =
975 // = C' |P C = C' |P C
976 // | /Qin | | | /Qin | |
977 // | / | C' (+Succ) | / | C' (+Succ)
978 // Succ Succ /| Succ Succ /|
979 // | \ V | \/ | | \ V | \/ |
980 // |U \ |U /\ =? |U = |U /\ |
981 // = D = = =?| | D | = =|
982 // | / |/ D | / |/ D
983 // | / | / | = | /
984 // |/ | / |/ | =
985 // Dom Dom Dom Dom
986 // '=' : Branch taken for that CFG edge
987 // The cost for taken branches in the first case is P + U
988 // Let F = SuccFreq - Qin
989 // The cost in the second case (assuming independence), given the layout:
990 // BB, Succ, (C+Succ), D, Dom or the layout:
991 // BB, Succ, D, Dom, (C+Succ)
992 // is Qout + max(F, Qin) * U + min(F, Qin)
993 // compare P + U vs Qout + P * U + Qin.
994 //
995 // The 3rd and 4th cases cover when Dom would be chosen to follow Succ.
996 //
997 // For the 3rd case, the cost is P + 2 * V
998 // For the 4th case, the cost is Qout + min(Qin, F) * U + max(Qin, F) * V + V
999 // We choose 4 over 3 when (P + V) > Qout + min(Qin, F) * U + max(Qin, F) * V
1000 if (UProb > AdjustedSuccSumProb / 2 &&
1001 !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom], UProb, UProb,
1002 Chain, BlockFilter))
1003 // Cases 3 & 4
1004 return greaterWithBias(
1005 (P + V), (Qout + std::max(Qin, F) * VProb + std::min(Qin, F) * UProb),
1006 EntryFreq);
1007 // Cases 1 & 2
1008 return greaterWithBias((P + U),
1009 (Qout + std::min(Qin, F) * AdjustedSuccSumProb +
1010 std::max(Qin, F) * UProb),
1011 EntryFreq);
1012}
1013
1014/// Check for a trellis layout. \p BB is the upper part of a trellis if its
1015/// successors form the lower part of a trellis. A successor set S forms the
1016/// lower part of a trellis if all of the predecessors of S are either in S or
1017/// have all of S as successors. We ignore trellises where BB doesn't have 2
1018/// successors because for fewer than 2, it's trivial, and for 3 or greater they
1019/// are very uncommon and complex to compute optimally. Allowing edges within S
1020/// is not strictly a trellis, but the same algorithm works, so we allow it.
1021bool MachineBlockPlacement::isTrellis(
1022 const MachineBasicBlock *BB,
1023 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
1024 const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
1025 // Technically BB could form a trellis with branching factor higher than 2.
1026 // But that's extremely uncommon.
1027 if (BB->succ_size() != 2 || ViableSuccs.size() != 2)
1028 return false;
1029
1030 SmallPtrSet<const MachineBasicBlock *, 2> Successors(llvm::from_range,
1031 BB->successors());
1032 // To avoid reviewing the same predecessors twice.
1033 SmallPtrSet<const MachineBasicBlock *, 8> SeenPreds;
1034
1035 for (MachineBasicBlock *Succ : ViableSuccs) {
1036 // Compile-time optimization: runtime is quadratic in the number of
1037 // predecessors. For such uncommon cases, exit early.
1038 if (Succ->pred_size() > PredecessorLimit)
1039 return false;
1040
1041 int PredCount = 0;
1042 for (auto *SuccPred : Succ->predecessors()) {
1043 // Allow triangle successors, but don't count them.
1044 if (Successors.count(SuccPred)) {
1045 // Make sure that it is actually a triangle.
1046 for (MachineBasicBlock *CheckSucc : SuccPred->successors())
1047 if (!Successors.count(CheckSucc))
1048 return false;
1049 continue;
1050 }
1051 const BlockChain *PredChain = BlockToChain[SuccPred];
1052 if (SuccPred == BB || (BlockFilter && !BlockFilter->count(SuccPred)) ||
1053 PredChain == &Chain || PredChain == BlockToChain[Succ])
1054 continue;
1055 ++PredCount;
1056 // Perform the successor check only once.
1057 if (!SeenPreds.insert(SuccPred).second)
1058 continue;
1059 if (!hasSameSuccessors(*SuccPred, Successors))
1060 return false;
1061 }
1062 // If one of the successors has only BB as a predecessor, it is not a
1063 // trellis.
1064 if (PredCount < 1)
1065 return false;
1066 }
1067 return true;
1068}
1069
1070/// Pick the highest total weight pair of edges that can both be laid out.
1071/// The edges in \p Edges[0] are assumed to have a different destination than
1072/// the edges in \p Edges[1]. Simple counting shows that the best pair is either
1073/// the individual highest weight edges to the 2 different destinations, or in
1074/// case of a conflict, one of them should be replaced with a 2nd best edge.
1075std::pair<MachineBlockPlacement::WeightedEdge,
1076 MachineBlockPlacement::WeightedEdge>
1077MachineBlockPlacement::getBestNonConflictingEdges(
1078 const MachineBasicBlock *BB,
1080 Edges) {
1081 // Sort the edges, and then for each successor, find the best incoming
1082 // predecessor. If the best incoming predecessors aren't the same,
1083 // then that is clearly the best layout. If there is a conflict, one of the
1084 // successors will have to fallthrough from the second best predecessor. We
1085 // compare which combination is better overall.
1086
1087 // Sort for highest frequency.
1088 auto Cmp = [](WeightedEdge A, WeightedEdge B) { return A.Weight > B.Weight; };
1089
1090 llvm::stable_sort(Edges[0], Cmp);
1091 llvm::stable_sort(Edges[1], Cmp);
1092 auto BestA = Edges[0].begin();
1093 auto BestB = Edges[1].begin();
1094 // Arrange for the correct answer to be in BestA and BestB
1095 // If the 2 best edges don't conflict, the answer is already there.
1096 if (BestA->Src == BestB->Src) {
1097 // Compare the total fallthrough of (Best + Second Best) for both pairs
1098 auto SecondBestA = std::next(BestA);
1099 auto SecondBestB = std::next(BestB);
1100 BlockFrequency BestAScore = BestA->Weight + SecondBestB->Weight;
1101 BlockFrequency BestBScore = BestB->Weight + SecondBestA->Weight;
1102 if (BestAScore < BestBScore)
1103 BestA = SecondBestA;
1104 else
1105 BestB = SecondBestB;
1106 }
1107 // Arrange for the BB edge to be in BestA if it exists.
1108 if (BestB->Src == BB)
1109 std::swap(BestA, BestB);
1110 return std::make_pair(*BestA, *BestB);
1111}
1112
1113/// Get the best successor from \p BB based on \p BB being part of a trellis.
1114/// We only handle trellises with 2 successors, so the algorithm is
1115/// straightforward: Find the best pair of edges that don't conflict. We find
1116/// the best incoming edge for each successor in the trellis. If those conflict,
1117/// we consider which of them should be replaced with the second best.
1118/// Upon return the two best edges will be in \p BestEdges. If one of the edges
1119/// comes from \p BB, it will be in \p BestEdges[0]
1120MachineBlockPlacement::BlockAndTailDupResult
1121MachineBlockPlacement::getBestTrellisSuccessor(
1122 const MachineBasicBlock *BB,
1123 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
1124 BranchProbability AdjustedSumProb, const BlockChain &Chain,
1125 const BlockFilterSet *BlockFilter) {
1126
1127 BlockAndTailDupResult Result = {nullptr, false};
1128 SmallPtrSet<const MachineBasicBlock *, 4> Successors(llvm::from_range,
1129 BB->successors());
1130
1131 // We assume size 2 because it's common. For general n, we would have to do
1132 // the Hungarian algorithm, but it's not worth the complexity because more
1133 // than 2 successors is fairly uncommon, and a trellis even more so.
1134 if (Successors.size() != 2 || ViableSuccs.size() != 2)
1135 return Result;
1136
1137 // Collect the edge frequencies of all edges that form the trellis.
1139 int SuccIndex = 0;
1140 for (auto *Succ : ViableSuccs) {
1141 for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
1142 // Skip any placed predecessors that are not BB
1143 if (SuccPred != BB) {
1144 if (BlockFilter && !BlockFilter->count(SuccPred))
1145 continue;
1146 const BlockChain *SuccPredChain = BlockToChain[SuccPred];
1147 if (SuccPredChain == &Chain || SuccPredChain == BlockToChain[Succ])
1148 continue;
1149 }
1150 BlockFrequency EdgeFreq = MBFI->getBlockFreq(SuccPred) *
1151 MBPI->getEdgeProbability(SuccPred, Succ);
1152 Edges[SuccIndex].push_back({EdgeFreq, SuccPred, Succ});
1153 }
1154 ++SuccIndex;
1155 }
1156
1157 // Pick the best combination of 2 edges from all the edges in the trellis.
1158 WeightedEdge BestA, BestB;
1159 std::tie(BestA, BestB) = getBestNonConflictingEdges(BB, Edges);
1160
1161 if (BestA.Src != BB) {
1162 // If we have a trellis, and BB doesn't have the best fallthrough edges,
1163 // we shouldn't choose any successor. We've already looked and there's a
1164 // better fallthrough edge for all the successors.
1165 LLVM_DEBUG(dbgs() << "Trellis, but not one of the chosen edges.\n");
1166 return Result;
1167 }
1168
1169 // Did we pick the triangle edge? If tail-duplication is profitable, do
1170 // that instead. Otherwise merge the triangle edge now while we know it is
1171 // optimal.
1172 if (BestA.Dest == BestB.Src) {
1173 // The edges are BB->Succ1->Succ2, and we're looking to see if BB->Succ2
1174 // would be better.
1175 MachineBasicBlock *Succ1 = BestA.Dest;
1176 MachineBasicBlock *Succ2 = BestB.Dest;
1177 // Check to see if tail-duplication would be profitable.
1178 if (allowTailDupPlacement(*F) && shouldTailDuplicate(Succ2) &&
1179 canTailDuplicateUnplacedPreds(BB, Succ2, Chain, BlockFilter) &&
1180 isProfitableToTailDup(BB, Succ2, MBPI->getEdgeProbability(BB, Succ1),
1181 Chain, BlockFilter)) {
1182 LLVM_DEBUG(BranchProbability Succ2Prob = getAdjustedProbability(
1183 MBPI->getEdgeProbability(BB, Succ2), AdjustedSumProb);
1184 dbgs() << " Selected: " << getBlockName(Succ2)
1185 << ", probability: " << Succ2Prob
1186 << " (Tail Duplicate)\n");
1187 Result.BB = Succ2;
1188 Result.ShouldTailDup = true;
1189 return Result;
1190 }
1191 }
1192 // We have already computed the optimal edge for the other side of the
1193 // trellis.
1194 ComputedEdges[BestB.Src] = {BestB.Dest, false};
1195
1196 auto TrellisSucc = BestA.Dest;
1197 LLVM_DEBUG(BranchProbability SuccProb = getAdjustedProbability(
1198 MBPI->getEdgeProbability(BB, TrellisSucc), AdjustedSumProb);
1199 dbgs() << " Selected: " << getBlockName(TrellisSucc)
1200 << ", probability: " << SuccProb << " (Trellis)\n");
1201 Result.BB = TrellisSucc;
1202 return Result;
1203}
1204
1205/// When the option allowTailDupPlacement() is on, this method checks if the
1206/// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated
1207/// into all of its unplaced, unfiltered predecessors, that are not BB.
1208bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
1209 const MachineBasicBlock *BB, MachineBasicBlock *Succ,
1210 const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
1211 if (!shouldTailDuplicate(Succ))
1212 return false;
1213
1214 // The result of canTailDuplicate.
1215 bool Duplicate = true;
1216 // Number of possible duplication.
1217 unsigned int NumDup = 0;
1218
1219 // For CFG checking.
1220 SmallPtrSet<const MachineBasicBlock *, 4> Successors(llvm::from_range,
1221 BB->successors());
1222 for (MachineBasicBlock *Pred : Succ->predecessors()) {
1223 // Make sure all unplaced and unfiltered predecessors can be
1224 // tail-duplicated into.
1225 // Skip any blocks that are already placed or not in this loop.
1226 if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred)) ||
1227 (BlockToChain[Pred] == &Chain && !Succ->succ_empty()))
1228 continue;
1229 if (!TailDup.canTailDuplicate(Succ, Pred)) {
1230 if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors))
1231 // This will result in a trellis after tail duplication, so we don't
1232 // need to copy Succ into this predecessor. In the presence
1233 // of a trellis tail duplication can continue to be profitable.
1234 // For example:
1235 // A A
1236 // |\ |\
1237 // | \ | \
1238 // | C | C+BB
1239 // | / | |
1240 // |/ | |
1241 // BB => BB |
1242 // |\ |\/|
1243 // | \ |/\|
1244 // | D | D
1245 // | / | /
1246 // |/ |/
1247 // Succ Succ
1248 //
1249 // After BB was duplicated into C, the layout looks like the one on the
1250 // right. BB and C now have the same successors. When considering
1251 // whether Succ can be duplicated into all its unplaced predecessors, we
1252 // ignore C.
1253 // We can do this because C already has a profitable fallthrough, namely
1254 // D. TODO(iteratee): ignore sufficiently cold predecessors for
1255 // duplication and for this test.
1256 //
1257 // This allows trellises to be laid out in 2 separate chains
1258 // (A,B,Succ,...) and later (C,D,...) This is a reasonable heuristic
1259 // because it allows the creation of 2 fallthrough paths with links
1260 // between them, and we correctly identify the best layout for these
1261 // CFGs. We want to extend trellises that the user created in addition
1262 // to trellises created by tail-duplication, so we just look for the
1263 // CFG.
1264 continue;
1265 Duplicate = false;
1266 continue;
1267 }
1268 NumDup++;
1269 }
1270
1271 // No possible duplication in current filter set.
1272 if (NumDup == 0)
1273 return false;
1274
1275 // If profile information is available, findDuplicateCandidates can do more
1276 // precise benefit analysis.
1277 if (F->getFunction().hasProfileData())
1278 return true;
1279
1280 // This is mainly for function exit BB.
1281 // The integrated tail duplication is really designed for increasing
1282 // fallthrough from predecessors from Succ to its successors. We may need
1283 // other machanism to handle different cases.
1284 if (Succ->succ_empty())
1285 return true;
1286
1287 // Plus the already placed predecessor.
1288 NumDup++;
1289
1290 // If the duplication candidate has more unplaced predecessors than
1291 // successors, the extra duplication can't bring more fallthrough.
1292 //
1293 // Pred1 Pred2 Pred3
1294 // \ | /
1295 // \ | /
1296 // \ | /
1297 // Dup
1298 // / \
1299 // / \
1300 // Succ1 Succ2
1301 //
1302 // In this example Dup has 2 successors and 3 predecessors, duplication of Dup
1303 // can increase the fallthrough from Pred1 to Succ1 and from Pred2 to Succ2,
1304 // but the duplication into Pred3 can't increase fallthrough.
1305 //
1306 // A small number of extra duplication may not hurt too much. We need a better
1307 // heuristic to handle it.
1308 if ((NumDup > Succ->succ_size()) || !Duplicate)
1309 return false;
1310
1311 return true;
1312}
1313
1314/// Find chains of triangles where we believe it would be profitable to
1315/// tail-duplicate them all, but a local analysis would not find them.
1316/// There are 3 ways this can be profitable:
1317/// 1) The post-dominators marked 50% are actually taken 55% (This shrinks with
1318/// longer chains)
1319/// 2) The chains are statically correlated. Branch probabilities have a very
1320/// U-shaped distribution.
1321/// [http://nrs.harvard.edu/urn-3:HUL.InstRepos:24015805]
1322/// If the branches in a chain are likely to be from the same side of the
1323/// distribution as their predecessor, but are independent at runtime, this
1324/// transformation is profitable. (Because the cost of being wrong is a small
1325/// fixed cost, unlike the standard triangle layout where the cost of being
1326/// wrong scales with the # of triangles.)
1327/// 3) The chains are dynamically correlated. If the probability that a previous
1328/// branch was taken positively influences whether the next branch will be
1329/// taken
1330/// We believe that 2 and 3 are common enough to justify the small margin in 1.
1331void MachineBlockPlacement::precomputeTriangleChains() {
1332 struct TriangleChain {
1333 std::vector<MachineBasicBlock *> Edges;
1334
1335 TriangleChain(MachineBasicBlock *src, MachineBasicBlock *dst)
1336 : Edges({src, dst}) {}
1337
1338 void append(MachineBasicBlock *dst) {
1339 assert(getKey()->isSuccessor(dst) &&
1340 "Attempting to append a block that is not a successor.");
1341 Edges.push_back(dst);
1342 }
1343
1344 unsigned count() const { return Edges.size() - 1; }
1345
1346 MachineBasicBlock *getKey() const { return Edges.back(); }
1347 };
1348
1349 if (TriangleChainCount == 0)
1350 return;
1351
1352 LLVM_DEBUG(dbgs() << "Pre-computing triangle chains.\n");
1353 // Map from last block to the chain that contains it. This allows us to extend
1354 // chains as we find new triangles.
1355 DenseMap<const MachineBasicBlock *, TriangleChain> TriangleChainMap;
1356 for (MachineBasicBlock &BB : *F) {
1357 // If BB doesn't have 2 successors, it doesn't start a triangle.
1358 if (BB.succ_size() != 2)
1359 continue;
1360 MachineBasicBlock *PDom = nullptr;
1361 for (MachineBasicBlock *Succ : BB.successors()) {
1362 if (!MPDT->dominates(Succ, &BB))
1363 continue;
1364 PDom = Succ;
1365 break;
1366 }
1367 // If BB doesn't have a post-dominating successor, it doesn't form a
1368 // triangle.
1369 if (PDom == nullptr)
1370 continue;
1371 // If PDom has a hint that it is low probability, skip this triangle.
1372 if (MBPI->getEdgeProbability(&BB, PDom) < BranchProbability(50, 100))
1373 continue;
1374 // If PDom isn't eligible for duplication, this isn't the kind of triangle
1375 // we're looking for.
1376 if (!shouldTailDuplicate(PDom))
1377 continue;
1378 bool CanTailDuplicate = true;
1379 // If PDom can't tail-duplicate into it's non-BB predecessors, then this
1380 // isn't the kind of triangle we're looking for.
1381 for (MachineBasicBlock *Pred : PDom->predecessors()) {
1382 if (Pred == &BB)
1383 continue;
1384 if (!TailDup.canTailDuplicate(PDom, Pred)) {
1385 CanTailDuplicate = false;
1386 break;
1387 }
1388 }
1389 // If we can't tail-duplicate PDom to its predecessors, then skip this
1390 // triangle.
1391 if (!CanTailDuplicate)
1392 continue;
1393
1394 // Now we have an interesting triangle. Insert it if it's not part of an
1395 // existing chain.
1396 // Note: This cannot be replaced with a call insert() or emplace() because
1397 // the find key is BB, but the insert/emplace key is PDom.
1398 auto Found = TriangleChainMap.find(&BB);
1399 // If it is, remove the chain from the map, grow it, and put it back in the
1400 // map with the end as the new key.
1401 if (Found != TriangleChainMap.end()) {
1402 TriangleChain Chain = std::move(Found->second);
1403 TriangleChainMap.erase(Found);
1404 Chain.append(PDom);
1405 TriangleChainMap.insert(std::make_pair(Chain.getKey(), std::move(Chain)));
1406 } else {
1407 auto InsertResult = TriangleChainMap.try_emplace(PDom, &BB, PDom);
1408 assert(InsertResult.second && "Block seen twice.");
1409 (void)InsertResult;
1410 }
1411 }
1412
1413 // Iterating over a DenseMap is safe here, because the only thing in the body
1414 // of the loop is inserting into another DenseMap (ComputedEdges).
1415 // ComputedEdges is never iterated, so this doesn't lead to non-determinism.
1416 for (auto &ChainPair : TriangleChainMap) {
1417 TriangleChain &Chain = ChainPair.second;
1418 // Benchmarking has shown that due to branch correlation duplicating 2 or
1419 // more triangles is profitable, despite the calculations assuming
1420 // independence.
1421 if (Chain.count() < TriangleChainCount)
1422 continue;
1423 MachineBasicBlock *dst = Chain.Edges.back();
1424 Chain.Edges.pop_back();
1425 for (MachineBasicBlock *src : reverse(Chain.Edges)) {
1426 LLVM_DEBUG(dbgs() << "Marking edge: " << getBlockName(src) << "->"
1427 << getBlockName(dst)
1428 << " as pre-computed based on triangles.\n");
1429
1430 auto InsertResult = ComputedEdges.insert({src, {dst, true}});
1431 assert(InsertResult.second && "Block seen twice.");
1432 (void)InsertResult;
1433
1434 dst = src;
1435 }
1436 }
1437}
1438
1439// When profile is not present, return the StaticLikelyProb.
1440// When profile is available, we need to handle the triangle-shape CFG.
1441static BranchProbability
1443 if (!BB->getParent()->getFunction().hasProfileData())
1445 if (BB->succ_size() == 2) {
1446 const MachineBasicBlock *Succ1 = *BB->succ_begin();
1447 const MachineBasicBlock *Succ2 = *(BB->succ_begin() + 1);
1448 if (Succ1->isSuccessor(Succ2) || Succ2->isSuccessor(Succ1)) {
1449 /* See case 1 below for the cost analysis. For BB->Succ to
1450 * be taken with smaller cost, the following needs to hold:
1451 * Prob(BB->Succ) > 2 * Prob(BB->Pred)
1452 * So the threshold T in the calculation below
1453 * (1-T) * Prob(BB->Succ) > T * Prob(BB->Pred)
1454 * So T / (1 - T) = 2, Yielding T = 2/3
1455 *
1456 * Then remap the user-controlled ProfileLikelyProb into
1457 * a triangle-specific threshold T.
1458 * T = (2/3) * (ProfileLikelyProb / 50)
1459 * = (2 * ProfileLikelyProb) / 150
1460 * This preserves T = 2/3 at ProfileLikelyProb = 50.
1461 * The result is capped at 1.
1462 */
1463 return BranchProbability(ProfileLikelyProb, 150) * 2;
1464 }
1465 }
1467}
1468
1469/// Checks to see if the layout candidate block \p Succ has a better layout
1470/// predecessor than \c BB. If yes, returns true.
1471/// \p SuccProb: The probability adjusted for only remaining blocks.
1472/// Only used for logging
1473/// \p RealSuccProb: The un-adjusted probability.
1474/// \p Chain: The chain that BB belongs to and Succ is being considered for.
1475/// \p BlockFilter: if non-null, the set of blocks that make up the loop being
1476/// considered
1477bool MachineBlockPlacement::hasBetterLayoutPredecessor(
1478 const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
1479 const BlockChain &SuccChain, BranchProbability SuccProb,
1480 BranchProbability RealSuccProb, const BlockChain &Chain,
1481 const BlockFilterSet *BlockFilter) {
1482
1483 // There isn't a better layout when there are no unscheduled predecessors.
1484 if (SuccChain.UnscheduledPredecessors == 0)
1485 return false;
1486
1487 // Compile-time optimization: runtime is quadratic in the number of
1488 // predecessors. For such uncommon cases, exit early.
1489 if (Succ->pred_size() > PredecessorLimit)
1490 return false;
1491
1492 // There are two basic scenarios here:
1493 // -------------------------------------
1494 // Case 1: triangular shape CFG (if-then):
1495 // BB
1496 // | \
1497 // | \
1498 // | Pred
1499 // | /
1500 // Succ
1501 // In this case, we are evaluating whether to select edge -> Succ, e.g.
1502 // set Succ as the layout successor of BB. Picking Succ as BB's
1503 // successor breaks the CFG constraints (FIXME: define these constraints).
1504 // With this layout, Pred BB
1505 // is forced to be outlined, so the overall cost will be cost of the
1506 // branch taken from BB to Pred, plus the cost of back taken branch
1507 // from Pred to Succ, as well as the additional cost associated
1508 // with the needed unconditional jump instruction from Pred To Succ.
1509
1510 // The cost of the topological order layout is the taken branch cost
1511 // from BB to Succ, so to make BB->Succ a viable candidate, the following
1512 // must hold:
1513 // 2 * freq(BB->Pred) * taken_branch_cost + unconditional_jump_cost
1514 // < freq(BB->Succ) * taken_branch_cost.
1515 // Ignoring unconditional jump cost, we get
1516 // freq(BB->Succ) > 2 * freq(BB->Pred), i.e.,
1517 // prob(BB->Succ) > 2 * prob(BB->Pred)
1518 //
1519 // When real profile data is available, we can precisely compute the
1520 // probability threshold that is needed for edge BB->Succ to be considered.
1521 // Without profile data, the heuristic requires the branch bias to be
1522 // a lot larger to make sure the signal is very strong (e.g. 80% default).
1523 // -----------------------------------------------------------------
1524 // Case 2: diamond like CFG (if-then-else):
1525 // S
1526 // / \
1527 // | \
1528 // BB Pred
1529 // \ /
1530 // Succ
1531 // ..
1532 //
1533 // The current block is BB and edge BB->Succ is now being evaluated.
1534 // Note that edge S->BB was previously already selected because
1535 // prob(S->BB) > prob(S->Pred).
1536 // At this point, 2 blocks can be placed after BB: Pred or Succ. If we
1537 // choose Pred, we will have a topological ordering as shown on the left
1538 // in the picture below. If we choose Succ, we have the solution as shown
1539 // on the right:
1540 //
1541 // topo-order:
1542 //
1543 // S----- ---S
1544 // | | | |
1545 // ---BB | | BB
1546 // | | | |
1547 // | Pred-- | Succ--
1548 // | | | |
1549 // ---Succ ---Pred--
1550 //
1551 // cost = freq(S->Pred) + freq(BB->Succ) cost = 2 * freq (S->Pred)
1552 // = freq(S->Pred) + freq(S->BB)
1553 //
1554 // If we have profile data (i.e, branch probabilities can be trusted), the
1555 // cost (number of taken branches) with layout S->BB->Succ->Pred is 2 *
1556 // freq(S->Pred) while the cost of topo order is freq(S->Pred) + freq(S->BB).
1557 // We know Prob(S->BB) > Prob(S->Pred), so freq(S->BB) > freq(S->Pred), which
1558 // means the cost of topological order is greater.
1559 // When profile data is not available, however, we need to be more
1560 // conservative. If the branch prediction is wrong, breaking the topo-order
1561 // will actually yield a layout with large cost. For this reason, we need
1562 // strong biased branch at block S with Prob(S->BB) in order to select
1563 // BB->Succ. This is equivalent to looking the CFG backward with backward
1564 // edge: Prob(Succ->BB) needs to >= HotProb in order to be selected (without
1565 // profile data).
1566 // --------------------------------------------------------------------------
1567 // Case 3: forked diamond
1568 // S
1569 // / \
1570 // / \
1571 // BB Pred
1572 // | \ / |
1573 // | \ / |
1574 // | X |
1575 // | / \ |
1576 // | / \ |
1577 // S1 S2
1578 //
1579 // The current block is BB and edge BB->S1 is now being evaluated.
1580 // As above S->BB was already selected because
1581 // prob(S->BB) > prob(S->Pred). Assume that prob(BB->S1) >= prob(BB->S2).
1582 //
1583 // topo-order:
1584 //
1585 // S-------| ---S
1586 // | | | |
1587 // ---BB | | BB
1588 // | | | |
1589 // | Pred----| | S1----
1590 // | | | |
1591 // --(S1 or S2) ---Pred--
1592 // |
1593 // S2
1594 //
1595 // topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2)
1596 // + min(freq(Pred->S1), freq(Pred->S2))
1597 // Non-topo-order cost:
1598 // non-topo-cost = 2 * freq(S->Pred) + freq(BB->S2).
1599 // To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2))
1600 // is 0. Then the non topo layout is better when
1601 // freq(S->Pred) < freq(BB->S1).
1602 // This is exactly what is checked below.
1603 // Note there are other shapes that apply (Pred may not be a single block,
1604 // but they all fit this general pattern.)
1605 BranchProbability HotProb = getLayoutSuccessorProbThreshold(BB);
1606
1607 // Make sure that a hot successor doesn't have a globally more
1608 // important predecessor.
1609 BlockFrequency CandidateEdgeFreq = MBFI->getBlockFreq(BB) * RealSuccProb;
1610 bool BadCFGConflict = false;
1611
1612 for (MachineBasicBlock *Pred : Succ->predecessors()) {
1613 BlockChain *PredChain = BlockToChain[Pred];
1614 if (Pred == Succ || PredChain == &SuccChain ||
1615 (BlockFilter && !BlockFilter->count(Pred)) || PredChain == &Chain ||
1616 Pred != *std::prev(PredChain->end()) ||
1617 // This check is redundant except for look ahead. This function is
1618 // called for lookahead by isProfitableToTailDup when BB hasn't been
1619 // placed yet.
1620 (Pred == BB))
1621 continue;
1622 // Do backward checking.
1623 // For all cases above, we need a backward checking to filter out edges that
1624 // are not 'strongly' biased.
1625 // BB Pred
1626 // \ /
1627 // Succ
1628 // We select edge BB->Succ if
1629 // freq(BB->Succ) > freq(Succ) * HotProb
1630 // i.e. freq(BB->Succ) > freq(BB->Succ) * HotProb + freq(Pred->Succ) *
1631 // HotProb
1632 // i.e. freq((BB->Succ) * (1 - HotProb) > freq(Pred->Succ) * HotProb
1633 // Case 1 is covered too, because the first equation reduces to:
1634 // prob(BB->Succ) > HotProb. (freq(Succ) = freq(BB) for a triangle)
1635 BlockFrequency PredEdgeFreq =
1636 MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Succ);
1637 if (PredEdgeFreq * HotProb >= CandidateEdgeFreq * HotProb.getCompl()) {
1638 BadCFGConflict = true;
1639 break;
1640 }
1641 }
1642
1643 if (BadCFGConflict) {
1644 LLVM_DEBUG(dbgs() << " Not a candidate: " << getBlockName(Succ) << " -> "
1645 << SuccProb << " (prob) (non-cold CFG conflict)\n");
1646 return true;
1647 }
1648
1649 return false;
1650}
1651
1652/// Select the best successor for a block.
1653///
1654/// This looks across all successors of a particular block and attempts to
1655/// select the "best" one to be the layout successor. It only considers direct
1656/// successors which also pass the block filter. It will attempt to avoid
1657/// breaking CFG structure, but cave and break such structures in the case of
1658/// very hot successor edges.
1659///
1660/// \returns The best successor block found, or null if none are viable, along
1661/// with a boolean indicating if tail duplication is necessary.
1662MachineBlockPlacement::BlockAndTailDupResult
1663MachineBlockPlacement::selectBestSuccessor(const MachineBasicBlock *BB,
1664 const BlockChain &Chain,
1665 const BlockFilterSet *BlockFilter) {
1666 const BranchProbability HotProb(StaticLikelyProb, 100);
1667
1668 BlockAndTailDupResult BestSucc = {nullptr, false};
1669 auto BestProb = BranchProbability::getZero();
1670
1671 SmallVector<MachineBasicBlock *, 4> Successors;
1672 auto AdjustedSumProb =
1673 collectViableSuccessors(BB, Chain, BlockFilter, Successors);
1674
1675 LLVM_DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB)
1676 << "\n");
1677
1678 // if we already precomputed the best successor for BB, return that if still
1679 // applicable.
1680 auto FoundEdge = ComputedEdges.find(BB);
1681 if (FoundEdge != ComputedEdges.end()) {
1682 MachineBasicBlock *Succ = FoundEdge->second.BB;
1683 ComputedEdges.erase(FoundEdge);
1684 BlockChain *SuccChain = BlockToChain[Succ];
1685 if (BB->isSuccessor(Succ) && (!BlockFilter || BlockFilter->count(Succ)) &&
1686 SuccChain != &Chain && Succ == *SuccChain->begin())
1687 return FoundEdge->second;
1688 }
1689
1690 // if BB is part of a trellis, Use the trellis to determine the optimal
1691 // fallthrough edges
1692 if (isTrellis(BB, Successors, Chain, BlockFilter))
1693 return getBestTrellisSuccessor(BB, Successors, AdjustedSumProb, Chain,
1694 BlockFilter);
1695
1696 // For blocks with CFG violations, we may be able to lay them out anyway with
1697 // tail-duplication. We keep this vector so we can perform the probability
1698 // calculations the minimum number of times.
1700 DupCandidates;
1701 for (MachineBasicBlock *Succ : Successors) {
1702 auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);
1703 BranchProbability SuccProb =
1704 getAdjustedProbability(RealSuccProb, AdjustedSumProb);
1705
1706 BlockChain &SuccChain = *BlockToChain[Succ];
1707 // Skip the edge \c BB->Succ if block \c Succ has a better layout
1708 // predecessor that yields lower global cost.
1709 if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb,
1710 Chain, BlockFilter)) {
1711 // If tail duplication would make Succ profitable, place it.
1712 if (allowTailDupPlacement(*F) && shouldTailDuplicate(Succ))
1713 DupCandidates.emplace_back(SuccProb, Succ);
1714 continue;
1715 }
1716
1717 LLVM_DEBUG(
1718 dbgs() << " Candidate: " << getBlockName(Succ)
1719 << ", probability: " << SuccProb
1720 << (SuccChain.UnscheduledPredecessors != 0 ? " (CFG break)" : "")
1721 << "\n");
1722
1723 if (BestSucc.BB && BestProb >= SuccProb) {
1724 LLVM_DEBUG(dbgs() << " Not the best candidate, continuing\n");
1725 continue;
1726 }
1727
1728 LLVM_DEBUG(dbgs() << " Setting it as best candidate\n");
1729 BestSucc.BB = Succ;
1730 BestProb = SuccProb;
1731 }
1732 // Handle the tail duplication candidates in order of decreasing probability.
1733 // Stop at the first one that is profitable. Also stop if they are less
1734 // profitable than BestSucc. Position is important because we preserve it and
1735 // prefer first best match. Here we aren't comparing in order, so we capture
1736 // the position instead.
1737 llvm::stable_sort(DupCandidates,
1738 [](std::tuple<BranchProbability, MachineBasicBlock *> L,
1739 std::tuple<BranchProbability, MachineBasicBlock *> R) {
1740 return std::get<0>(L) > std::get<0>(R);
1741 });
1742 for (auto &Tup : DupCandidates) {
1743 BranchProbability DupProb;
1744 MachineBasicBlock *Succ;
1745 std::tie(DupProb, Succ) = Tup;
1746 if (DupProb < BestProb)
1747 break;
1748 if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter) &&
1749 (isProfitableToTailDup(BB, Succ, BestProb, Chain, BlockFilter))) {
1750 LLVM_DEBUG(dbgs() << " Candidate: " << getBlockName(Succ)
1751 << ", probability: " << DupProb
1752 << " (Tail Duplicate)\n");
1753 BestSucc.BB = Succ;
1754 BestSucc.ShouldTailDup = true;
1755 break;
1756 }
1757 }
1758
1759 if (BestSucc.BB)
1760 LLVM_DEBUG(dbgs() << " Selected: " << getBlockName(BestSucc.BB) << "\n");
1761
1762 return BestSucc;
1763}
1764
1765/// Select the best block from a worklist.
1766///
1767/// This looks through the provided worklist as a list of candidate basic
1768/// blocks and select the most profitable one to place. The definition of
1769/// profitable only really makes sense in the context of a loop. This returns
1770/// the most frequently visited block in the worklist, which in the case of
1771/// a loop, is the one most desirable to be physically close to the rest of the
1772/// loop body in order to improve i-cache behavior.
1773///
1774/// \returns The best block found, or null if none are viable.
1775MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
1776 const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) {
1777 // Once we need to walk the worklist looking for a candidate, cleanup the
1778 // worklist of already placed entries.
1779 // FIXME: If this shows up on profiles, it could be folded (at the cost of
1780 // some code complexity) into the loop below.
1781 llvm::erase_if(WorkList, [&](MachineBasicBlock *BB) {
1782 return BlockToChain.lookup(BB) == &Chain;
1783 });
1784
1785 if (WorkList.empty())
1786 return nullptr;
1787
1788 bool IsEHPad = WorkList[0]->isEHPad();
1789
1790 MachineBasicBlock *BestBlock = nullptr;
1791 BlockFrequency BestFreq;
1792 for (MachineBasicBlock *MBB : WorkList) {
1793 assert(MBB->isEHPad() == IsEHPad &&
1794 "EHPad mismatch between block and work list.");
1795
1796 BlockChain &SuccChain = *BlockToChain[MBB];
1797 if (&SuccChain == &Chain)
1798 continue;
1799
1800 assert(SuccChain.UnscheduledPredecessors == 0 &&
1801 "Found CFG-violating block");
1802
1803 BlockFrequency CandidateFreq = MBFI->getBlockFreq(MBB);
1804 LLVM_DEBUG(dbgs() << " " << getBlockName(MBB) << " -> "
1805 << printBlockFreq(MBFI->getMBFI(), CandidateFreq)
1806 << " (freq)\n");
1807
1808 // For ehpad, we layout the least probable first as to avoid jumping back
1809 // from least probable landingpads to more probable ones.
1810 //
1811 // FIXME: Using probability is probably (!) not the best way to achieve
1812 // this. We should probably have a more principled approach to layout
1813 // cleanup code.
1814 //
1815 // The goal is to get:
1816 //
1817 // +--------------------------+
1818 // | V
1819 // InnerLp -> InnerCleanup OuterLp -> OuterCleanup -> Resume
1820 //
1821 // Rather than:
1822 //
1823 // +-------------------------------------+
1824 // V |
1825 // OuterLp -> OuterCleanup -> Resume InnerLp -> InnerCleanup
1826 if (BestBlock && (IsEHPad ^ (BestFreq >= CandidateFreq)))
1827 continue;
1828
1829 BestBlock = MBB;
1830 BestFreq = CandidateFreq;
1831 }
1832
1833 return BestBlock;
1834}
1835
1836/// Retrieve the first unplaced basic block in the entire function.
1837///
1838/// This routine is called when we are unable to use the CFG to walk through
1839/// all of the basic blocks and form a chain due to unnatural loops in the CFG.
1840/// We walk through the function's blocks in order, starting from the
1841/// LastUnplacedBlockIt. We update this iterator on each call to avoid
1842/// re-scanning the entire sequence on repeated calls to this routine.
1843MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
1844 const BlockChain &PlacedChain,
1845 MachineFunction::iterator &PrevUnplacedBlockIt) {
1846
1847 for (MachineFunction::iterator I = PrevUnplacedBlockIt, E = F->end(); I != E;
1848 ++I) {
1849 if (BlockChain *Chain = BlockToChain[&*I]; Chain != &PlacedChain) {
1850 PrevUnplacedBlockIt = I;
1851 // Now select the head of the chain to which the unplaced block belongs
1852 // as the block to place. This will force the entire chain to be placed,
1853 // and satisfies the requirements of merging chains.
1854 return *Chain->begin();
1855 }
1856 }
1857 return nullptr;
1858}
1859
1860/// Retrieve the first unplaced basic block among the blocks in BlockFilter.
1861///
1862/// This is similar to getFirstUnplacedBlock for the entire function, but since
1863/// the size of BlockFilter is typically far less than the number of blocks in
1864/// the entire function, iterating through the BlockFilter is more efficient.
1865/// When processing the entire funciton, using the version without BlockFilter
1866/// has a complexity of #(loops in function) * #(blocks in function), while this
1867/// version has a complexity of sum(#(loops in block) foreach block in function)
1868/// which is always smaller. For long function mostly sequential in structure,
1869/// the complexity is amortized to 1 * #(blocks in function).
1870MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
1871 const BlockChain &PlacedChain,
1872 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt,
1873 const BlockFilterSet *BlockFilter) {
1874 assert(BlockFilter);
1875 for (; PrevUnplacedBlockInFilterIt != BlockFilter->end();
1876 ++PrevUnplacedBlockInFilterIt) {
1877 BlockChain *C = BlockToChain[*PrevUnplacedBlockInFilterIt];
1878 if (C != &PlacedChain) {
1879 return *C->begin();
1880 }
1881 }
1882 return nullptr;
1883}
1884
1885void MachineBlockPlacement::fillWorkLists(
1886 const MachineBasicBlock *MBB, SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
1887 const BlockFilterSet *BlockFilter = nullptr) {
1888 BlockChain &Chain = *BlockToChain[MBB];
1889 if (!UpdatedPreds.insert(&Chain).second)
1890 return;
1891
1892 assert(
1893 Chain.UnscheduledPredecessors == 0 &&
1894 "Attempting to place block with unscheduled predecessors in worklist.");
1895 for (MachineBasicBlock *ChainBB : Chain) {
1896 assert(BlockToChain[ChainBB] == &Chain &&
1897 "Block in chain doesn't match BlockToChain map.");
1898 for (MachineBasicBlock *Pred : ChainBB->predecessors()) {
1899 if (BlockFilter && !BlockFilter->count(Pred))
1900 continue;
1901 if (BlockToChain[Pred] == &Chain)
1902 continue;
1903 ++Chain.UnscheduledPredecessors;
1904 }
1905 }
1906
1907 if (Chain.UnscheduledPredecessors != 0)
1908 return;
1909
1910 MachineBasicBlock *BB = *Chain.begin();
1911 if (BB->isEHPad())
1912 EHPadWorkList.push_back(BB);
1913 else
1914 BlockWorkList.push_back(BB);
1915}
1916
1917void MachineBlockPlacement::buildChain(const MachineBasicBlock *HeadBB,
1918 BlockChain &Chain,
1919 BlockFilterSet *BlockFilter) {
1920 assert(HeadBB && "BB must not be null.\n");
1921 assert(BlockToChain[HeadBB] == &Chain && "BlockToChainMap mis-match.\n");
1922 MachineFunction::iterator PrevUnplacedBlockIt = F->begin();
1923 BlockFilterSet::iterator PrevUnplacedBlockInFilterIt;
1924 if (BlockFilter)
1925 PrevUnplacedBlockInFilterIt = BlockFilter->begin();
1926
1927 const MachineBasicBlock *LoopHeaderBB = HeadBB;
1928 markChainSuccessors(Chain, LoopHeaderBB, BlockFilter);
1929 MachineBasicBlock *BB = *std::prev(Chain.end());
1930 while (true) {
1931 assert(BB && "null block found at end of chain in loop.");
1932 assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match in loop.");
1933 assert(*std::prev(Chain.end()) == BB && "BB Not found at end of chain.");
1934
1935 // Look for the best viable successor if there is one to place immediately
1936 // after this block.
1937 auto Result = selectBestSuccessor(BB, Chain, BlockFilter);
1938 MachineBasicBlock *BestSucc = Result.BB;
1939 bool ShouldTailDup = Result.ShouldTailDup;
1940 if (allowTailDupPlacement(*F))
1941 ShouldTailDup |= (BestSucc && canTailDuplicateUnplacedPreds(
1942 BB, BestSucc, Chain, BlockFilter));
1943
1944 // If an immediate successor isn't available, look for the best viable
1945 // block among those we've identified as not violating the loop's CFG at
1946 // this point. This won't be a fallthrough, but it will increase locality.
1947 if (!BestSucc)
1948 BestSucc = selectBestCandidateBlock(Chain, BlockWorkList);
1949 if (!BestSucc)
1950 BestSucc = selectBestCandidateBlock(Chain, EHPadWorkList);
1951
1952 if (!BestSucc) {
1953 if (BlockFilter)
1954 BestSucc = getFirstUnplacedBlock(Chain, PrevUnplacedBlockInFilterIt,
1955 BlockFilter);
1956 else
1957 BestSucc = getFirstUnplacedBlock(Chain, PrevUnplacedBlockIt);
1958 if (!BestSucc)
1959 break;
1960
1961 LLVM_DEBUG(dbgs() << "Unnatural loop CFG detected, forcibly merging the "
1962 "layout successor until the CFG reduces\n");
1963 }
1964
1965 // Placement may have changed tail duplication opportunities.
1966 // Check for that now.
1967 if (allowTailDupPlacement(*F) && BestSucc && ShouldTailDup) {
1968 repeatedlyTailDuplicateBlock(BestSucc, BB, LoopHeaderBB, Chain,
1969 BlockFilter, PrevUnplacedBlockIt,
1970 PrevUnplacedBlockInFilterIt);
1971 // If the chosen successor was duplicated into BB, don't bother laying
1972 // it out, just go round the loop again with BB as the chain end.
1973 if (!BB->isSuccessor(BestSucc))
1974 continue;
1975 }
1976
1977 // Place this block, updating the datastructures to reflect its placement.
1978 BlockChain &SuccChain = *BlockToChain[BestSucc];
1979 // Zero out UnscheduledPredecessors for the successor we're about to merge
1980 // in case we selected a successor that didn't fit naturally into the CFG.
1981 SuccChain.UnscheduledPredecessors = 0;
1982 LLVM_DEBUG(dbgs() << "Merging from " << getBlockName(BB) << " to "
1983 << getBlockName(BestSucc) << "\n");
1984 markChainSuccessors(SuccChain, LoopHeaderBB, BlockFilter);
1985 Chain.merge(BestSucc, &SuccChain);
1986 BB = *std::prev(Chain.end());
1987 }
1988
1989 LLVM_DEBUG(dbgs() << "Finished forming chain for header block "
1990 << getBlockName(*Chain.begin()) << "\n");
1991}
1992
1993// If bottom of block BB has only one successor OldTop, in most cases it is
1994// profitable to move it before OldTop, except the following case:
1995//
1996// -->OldTop<-
1997// | . |
1998// | . |
1999// | . |
2000// ---Pred |
2001// | |
2002// BB-----
2003//
2004// If BB is moved before OldTop, Pred needs a taken branch to BB, and it can't
2005// layout the other successor below it, so it can't reduce taken branch.
2006// In this case we keep its original layout.
2007bool MachineBlockPlacement::canMoveBottomBlockToTop(
2008 const MachineBasicBlock *BottomBlock, const MachineBasicBlock *OldTop) {
2009 if (BottomBlock->pred_size() != 1)
2010 return true;
2011 MachineBasicBlock *Pred = *BottomBlock->pred_begin();
2012 if (Pred->succ_size() != 2)
2013 return true;
2014
2015 MachineBasicBlock *OtherBB = *Pred->succ_begin();
2016 if (OtherBB == BottomBlock)
2017 OtherBB = *Pred->succ_rbegin();
2018 if (OtherBB == OldTop)
2019 return false;
2020
2021 return true;
2022}
2023
2024// Find out the possible fall through frequence to the top of a loop.
2025BlockFrequency
2026MachineBlockPlacement::TopFallThroughFreq(const MachineBasicBlock *Top,
2027 const BlockFilterSet &LoopBlockSet) {
2028 BlockFrequency MaxFreq = BlockFrequency(0);
2029 for (MachineBasicBlock *Pred : Top->predecessors()) {
2030 BlockChain *PredChain = BlockToChain[Pred];
2031 if (!LoopBlockSet.count(Pred) &&
2032 (!PredChain || Pred == *std::prev(PredChain->end()))) {
2033 // Found a Pred block can be placed before Top.
2034 // Check if Top is the best successor of Pred.
2035 auto TopProb = MBPI->getEdgeProbability(Pred, Top);
2036 bool TopOK = true;
2037 for (MachineBasicBlock *Succ : Pred->successors()) {
2038 auto SuccProb = MBPI->getEdgeProbability(Pred, Succ);
2039 BlockChain *SuccChain = BlockToChain[Succ];
2040 // Check if Succ can be placed after Pred.
2041 // Succ should not be in any chain, or it is the head of some chain.
2042 if (!LoopBlockSet.count(Succ) && (SuccProb > TopProb) &&
2043 (!SuccChain || Succ == *SuccChain->begin())) {
2044 TopOK = false;
2045 break;
2046 }
2047 }
2048 if (TopOK) {
2049 BlockFrequency EdgeFreq =
2050 MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Top);
2051 if (EdgeFreq > MaxFreq)
2052 MaxFreq = EdgeFreq;
2053 }
2054 }
2055 }
2056 return MaxFreq;
2057}
2058
2059// Compute the fall through gains when move NewTop before OldTop.
2060//
2061// In following diagram, edges marked as "-" are reduced fallthrough, edges
2062// marked as "+" are increased fallthrough, this function computes
2063//
2064// SUM(increased fallthrough) - SUM(decreased fallthrough)
2065//
2066// |
2067// | -
2068// V
2069// --->OldTop
2070// | .
2071// | .
2072// +| . +
2073// | Pred --->
2074// | |-
2075// | V
2076// --- NewTop <---
2077// |-
2078// V
2079//
2080BlockFrequency MachineBlockPlacement::FallThroughGains(
2081 const MachineBasicBlock *NewTop, const MachineBasicBlock *OldTop,
2082 const MachineBasicBlock *ExitBB, const BlockFilterSet &LoopBlockSet) {
2083 BlockFrequency FallThrough2Top = TopFallThroughFreq(OldTop, LoopBlockSet);
2084 BlockFrequency FallThrough2Exit = BlockFrequency(0);
2085 if (ExitBB)
2086 FallThrough2Exit =
2087 MBFI->getBlockFreq(NewTop) * MBPI->getEdgeProbability(NewTop, ExitBB);
2088 BlockFrequency BackEdgeFreq =
2089 MBFI->getBlockFreq(NewTop) * MBPI->getEdgeProbability(NewTop, OldTop);
2090
2091 // Find the best Pred of NewTop.
2092 MachineBasicBlock *BestPred = nullptr;
2093 BlockFrequency FallThroughFromPred = BlockFrequency(0);
2094 for (MachineBasicBlock *Pred : NewTop->predecessors()) {
2095 if (!LoopBlockSet.count(Pred))
2096 continue;
2097 BlockChain *PredChain = BlockToChain[Pred];
2098 if (!PredChain || Pred == *std::prev(PredChain->end())) {
2099 BlockFrequency EdgeFreq =
2100 MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, NewTop);
2101 if (EdgeFreq > FallThroughFromPred) {
2102 FallThroughFromPred = EdgeFreq;
2103 BestPred = Pred;
2104 }
2105 }
2106 }
2107
2108 // If NewTop is not placed after Pred, another successor can be placed
2109 // after Pred.
2110 BlockFrequency NewFreq = BlockFrequency(0);
2111 if (BestPred) {
2112 for (MachineBasicBlock *Succ : BestPred->successors()) {
2113 if ((Succ == NewTop) || (Succ == BestPred) || !LoopBlockSet.count(Succ))
2114 continue;
2115 if (ComputedEdges.contains(Succ))
2116 continue;
2117 BlockChain *SuccChain = BlockToChain[Succ];
2118 if ((SuccChain && (Succ != *SuccChain->begin())) ||
2119 (SuccChain == BlockToChain[BestPred]))
2120 continue;
2121 BlockFrequency EdgeFreq = MBFI->getBlockFreq(BestPred) *
2122 MBPI->getEdgeProbability(BestPred, Succ);
2123 if (EdgeFreq > NewFreq)
2124 NewFreq = EdgeFreq;
2125 }
2126 BlockFrequency OrigEdgeFreq = MBFI->getBlockFreq(BestPred) *
2127 MBPI->getEdgeProbability(BestPred, NewTop);
2128 if (NewFreq > OrigEdgeFreq) {
2129 // If NewTop is not the best successor of Pred, then Pred doesn't
2130 // fallthrough to NewTop. So there is no FallThroughFromPred and
2131 // NewFreq.
2132 NewFreq = BlockFrequency(0);
2133 FallThroughFromPred = BlockFrequency(0);
2134 }
2135 }
2136
2137 BlockFrequency Result = BlockFrequency(0);
2138 BlockFrequency Gains = BackEdgeFreq + NewFreq;
2139 BlockFrequency Lost =
2140 FallThrough2Top + FallThrough2Exit + FallThroughFromPred;
2141 if (Gains > Lost)
2142 Result = Gains - Lost;
2143 return Result;
2144}
2145
2146/// Helper function of findBestLoopTop. Find the best loop top block
2147/// from predecessors of old top.
2148///
2149/// Look for a block which is strictly better than the old top for laying
2150/// out before the old top of the loop. This looks for only two patterns:
2151///
2152/// 1. a block has only one successor, the old loop top
2153///
2154/// Because such a block will always result in an unconditional jump,
2155/// rotating it in front of the old top is always profitable.
2156///
2157/// 2. a block has two successors, one is old top, another is exit
2158/// and it has more than one predecessors
2159///
2160/// If it is below one of its predecessors P, only P can fall through to
2161/// it, all other predecessors need a jump to it, and another conditional
2162/// jump to loop header. If it is moved before loop header, all its
2163/// predecessors jump to it, then fall through to loop header. So all its
2164/// predecessors except P can reduce one taken branch.
2165/// At the same time, move it before old top increases the taken branch
2166/// to loop exit block, so the reduced taken branch will be compared with
2167/// the increased taken branch to the loop exit block.
2168MachineBasicBlock *MachineBlockPlacement::findBestLoopTopHelper(
2169 MachineBasicBlock *OldTop, const MachineLoop &L,
2170 const BlockFilterSet &LoopBlockSet) {
2171 // Check that the header hasn't been fused with a preheader block due to
2172 // crazy branches. If it has, we need to start with the header at the top to
2173 // prevent pulling the preheader into the loop body.
2174 BlockChain &HeaderChain = *BlockToChain[OldTop];
2175 if (!LoopBlockSet.count(*HeaderChain.begin()))
2176 return OldTop;
2177 if (OldTop != *HeaderChain.begin())
2178 return OldTop;
2179
2180 LLVM_DEBUG(dbgs() << "Finding best loop top for: " << getBlockName(OldTop)
2181 << "\n");
2182
2183 BlockFrequency BestGains = BlockFrequency(0);
2184 MachineBasicBlock *BestPred = nullptr;
2185 for (MachineBasicBlock *Pred : OldTop->predecessors()) {
2186 if (!LoopBlockSet.count(Pred))
2187 continue;
2188 if (Pred == L.getHeader())
2189 continue;
2190 LLVM_DEBUG(dbgs() << " old top pred: " << getBlockName(Pred) << ", has "
2191 << Pred->succ_size() << " successors, "
2192 << printBlockFreq(MBFI->getMBFI(), *Pred) << " freq\n");
2193 if (Pred->succ_size() > 2)
2194 continue;
2195
2196 MachineBasicBlock *OtherBB = nullptr;
2197 if (Pred->succ_size() == 2) {
2198 OtherBB = *Pred->succ_begin();
2199 if (OtherBB == OldTop)
2200 OtherBB = *Pred->succ_rbegin();
2201 }
2202
2203 if (!canMoveBottomBlockToTop(Pred, OldTop))
2204 continue;
2205
2206 BlockFrequency Gains =
2207 FallThroughGains(Pred, OldTop, OtherBB, LoopBlockSet);
2208 if ((Gains > BlockFrequency(0)) &&
2209 (Gains > BestGains ||
2210 ((Gains == BestGains) && Pred->isLayoutSuccessor(OldTop)))) {
2211 BestPred = Pred;
2212 BestGains = Gains;
2213 }
2214 }
2215
2216 // If no direct predecessor is fine, just use the loop header.
2217 if (!BestPred) {
2218 LLVM_DEBUG(dbgs() << " final top unchanged\n");
2219 return OldTop;
2220 }
2221
2222 // Walk backwards through any straight line of predecessors.
2223 while (BestPred->pred_size() == 1 &&
2224 (*BestPred->pred_begin())->succ_size() == 1 &&
2225 *BestPred->pred_begin() != L.getHeader())
2226 BestPred = *BestPred->pred_begin();
2227
2228 LLVM_DEBUG(dbgs() << " final top: " << getBlockName(BestPred) << "\n");
2229 return BestPred;
2230}
2231
2232/// Find the best loop top block for layout.
2233///
2234/// This function iteratively calls findBestLoopTopHelper, until no new better
2235/// BB can be found.
2236MachineBasicBlock *
2237MachineBlockPlacement::findBestLoopTop(const MachineLoop &L,
2238 const BlockFilterSet &LoopBlockSet) {
2239 // Placing the latch block before the header may introduce an extra branch
2240 // that skips this block the first time the loop is executed, which we want
2241 // to avoid when optimising for size.
2242 // FIXME: in theory there is a case that does not introduce a new branch,
2243 // i.e. when the layout predecessor does not fallthrough to the loop header.
2244 // In practice this never happens though: there always seems to be a preheader
2245 // that can fallthrough and that is also placed before the header.
2246 if (llvm::shouldOptimizeForSize(L.getHeader(), PSI, MBFI.get()))
2247 return L.getHeader();
2248
2249 MachineBasicBlock *OldTop = nullptr;
2250 MachineBasicBlock *NewTop = L.getHeader();
2251 while (NewTop != OldTop) {
2252 OldTop = NewTop;
2253 NewTop = findBestLoopTopHelper(OldTop, L, LoopBlockSet);
2254 if (NewTop != OldTop)
2255 ComputedEdges[NewTop] = {OldTop, false};
2256 }
2257 return NewTop;
2258}
2259
2260/// Find the best loop exiting block for layout.
2261///
2262/// This routine implements the logic to analyze the loop looking for the best
2263/// block to layout at the top of the loop. Typically this is done to maximize
2264/// fallthrough opportunities.
2265MachineBasicBlock *
2266MachineBlockPlacement::findBestLoopExit(const MachineLoop &L,
2267 const BlockFilterSet &LoopBlockSet,
2268 BlockFrequency &ExitFreq) {
2269 // We don't want to layout the loop linearly in all cases. If the loop header
2270 // is just a normal basic block in the loop, we want to look for what block
2271 // within the loop is the best one to layout at the top. However, if the loop
2272 // header has be pre-merged into a chain due to predecessors not having
2273 // analyzable branches, *and* the predecessor it is merged with is *not* part
2274 // of the loop, rotating the header into the middle of the loop will create
2275 // a non-contiguous range of blocks which is Very Bad. So start with the
2276 // header and only rotate if safe.
2277 BlockChain &HeaderChain = *BlockToChain[L.getHeader()];
2278 if (!LoopBlockSet.count(*HeaderChain.begin()))
2279 return nullptr;
2280
2281 BlockFrequency BestExitEdgeFreq;
2282 unsigned BestExitLoopDepth = 0;
2283 MachineBasicBlock *ExitingBB = nullptr;
2284 // If there are exits to outer loops, loop rotation can severely limit
2285 // fallthrough opportunities unless it selects such an exit. Keep a set of
2286 // blocks where rotating to exit with that block will reach an outer loop.
2287 SmallPtrSet<MachineBasicBlock *, 4> BlocksExitingToOuterLoop;
2288
2289 LLVM_DEBUG(dbgs() << "Finding best loop exit for: "
2290 << getBlockName(L.getHeader()) << "\n");
2291 for (MachineBasicBlock *MBB : L.getBlocks()) {
2292 BlockChain &Chain = *BlockToChain[MBB];
2293 // Ensure that this block is at the end of a chain; otherwise it could be
2294 // mid-way through an inner loop or a successor of an unanalyzable branch.
2295 if (MBB != *std::prev(Chain.end()))
2296 continue;
2297
2298 // Now walk the successors. We need to establish whether this has a viable
2299 // exiting successor and whether it has a viable non-exiting successor.
2300 // We store the old exiting state and restore it if a viable looping
2301 // successor isn't found.
2302 MachineBasicBlock *OldExitingBB = ExitingBB;
2303 BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq;
2304 bool HasLoopingSucc = false;
2305 for (MachineBasicBlock *Succ : MBB->successors()) {
2306 if (Succ->isEHPad())
2307 continue;
2308 if (Succ == MBB)
2309 continue;
2310 BlockChain &SuccChain = *BlockToChain[Succ];
2311 // Don't split chains, either this chain or the successor's chain.
2312 if (&Chain == &SuccChain) {
2313 LLVM_DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
2314 << getBlockName(Succ) << " (chain conflict)\n");
2315 continue;
2316 }
2317
2318 auto SuccProb = MBPI->getEdgeProbability(MBB, Succ);
2319 if (LoopBlockSet.count(Succ)) {
2320 LLVM_DEBUG(dbgs() << " looping: " << getBlockName(MBB) << " -> "
2321 << getBlockName(Succ) << " (" << SuccProb << ")\n");
2322 HasLoopingSucc = true;
2323 continue;
2324 }
2325
2326 unsigned SuccLoopDepth = 0;
2327 if (MachineLoop *ExitLoop = MLI->getLoopFor(Succ)) {
2328 SuccLoopDepth = ExitLoop->getLoopDepth();
2329 if (ExitLoop->contains(&L))
2330 BlocksExitingToOuterLoop.insert(MBB);
2331 }
2332
2333 BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(MBB) * SuccProb;
2334 LLVM_DEBUG(
2335 dbgs() << " exiting: " << getBlockName(MBB) << " -> "
2336 << getBlockName(Succ) << " [L:" << SuccLoopDepth << "] ("
2337 << printBlockFreq(MBFI->getMBFI(), ExitEdgeFreq) << ")\n");
2338 // Note that we bias this toward an existing layout successor to retain
2339 // incoming order in the absence of better information. The exit must have
2340 // a frequency higher than the current exit before we consider breaking
2341 // the layout.
2342 BranchProbability Bias(100 - ExitBlockBias, 100);
2343 if (!ExitingBB || SuccLoopDepth > BestExitLoopDepth ||
2344 ExitEdgeFreq > BestExitEdgeFreq ||
2345 (MBB->isLayoutSuccessor(Succ) &&
2346 !(ExitEdgeFreq < BestExitEdgeFreq * Bias))) {
2347 BestExitEdgeFreq = ExitEdgeFreq;
2348 ExitingBB = MBB;
2349 }
2350 }
2351
2352 if (!HasLoopingSucc) {
2353 // Restore the old exiting state, no viable looping successor was found.
2354 ExitingBB = OldExitingBB;
2355 BestExitEdgeFreq = OldBestExitEdgeFreq;
2356 }
2357 }
2358 // Without a candidate exiting block or with only a single block in the
2359 // loop, just use the loop header to layout the loop.
2360 if (!ExitingBB) {
2361 LLVM_DEBUG(
2362 dbgs() << " No other candidate exit blocks, using loop header\n");
2363 return nullptr;
2364 }
2365 if (L.getNumBlocks() == 1) {
2366 LLVM_DEBUG(dbgs() << " Loop has 1 block, using loop header as exit\n");
2367 return nullptr;
2368 }
2369
2370 // Also, if we have exit blocks which lead to outer loops but didn't select
2371 // one of them as the exiting block we are rotating toward, disable loop
2372 // rotation altogether.
2373 if (!BlocksExitingToOuterLoop.empty() &&
2374 !BlocksExitingToOuterLoop.count(ExitingBB))
2375 return nullptr;
2376
2377 LLVM_DEBUG(dbgs() << " Best exiting block: " << getBlockName(ExitingBB)
2378 << "\n");
2379 ExitFreq = BestExitEdgeFreq;
2380 return ExitingBB;
2381}
2382
2383/// Check if there is a fallthrough to loop header Top.
2384///
2385/// 1. Look for a Pred that can be layout before Top.
2386/// 2. Check if Top is the most possible successor of Pred.
2387bool MachineBlockPlacement::hasViableTopFallthrough(
2388 const MachineBasicBlock *Top, const BlockFilterSet &LoopBlockSet) {
2389 for (MachineBasicBlock *Pred : Top->predecessors()) {
2390 BlockChain *PredChain = BlockToChain[Pred];
2391 if (!LoopBlockSet.count(Pred) &&
2392 (!PredChain || Pred == *std::prev(PredChain->end()))) {
2393 // Found a Pred block can be placed before Top.
2394 // Check if Top is the best successor of Pred.
2395 auto TopProb = MBPI->getEdgeProbability(Pred, Top);
2396 bool TopOK = true;
2397 for (MachineBasicBlock *Succ : Pred->successors()) {
2398 auto SuccProb = MBPI->getEdgeProbability(Pred, Succ);
2399 BlockChain *SuccChain = BlockToChain[Succ];
2400 // Check if Succ can be placed after Pred.
2401 // Succ should not be in any chain, or it is the head of some chain.
2402 if ((!SuccChain || Succ == *SuccChain->begin()) && SuccProb > TopProb) {
2403 TopOK = false;
2404 break;
2405 }
2406 }
2407 if (TopOK)
2408 return true;
2409 }
2410 }
2411 return false;
2412}
2413
2414/// Attempt to rotate an exiting block to the bottom of the loop.
2415///
2416/// Once we have built a chain, try to rotate it to line up the hot exit block
2417/// with fallthrough out of the loop if doing so doesn't introduce unnecessary
2418/// branches. For example, if the loop has fallthrough into its header and out
2419/// of its bottom already, don't rotate it.
2420void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
2421 const MachineBasicBlock *ExitingBB,
2422 BlockFrequency ExitFreq,
2423 const BlockFilterSet &LoopBlockSet) {
2424 if (!ExitingBB)
2425 return;
2426
2427 MachineBasicBlock *Top = *LoopChain.begin();
2428 MachineBasicBlock *Bottom = *std::prev(LoopChain.end());
2429
2430 // If ExitingBB is already the last one in a chain then nothing to do.
2431 if (Bottom == ExitingBB)
2432 return;
2433
2434 // The entry block should always be the first BB in a function.
2435 if (Top->isEntryBlock())
2436 return;
2437
2438 bool ViableTopFallthrough = hasViableTopFallthrough(Top, LoopBlockSet);
2439
2440 // If the header has viable fallthrough, check whether the current loop
2441 // bottom is a viable exiting block. If so, bail out as rotating will
2442 // introduce an unnecessary branch.
2443 if (ViableTopFallthrough) {
2444 for (MachineBasicBlock *Succ : Bottom->successors()) {
2445 BlockChain *SuccChain = BlockToChain[Succ];
2446 if (!LoopBlockSet.count(Succ) &&
2447 (!SuccChain || Succ == *SuccChain->begin()))
2448 return;
2449 }
2450
2451 // Rotate will destroy the top fallthrough, we need to ensure the new exit
2452 // frequency is larger than top fallthrough.
2453 BlockFrequency FallThrough2Top = TopFallThroughFreq(Top, LoopBlockSet);
2454 if (FallThrough2Top >= ExitFreq)
2455 return;
2456 }
2457
2458 BlockChain::iterator ExitIt = llvm::find(LoopChain, ExitingBB);
2459 if (ExitIt == LoopChain.end())
2460 return;
2461
2462 // Rotating a loop exit to the bottom when there is a fallthrough to top
2463 // trades the entry fallthrough for an exit fallthrough.
2464 // If there is no bottom->top edge, but the chosen exit block does have
2465 // a fallthrough, we break that fallthrough for nothing in return.
2466
2467 // Let's consider an example. We have a built chain of basic blocks
2468 // B1, B2, ..., Bn, where Bk is a ExitingBB - chosen exit block.
2469 // By doing a rotation we get
2470 // Bk+1, ..., Bn, B1, ..., Bk
2471 // Break of fallthrough to B1 is compensated by a fallthrough from Bk.
2472 // If we had a fallthrough Bk -> Bk+1 it is broken now.
2473 // It might be compensated by fallthrough Bn -> B1.
2474 // So we have a condition to avoid creation of extra branch by loop rotation.
2475 // All below must be true to avoid loop rotation:
2476 // If there is a fallthrough to top (B1)
2477 // There was fallthrough from chosen exit block (Bk) to next one (Bk+1)
2478 // There is no fallthrough from bottom (Bn) to top (B1).
2479 // Please note that there is no exit fallthrough from Bn because we checked it
2480 // above.
2481 if (ViableTopFallthrough) {
2482 assert(std::next(ExitIt) != LoopChain.end() &&
2483 "Exit should not be last BB");
2484 MachineBasicBlock *NextBlockInChain = *std::next(ExitIt);
2485 if (ExitingBB->isSuccessor(NextBlockInChain))
2486 if (!Bottom->isSuccessor(Top))
2487 return;
2488 }
2489
2490 LLVM_DEBUG(dbgs() << "Rotating loop to put exit " << getBlockName(ExitingBB)
2491 << " at bottom\n");
2492 std::rotate(LoopChain.begin(), std::next(ExitIt), LoopChain.end());
2493}
2494
2495/// Attempt to rotate a loop based on profile data to reduce branch cost.
2496///
2497/// With profile data, we can determine the cost in terms of missed fall through
2498/// opportunities when rotating a loop chain and select the best rotation.
2499/// Basically, there are three kinds of cost to consider for each rotation:
2500/// 1. The possibly missed fall through edge (if it exists) from BB out of
2501/// the loop to the loop header.
2502/// 2. The possibly missed fall through edges (if they exist) from the loop
2503/// exits to BB out of the loop.
2504/// 3. The missed fall through edge (if it exists) from the last BB to the
2505/// first BB in the loop chain.
2506/// Therefore, the cost for a given rotation is the sum of costs listed above.
2507/// We select the best rotation with the smallest cost.
2508void MachineBlockPlacement::rotateLoopWithProfile(
2509 BlockChain &LoopChain, const MachineLoop &L,
2510 const BlockFilterSet &LoopBlockSet) {
2511 auto RotationPos = LoopChain.end();
2512 MachineBasicBlock *ChainHeaderBB = *LoopChain.begin();
2513
2514 // The entry block should always be the first BB in a function.
2515 if (ChainHeaderBB->isEntryBlock())
2516 return;
2517
2518 BlockFrequency SmallestRotationCost = BlockFrequency::max();
2519
2520 // A utility lambda that scales up a block frequency by dividing it by a
2521 // branch probability which is the reciprocal of the scale.
2522 auto ScaleBlockFrequency = [](BlockFrequency Freq,
2523 unsigned Scale) -> BlockFrequency {
2524 if (Scale == 0)
2525 return BlockFrequency(0);
2526 // Use operator / between BlockFrequency and BranchProbability to implement
2527 // saturating multiplication.
2528 return Freq / BranchProbability(1, Scale);
2529 };
2530
2531 // Compute the cost of the missed fall-through edge to the loop header if the
2532 // chain head is not the loop header. As we only consider natural loops with
2533 // single header, this computation can be done only once.
2534 BlockFrequency HeaderFallThroughCost(0);
2535 for (auto *Pred : ChainHeaderBB->predecessors()) {
2536 BlockChain *PredChain = BlockToChain[Pred];
2537 if (!LoopBlockSet.count(Pred) &&
2538 (!PredChain || Pred == *std::prev(PredChain->end()))) {
2539 auto EdgeFreq = MBFI->getBlockFreq(Pred) *
2540 MBPI->getEdgeProbability(Pred, ChainHeaderBB);
2541 auto FallThruCost = ScaleBlockFrequency(EdgeFreq, MisfetchCost);
2542 // If the predecessor has only an unconditional jump to the header, we
2543 // need to consider the cost of this jump.
2544 if (Pred->succ_size() == 1)
2545 FallThruCost += ScaleBlockFrequency(EdgeFreq, JumpInstCost);
2546 HeaderFallThroughCost = std::max(HeaderFallThroughCost, FallThruCost);
2547 }
2548 }
2549
2550 // Here we collect all exit blocks in the loop, and for each exit we find out
2551 // its hottest exit edge. For each loop rotation, we define the loop exit cost
2552 // as the sum of frequencies of exit edges we collect here, excluding the exit
2553 // edge from the tail of the loop chain.
2555 for (auto *BB : LoopChain) {
2556 auto LargestExitEdgeProb = BranchProbability::getZero();
2557 for (auto *Succ : BB->successors()) {
2558 BlockChain *SuccChain = BlockToChain[Succ];
2559 if (!LoopBlockSet.count(Succ) &&
2560 (!SuccChain || Succ == *SuccChain->begin())) {
2561 auto SuccProb = MBPI->getEdgeProbability(BB, Succ);
2562 LargestExitEdgeProb = std::max(LargestExitEdgeProb, SuccProb);
2563 }
2564 }
2565 if (LargestExitEdgeProb > BranchProbability::getZero()) {
2566 auto ExitFreq = MBFI->getBlockFreq(BB) * LargestExitEdgeProb;
2567 ExitsWithFreq.emplace_back(BB, ExitFreq);
2568 }
2569 }
2570
2571 // In this loop we iterate every block in the loop chain and calculate the
2572 // cost assuming the block is the head of the loop chain. When the loop ends,
2573 // we should have found the best candidate as the loop chain's head.
2574 for (auto Iter = LoopChain.begin(), TailIter = std::prev(LoopChain.end()),
2575 EndIter = LoopChain.end();
2576 Iter != EndIter; Iter++, TailIter++) {
2577 // TailIter is used to track the tail of the loop chain if the block we are
2578 // checking (pointed by Iter) is the head of the chain.
2579 if (TailIter == LoopChain.end())
2580 TailIter = LoopChain.begin();
2581
2582 auto TailBB = *TailIter;
2583
2584 // Calculate the cost by putting this BB to the top.
2585 BlockFrequency Cost = BlockFrequency(0);
2586
2587 // If the current BB is the loop header, we need to take into account the
2588 // cost of the missed fall through edge from outside of the loop to the
2589 // header.
2590 if (Iter != LoopChain.begin())
2591 Cost += HeaderFallThroughCost;
2592
2593 // Collect the loop exit cost by summing up frequencies of all exit edges
2594 // except the one from the chain tail.
2595 for (auto &ExitWithFreq : ExitsWithFreq)
2596 if (TailBB != ExitWithFreq.first)
2597 Cost += ExitWithFreq.second;
2598
2599 // The cost of breaking the once fall-through edge from the tail to the top
2600 // of the loop chain. Here we need to consider three cases:
2601 // 1. If the tail node has only one successor, then we will get an
2602 // additional jmp instruction. So the cost here is (MisfetchCost +
2603 // JumpInstCost) * tail node frequency.
2604 // 2. If the tail node has two successors, then we may still get an
2605 // additional jmp instruction if the layout successor after the loop
2606 // chain is not its CFG successor. Note that the more frequently executed
2607 // jmp instruction will be put ahead of the other one. Assume the
2608 // frequency of those two branches are x and y, where x is the frequency
2609 // of the edge to the chain head, then the cost will be
2610 // (x * MisfetechCost + min(x, y) * JumpInstCost) * tail node frequency.
2611 // 3. If the tail node has more than two successors (this rarely happens),
2612 // we won't consider any additional cost.
2613 if (TailBB->isSuccessor(*Iter)) {
2614 auto TailBBFreq = MBFI->getBlockFreq(TailBB);
2615 if (TailBB->succ_size() == 1)
2616 Cost += ScaleBlockFrequency(TailBBFreq, MisfetchCost + JumpInstCost);
2617 else if (TailBB->succ_size() == 2) {
2618 auto TailToHeadProb = MBPI->getEdgeProbability(TailBB, *Iter);
2619 auto TailToHeadFreq = TailBBFreq * TailToHeadProb;
2620 auto ColderEdgeFreq = TailToHeadProb > BranchProbability(1, 2)
2621 ? TailBBFreq * TailToHeadProb.getCompl()
2622 : TailToHeadFreq;
2623 Cost += ScaleBlockFrequency(TailToHeadFreq, MisfetchCost) +
2624 ScaleBlockFrequency(ColderEdgeFreq, JumpInstCost);
2625 }
2626 }
2627
2628 LLVM_DEBUG(dbgs() << "The cost of loop rotation by making "
2629 << getBlockName(*Iter) << " to the top: "
2630 << printBlockFreq(MBFI->getMBFI(), Cost) << "\n");
2631
2632 if (Cost < SmallestRotationCost) {
2633 SmallestRotationCost = Cost;
2634 RotationPos = Iter;
2635 }
2636 }
2637
2638 if (RotationPos != LoopChain.end()) {
2639 LLVM_DEBUG(dbgs() << "Rotate loop by making " << getBlockName(*RotationPos)
2640 << " to the top\n");
2641 std::rotate(LoopChain.begin(), RotationPos, LoopChain.end());
2642 }
2643}
2644
2645/// Collect blocks in the given loop that are to be placed.
2646///
2647/// When profile data is available, exclude cold blocks from the returned set;
2648/// otherwise, collect all blocks in the loop.
2649MachineBlockPlacement::BlockFilterSet
2650MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) {
2651 // Collect the blocks in a set ordered by block number, as this gives the same
2652 // order as they appear in the function.
2653 struct MBBCompare {
2654 bool operator()(const MachineBasicBlock *X,
2655 const MachineBasicBlock *Y) const {
2656 return X->getNumber() < Y->getNumber();
2657 }
2658 };
2659 std::set<const MachineBasicBlock *, MBBCompare> LoopBlockSet;
2660
2661 // Filter cold blocks off from LoopBlockSet when profile data is available.
2662 // Collect the sum of frequencies of incoming edges to the loop header from
2663 // outside. If we treat the loop as a super block, this is the frequency of
2664 // the loop. Then for each block in the loop, we calculate the ratio between
2665 // its frequency and the frequency of the loop block. When it is too small,
2666 // don't add it to the loop chain. If there are outer loops, then this block
2667 // will be merged into the first outer loop chain for which this block is not
2668 // cold anymore. This needs precise profile data and we only do this when
2669 // profile data is available.
2670 if (F->getFunction().hasProfileData() || ForceLoopColdBlock) {
2671 BlockFrequency LoopFreq(0);
2672 for (auto *LoopPred : L.getHeader()->predecessors())
2673 if (!L.contains(LoopPred))
2674 LoopFreq += MBFI->getBlockFreq(LoopPred) *
2675 MBPI->getEdgeProbability(LoopPred, L.getHeader());
2676
2677 for (MachineBasicBlock *LoopBB : L.getBlocks()) {
2678 if (LoopBlockSet.count(LoopBB))
2679 continue;
2680 auto Freq = MBFI->getBlockFreq(LoopBB).getFrequency();
2681 if (Freq == 0 || LoopFreq.getFrequency() / Freq > LoopToColdBlockRatio)
2682 continue;
2683 BlockChain *Chain = BlockToChain[LoopBB];
2684 for (MachineBasicBlock *ChainBB : *Chain)
2685 LoopBlockSet.insert(ChainBB);
2686 }
2687 } else
2688 LoopBlockSet.insert(L.block_begin(), L.block_end());
2689
2690 // Copy the blocks into a BlockFilterSet, as iterating it is faster than
2691 // std::set. We will only remove blocks and never insert them, which will
2692 // preserve the ordering.
2693 BlockFilterSet Ret(LoopBlockSet.begin(), LoopBlockSet.end());
2694 return Ret;
2695}
2696
2697/// Forms basic block chains from the natural loop structures.
2698///
2699/// These chains are designed to preserve the existing *structure* of the code
2700/// as much as possible. We can then stitch the chains together in a way which
2701/// both preserves the topological structure and minimizes taken conditional
2702/// branches.
2703void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {
2704 // First recurse through any nested loops, building chains for those inner
2705 // loops.
2706 for (const MachineLoop *InnerLoop : L)
2707 buildLoopChains(*InnerLoop);
2708
2709 assert(BlockWorkList.empty() &&
2710 "BlockWorkList not empty when starting to build loop chains.");
2711 assert(EHPadWorkList.empty() &&
2712 "EHPadWorkList not empty when starting to build loop chains.");
2713 BlockFilterSet LoopBlockSet = collectLoopBlockSet(L);
2714
2715 // Check if we have profile data for this function. If yes, we will rotate
2716 // this loop by modeling costs more precisely which requires the profile data
2717 // for better layout.
2718 bool RotateLoopWithProfile =
2720 (PreciseRotationCost && F->getFunction().hasProfileData());
2721
2722 // First check to see if there is an obviously preferable top block for the
2723 // loop. This will default to the header, but may end up as one of the
2724 // predecessors to the header if there is one which will result in strictly
2725 // fewer branches in the loop body.
2726 MachineBasicBlock *LoopTop = findBestLoopTop(L, LoopBlockSet);
2727
2728 // If we selected just the header for the loop top, look for a potentially
2729 // profitable exit block in the event that rotating the loop can eliminate
2730 // branches by placing an exit edge at the bottom.
2731 //
2732 // Loops are processed innermost to uttermost, make sure we clear
2733 // PreferredLoopExit before processing a new loop.
2734 PreferredLoopExit = nullptr;
2735 BlockFrequency ExitFreq;
2736 if (!RotateLoopWithProfile && LoopTop == L.getHeader())
2737 PreferredLoopExit = findBestLoopExit(L, LoopBlockSet, ExitFreq);
2738
2739 BlockChain &LoopChain = *BlockToChain[LoopTop];
2740
2741 // FIXME: This is a really lame way of walking the chains in the loop: we
2742 // walk the blocks, and use a set to prevent visiting a particular chain
2743 // twice.
2744 SmallPtrSet<BlockChain *, 4> UpdatedPreds;
2745 assert(LoopChain.UnscheduledPredecessors == 0 &&
2746 "LoopChain should not have unscheduled predecessors.");
2747 UpdatedPreds.insert(&LoopChain);
2748
2749 for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2750 fillWorkLists(LoopBB, UpdatedPreds, &LoopBlockSet);
2751
2752 buildChain(LoopTop, LoopChain, &LoopBlockSet);
2753
2754 if (RotateLoopWithProfile)
2755 rotateLoopWithProfile(LoopChain, L, LoopBlockSet);
2756 else
2757 rotateLoop(LoopChain, PreferredLoopExit, ExitFreq, LoopBlockSet);
2758
2759 LLVM_DEBUG({
2760 // Crash at the end so we get all of the debugging output first.
2761 bool BadLoop = false;
2762 if (LoopChain.UnscheduledPredecessors) {
2763 BadLoop = true;
2764 dbgs() << "Loop chain contains a block without its preds placed!\n"
2765 << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2766 << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n";
2767 }
2768 for (MachineBasicBlock *ChainBB : LoopChain) {
2769 dbgs() << " ... " << getBlockName(ChainBB) << "\n";
2770 if (!LoopBlockSet.remove(ChainBB)) {
2771 // We don't mark the loop as bad here because there are real situations
2772 // where this can occur. For example, with an unanalyzable fallthrough
2773 // from a loop block to a non-loop block or vice versa.
2774 dbgs() << "Loop chain contains a block not contained by the loop!\n"
2775 << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2776 << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2777 << " Bad block: " << getBlockName(ChainBB) << "\n";
2778 }
2779 }
2780
2781 if (!LoopBlockSet.empty()) {
2782 BadLoop = true;
2783 for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2784 dbgs() << "Loop contains blocks never placed into a chain!\n"
2785 << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2786 << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2787 << " Bad block: " << getBlockName(LoopBB) << "\n";
2788 }
2789 assert(!BadLoop && "Detected problems with the placement of this loop.");
2790 });
2791
2792 BlockWorkList.clear();
2793 EHPadWorkList.clear();
2794}
2795
2796void MachineBlockPlacement::buildCFGChains() {
2797 // Ensure that every BB in the function has an associated chain to simplify
2798 // the assumptions of the remaining algorithm.
2799 SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
2800 for (MachineFunction::iterator FI = F->begin(), FE = F->end(); FI != FE;
2801 ++FI) {
2802 MachineBasicBlock *BB = &*FI;
2803 BlockChain *Chain =
2804 new (ChainAllocator.Allocate()) BlockChain(BlockToChain, BB);
2805 // Also, merge any blocks which we cannot reason about and must preserve
2806 // the exact fallthrough behavior for.
2807 while (true) {
2808 Cond.clear();
2809 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2810 if (!TII->analyzeBranch(*BB, TBB, FBB, Cond) || !FI->canFallThrough())
2811 break;
2812
2813 MachineFunction::iterator NextFI = std::next(FI);
2814 MachineBasicBlock *NextBB = &*NextFI;
2815 // Ensure that the layout successor is a viable block, as we know that
2816 // fallthrough is a possibility.
2817 assert(NextFI != FE && "Can't fallthrough past the last block.");
2818 LLVM_DEBUG(dbgs() << "Pre-merging due to unanalyzable fallthrough: "
2819 << getBlockName(BB) << " -> " << getBlockName(NextBB)
2820 << "\n");
2821 Chain->merge(NextBB, nullptr);
2822#ifndef NDEBUG
2823 BlocksWithUnanalyzableExits.insert(&*BB);
2824#endif
2825 FI = NextFI;
2826 BB = NextBB;
2827 }
2828 }
2829
2830 // Build any loop-based chains.
2831 PreferredLoopExit = nullptr;
2832 for (MachineLoop *L : *MLI)
2833 buildLoopChains(*L);
2834
2835 assert(BlockWorkList.empty() &&
2836 "BlockWorkList should be empty before building final chain.");
2837 assert(EHPadWorkList.empty() &&
2838 "EHPadWorkList should be empty before building final chain.");
2839
2840 SmallPtrSet<BlockChain *, 4> UpdatedPreds;
2841 for (MachineBasicBlock &MBB : *F)
2842 fillWorkLists(&MBB, UpdatedPreds);
2843
2844 BlockChain &FunctionChain = *BlockToChain[&F->front()];
2845 buildChain(&F->front(), FunctionChain);
2846
2847#ifndef NDEBUG
2848 using FunctionBlockSetType = SmallPtrSet<MachineBasicBlock *, 16>;
2849#endif
2850 LLVM_DEBUG({
2851 // Crash at the end so we get all of the debugging output first.
2852 bool BadFunc = false;
2853 FunctionBlockSetType FunctionBlockSet;
2854 for (MachineBasicBlock &MBB : *F)
2855 FunctionBlockSet.insert(&MBB);
2856
2857 for (MachineBasicBlock *ChainBB : FunctionChain)
2858 if (!FunctionBlockSet.erase(ChainBB)) {
2859 BadFunc = true;
2860 dbgs() << "Function chain contains a block not in the function!\n"
2861 << " Bad block: " << getBlockName(ChainBB) << "\n";
2862 }
2863
2864 if (!FunctionBlockSet.empty()) {
2865 BadFunc = true;
2866 for (MachineBasicBlock *RemainingBB : FunctionBlockSet)
2867 dbgs() << "Function contains blocks never placed into a chain!\n"
2868 << " Bad block: " << getBlockName(RemainingBB) << "\n";
2869 }
2870 assert(!BadFunc && "Detected problems with the block placement.");
2871 });
2872
2873 // Remember original layout ordering, so we can update terminators after
2874 // reordering to point to the original layout successor.
2875 SmallVector<MachineBasicBlock *, 4> OriginalLayoutSuccessors(
2876 F->getNumBlockIDs());
2877 {
2878 MachineBasicBlock *LastMBB = nullptr;
2879 for (auto &MBB : *F) {
2880 if (LastMBB != nullptr)
2881 OriginalLayoutSuccessors[LastMBB->getNumber()] = &MBB;
2882 LastMBB = &MBB;
2883 }
2884 OriginalLayoutSuccessors[F->back().getNumber()] = nullptr;
2885 }
2886
2887 // Splice the blocks into place.
2888 MachineFunction::iterator InsertPos = F->begin();
2889 LLVM_DEBUG(dbgs() << "[MBP] Function: " << F->getName() << "\n");
2890 for (MachineBasicBlock *ChainBB : FunctionChain) {
2891 LLVM_DEBUG(dbgs() << (ChainBB == *FunctionChain.begin() ? "Placing chain "
2892 : " ... ")
2893 << getBlockName(ChainBB) << "\n");
2894 if (InsertPos != MachineFunction::iterator(ChainBB))
2895 F->splice(InsertPos, ChainBB);
2896 else
2897 ++InsertPos;
2898
2899 // Update the terminator of the previous block.
2900 if (ChainBB == *FunctionChain.begin())
2901 continue;
2902 MachineBasicBlock *PrevBB = &*std::prev(MachineFunction::iterator(ChainBB));
2903
2904 // FIXME: It would be awesome of updateTerminator would just return rather
2905 // than assert when the branch cannot be analyzed in order to remove this
2906 // boiler plate.
2907 Cond.clear();
2908 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2909
2910#ifndef NDEBUG
2911 if (!BlocksWithUnanalyzableExits.count(PrevBB)) {
2912 // Given the exact block placement we chose, we may actually not _need_ to
2913 // be able to edit PrevBB's terminator sequence, but not being _able_ to
2914 // do that at this point is a bug.
2915 assert((!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond) ||
2916 !PrevBB->canFallThrough()) &&
2917 "Unexpected block with un-analyzable fallthrough!");
2918 Cond.clear();
2919 TBB = FBB = nullptr;
2920 }
2921#endif
2922
2923 // The "PrevBB" is not yet updated to reflect current code layout, so,
2924 // o. it may fall-through to a block without explicit "goto" instruction
2925 // before layout, and no longer fall-through it after layout; or
2926 // o. just opposite.
2927 //
2928 // analyzeBranch() may return erroneous value for FBB when these two
2929 // situations take place. For the first scenario FBB is mistakenly set NULL;
2930 // for the 2nd scenario, the FBB, which is expected to be NULL, is
2931 // mistakenly pointing to "*BI".
2932 // Thus, if the future change needs to use FBB before the layout is set, it
2933 // has to correct FBB first by using the code similar to the following:
2934 //
2935 // if (!Cond.empty() && (!FBB || FBB == ChainBB)) {
2936 // PrevBB->updateTerminator();
2937 // Cond.clear();
2938 // TBB = FBB = nullptr;
2939 // if (TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) {
2940 // // FIXME: This should never take place.
2941 // TBB = FBB = nullptr;
2942 // }
2943 // }
2944 if (!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) {
2945 PrevBB->updateTerminator(OriginalLayoutSuccessors[PrevBB->getNumber()]);
2946 }
2947 }
2948
2949 // Fixup the last block.
2950 Cond.clear();
2951 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2952 if (!TII->analyzeBranch(F->back(), TBB, FBB, Cond)) {
2953 MachineBasicBlock *PrevBB = &F->back();
2954 PrevBB->updateTerminator(OriginalLayoutSuccessors[PrevBB->getNumber()]);
2955 }
2956
2957 BlockWorkList.clear();
2958 EHPadWorkList.clear();
2959}
2960
2961void MachineBlockPlacement::optimizeBranches() {
2962 BlockChain &FunctionChain = *BlockToChain[&F->front()];
2964
2965 // Now that all the basic blocks in the chain have the proper layout,
2966 // make a final call to analyzeBranch with AllowModify set.
2967 // Indeed, the target may be able to optimize the branches in a way we
2968 // cannot because all branches may not be analyzable.
2969 // E.g., the target may be able to remove an unconditional branch to
2970 // a fallthrough when it occurs after predicated terminators.
2971 for (MachineBasicBlock *ChainBB : FunctionChain) {
2972 Cond.clear();
2973 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
2974 if (TII->analyzeBranch(*ChainBB, TBB, FBB, Cond, /*AllowModify*/ true))
2975 continue;
2976 if (!TBB || !FBB || Cond.empty())
2977 continue;
2978 // If we are optimizing for size we do not consider the runtime performance.
2979 // Instead, we retain the original branch condition so we have more uniform
2980 // instructions which will benefit ICF.
2981 if (llvm::shouldOptimizeForSize(ChainBB, PSI, MBFI.get()))
2982 continue;
2983 // If ChainBB has a two-way branch, try to re-order the branches
2984 // such that we branch to the successor with higher probability first.
2985 if (MBPI->getEdgeProbability(ChainBB, TBB) >=
2986 MBPI->getEdgeProbability(ChainBB, FBB))
2987 continue;
2989 continue;
2990 LLVM_DEBUG(dbgs() << "Reverse order of the two branches: "
2991 << getBlockName(ChainBB) << "\n");
2992 LLVM_DEBUG(dbgs() << " " << getBlockName(TBB) << " < " << getBlockName(FBB)
2993 << "\n");
2994 auto Dl = ChainBB->findBranchDebugLoc();
2995 TII->removeBranch(*ChainBB);
2996 TII->insertBranch(*ChainBB, FBB, TBB, Cond, Dl);
2997 }
2998}
2999
3000void MachineBlockPlacement::alignBlocks() {
3001 // Walk through the backedges of the function now that we have fully laid out
3002 // the basic blocks and align the destination of each backedge. We don't rely
3003 // exclusively on the loop info here so that we can align backedges in
3004 // unnatural CFGs and backedges that were introduced purely because of the
3005 // loop rotations done during this layout pass.
3007 if (F->getFunction().hasMinSize() ||
3008 (F->getFunction().hasOptSize() && !TLI->alignLoopsWithOptSize()))
3009 return;
3010 }
3011
3012 BlockChain &FunctionChain = *BlockToChain[&F->front()];
3013 // Empty chain.
3014 if (FunctionChain.begin() == FunctionChain.end())
3015 return;
3016
3017 const BranchProbability ColdProb(1, 5); // 20%
3018 BlockFrequency EntryFreq = MBFI->getBlockFreq(&F->front());
3019 BlockFrequency WeightedEntryFreq = EntryFreq * ColdProb;
3020 for (MachineBasicBlock *ChainBB : FunctionChain) {
3021 if (ChainBB == *FunctionChain.begin())
3022 continue;
3023
3024 // Don't align non-looping basic blocks. These are unlikely to execute
3025 // enough times to matter in practice. Note that we'll still handle
3026 // unnatural CFGs inside of a natural outer loop (the common case) and
3027 // rotated loops.
3028 MachineLoop *L = MLI->getLoopFor(ChainBB);
3029 if (!L)
3030 continue;
3031
3032 const Align TLIAlign = TLI->getPrefLoopAlignment(L);
3033 unsigned MDAlign = 1;
3034 MDNode *LoopID = L->getLoopID();
3035 if (LoopID) {
3036 for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
3037 MDNode *MD = dyn_cast<MDNode>(MDO);
3038 if (MD == nullptr)
3039 continue;
3040 MDString *S = dyn_cast<MDString>(MD->getOperand(0));
3041 if (S == nullptr)
3042 continue;
3043 if (S->getString() == "llvm.loop.align") {
3044 assert(MD->getNumOperands() == 2 &&
3045 "per-loop align metadata should have two operands.");
3046 MDAlign =
3047 mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
3048 assert(MDAlign >= 1 && "per-loop align value must be positive.");
3049 }
3050 }
3051 }
3052
3053 // Use max of the TLIAlign and MDAlign
3054 const Align LoopAlign = std::max(TLIAlign, Align(MDAlign));
3055 if (LoopAlign == 1)
3056 continue; // Don't care about loop alignment.
3057
3058 // If the block is cold relative to the function entry don't waste space
3059 // aligning it.
3060 BlockFrequency Freq = MBFI->getBlockFreq(ChainBB);
3061 if (Freq < WeightedEntryFreq)
3062 continue;
3063
3064 // If the block is cold relative to its loop header, don't align it
3065 // regardless of what edges into the block exist.
3066 MachineBasicBlock *LoopHeader = L->getHeader();
3067 BlockFrequency LoopHeaderFreq = MBFI->getBlockFreq(LoopHeader);
3068 if (Freq < (LoopHeaderFreq * ColdProb))
3069 continue;
3070
3071 // If the global profiles indicates so, don't align it.
3072 if (llvm::shouldOptimizeForSize(ChainBB, PSI, MBFI.get()) &&
3073 !TLI->alignLoopsWithOptSize())
3074 continue;
3075
3076 // Check for the existence of a non-layout predecessor which would benefit
3077 // from aligning this block.
3078 MachineBasicBlock *LayoutPred =
3079 &*std::prev(MachineFunction::iterator(ChainBB));
3080
3081 auto DetermineMaxAlignmentPadding = [&]() {
3082 // Set the maximum bytes allowed to be emitted for alignment.
3083 unsigned MaxBytes;
3084 if (MaxBytesForAlignmentOverride.getNumOccurrences() > 0)
3086 else
3087 MaxBytes = TLI->getMaxPermittedBytesForAlignment(ChainBB);
3088 ChainBB->setMaxBytesForAlignment(MaxBytes);
3089 };
3090
3091 // Force alignment if all the predecessors are jumps. We already checked
3092 // that the block isn't cold above.
3093 if (!LayoutPred->isSuccessor(ChainBB)) {
3094 ChainBB->setAlignment(LoopAlign);
3095 DetermineMaxAlignmentPadding();
3096 continue;
3097 }
3098
3099 // Align this block if the layout predecessor's edge into this block is
3100 // cold relative to the block. When this is true, other predecessors make up
3101 // all of the hot entries into the block and thus alignment is likely to be
3102 // important.
3103 BranchProbability LayoutProb =
3104 MBPI->getEdgeProbability(LayoutPred, ChainBB);
3105 BlockFrequency LayoutEdgeFreq = MBFI->getBlockFreq(LayoutPred) * LayoutProb;
3106 if (LayoutEdgeFreq <= (Freq * ColdProb)) {
3107 ChainBB->setAlignment(LoopAlign);
3108 DetermineMaxAlignmentPadding();
3109 }
3110 }
3111
3112 const bool HasMaxBytesOverride =
3113 MaxBytesForAlignmentOverride.getNumOccurrences() > 0;
3114
3115 if (AlignAllBlock)
3116 // Align all of the blocks in the function to a specific alignment.
3117 for (MachineBasicBlock &MBB : *F) {
3118 if (HasMaxBytesOverride)
3121 else
3123 }
3124 else if (AlignAllNonFallThruBlocks) {
3125 // Align all of the blocks that have no fall-through predecessors to a
3126 // specific alignment.
3127 for (auto MBI = std::next(F->begin()), MBE = F->end(); MBI != MBE; ++MBI) {
3128 auto LayoutPred = std::prev(MBI);
3129 if (!LayoutPred->isSuccessor(&*MBI)) {
3130 if (HasMaxBytesOverride)
3131 MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks),
3133 else
3134 MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks));
3135 }
3136 }
3137 }
3138}
3139
3140/// Tail duplicate \p BB into (some) predecessors if profitable, repeating if
3141/// it was duplicated into its chain predecessor and removed.
3142/// \p BB - Basic block that may be duplicated.
3143///
3144/// \p LPred - Chosen layout predecessor of \p BB.
3145/// Updated to be the chain end if LPred is removed.
3146/// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
3147/// \p BlockFilter - Set of blocks that belong to the loop being laid out.
3148/// Used to identify which blocks to update predecessor
3149/// counts.
3150/// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
3151/// chosen in the given order due to unnatural CFG
3152/// only needed if \p BB is removed and
3153/// \p PrevUnplacedBlockIt pointed to \p BB.
3154/// @return true if \p BB was removed.
3155bool MachineBlockPlacement::repeatedlyTailDuplicateBlock(
3156 MachineBasicBlock *BB, MachineBasicBlock *&LPred,
3157 const MachineBasicBlock *LoopHeaderBB, BlockChain &Chain,
3158 BlockFilterSet *BlockFilter, MachineFunction::iterator &PrevUnplacedBlockIt,
3159 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt) {
3160 bool Removed, DuplicatedToLPred;
3161 bool DuplicatedToOriginalLPred;
3162 Removed = maybeTailDuplicateBlock(
3163 BB, LPred, Chain, BlockFilter, PrevUnplacedBlockIt,
3164 PrevUnplacedBlockInFilterIt, DuplicatedToLPred);
3165 if (!Removed)
3166 return false;
3167 DuplicatedToOriginalLPred = DuplicatedToLPred;
3168 // Iteratively try to duplicate again. It can happen that a block that is
3169 // duplicated into is still small enough to be duplicated again.
3170 // No need to call markBlockSuccessors in this case, as the blocks being
3171 // duplicated from here on are already scheduled.
3172 while (DuplicatedToLPred && Removed) {
3173 MachineBasicBlock *DupBB, *DupPred;
3174 // The removal callback causes Chain.end() to be updated when a block is
3175 // removed. On the first pass through the loop, the chain end should be the
3176 // same as it was on function entry. On subsequent passes, because we are
3177 // duplicating the block at the end of the chain, if it is removed the
3178 // chain will have shrunk by one block.
3179 BlockChain::iterator ChainEnd = Chain.end();
3180 DupBB = *(--ChainEnd);
3181 // Now try to duplicate again.
3182 if (ChainEnd == Chain.begin())
3183 break;
3184 DupPred = *std::prev(ChainEnd);
3185 Removed = maybeTailDuplicateBlock(
3186 DupBB, DupPred, Chain, BlockFilter, PrevUnplacedBlockIt,
3187 PrevUnplacedBlockInFilterIt, DuplicatedToLPred);
3188 }
3189 // If BB was duplicated into LPred, it is now scheduled. But because it was
3190 // removed, markChainSuccessors won't be called for its chain. Instead we
3191 // call markBlockSuccessors for LPred to achieve the same effect. This must go
3192 // at the end because repeating the tail duplication can increase the number
3193 // of unscheduled predecessors.
3194 LPred = *std::prev(Chain.end());
3195 if (DuplicatedToOriginalLPred)
3196 markBlockSuccessors(Chain, LPred, LoopHeaderBB, BlockFilter);
3197 return true;
3198}
3199
3200/// Tail duplicate \p BB into (some) predecessors if profitable.
3201/// \p BB - Basic block that may be duplicated
3202/// \p LPred - Chosen layout predecessor of \p BB
3203/// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
3204/// \p BlockFilter - Set of blocks that belong to the loop being laid out.
3205/// Used to identify which blocks to update predecessor
3206/// counts.
3207/// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
3208/// chosen in the given order due to unnatural CFG
3209/// only needed if \p BB is removed and
3210/// \p PrevUnplacedBlockIt pointed to \p BB.
3211/// \p DuplicatedToLPred - True if the block was duplicated into LPred.
3212/// \return - True if the block was duplicated into all preds and removed.
3213bool MachineBlockPlacement::maybeTailDuplicateBlock(
3214 MachineBasicBlock *BB, MachineBasicBlock *LPred, BlockChain &Chain,
3215 BlockFilterSet *BlockFilter, MachineFunction::iterator &PrevUnplacedBlockIt,
3216 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt,
3217 bool &DuplicatedToLPred) {
3218 DuplicatedToLPred = false;
3219 if (!shouldTailDuplicate(BB))
3220 return false;
3221
3222 LLVM_DEBUG(dbgs() << "Redoing tail duplication for Succ#" << BB->getNumber()
3223 << "\n");
3224
3225 // This has to be a callback because none of it can be done after
3226 // BB is deleted.
3227 bool Removed = false;
3228 auto RemovalCallback = [&](MachineBasicBlock *RemBB) {
3229 // Signal to outer function
3230 Removed = true;
3231
3232 // Remove from the Chain and Chain Map
3233 if (auto It = BlockToChain.find(RemBB); It != BlockToChain.end()) {
3234 It->second->remove(RemBB);
3235 BlockToChain.erase(It);
3236 }
3237
3238 // Handle the unplaced block iterator
3239 if (&(*PrevUnplacedBlockIt) == RemBB) {
3240 PrevUnplacedBlockIt++;
3241 }
3242
3243 // Handle the Work Lists
3244 if (RemBB->isEHPad()) {
3245 llvm::erase(EHPadWorkList, RemBB);
3246 } else {
3247 llvm::erase(BlockWorkList, RemBB);
3248 }
3249
3250 // Handle the filter set
3251 if (BlockFilter) {
3252 auto It = llvm::find(*BlockFilter, RemBB);
3253 // Erase RemBB from BlockFilter, and keep PrevUnplacedBlockInFilterIt
3254 // pointing to the same element as before.
3255 if (It != BlockFilter->end()) {
3256 if (It < PrevUnplacedBlockInFilterIt) {
3257 const MachineBasicBlock *PrevBB = *PrevUnplacedBlockInFilterIt;
3258 // BlockFilter is a SmallVector so all elements after RemBB are
3259 // shifted to the front by 1 after its deletion.
3260 auto Distance = PrevUnplacedBlockInFilterIt - It - 1;
3261 PrevUnplacedBlockInFilterIt = BlockFilter->erase(It) + Distance;
3262 assert(*PrevUnplacedBlockInFilterIt == PrevBB);
3263 (void)PrevBB;
3264 } else if (It == PrevUnplacedBlockInFilterIt)
3265 // The block pointed by PrevUnplacedBlockInFilterIt is erased, we
3266 // have to set it to the next element.
3267 PrevUnplacedBlockInFilterIt = BlockFilter->erase(It);
3268 else
3269 BlockFilter->erase(It);
3270 }
3271 }
3272
3273 // Remove the block from loop info.
3274 MLI->removeBlock(RemBB);
3275 if (RemBB == PreferredLoopExit)
3276 PreferredLoopExit = nullptr;
3277
3278 LLVM_DEBUG(dbgs() << "TailDuplicator deleted block: " << getBlockName(RemBB)
3279 << "\n");
3280 };
3281 auto RemovalCallbackRef =
3282 function_ref<void(MachineBasicBlock *)>(RemovalCallback);
3283
3285 bool IsSimple = TailDup.isSimpleBB(BB);
3287 SmallVectorImpl<MachineBasicBlock *> *CandidatePtr = nullptr;
3288 if (F->getFunction().hasProfileData()) {
3289 // We can do partial duplication with precise profile information.
3290 findDuplicateCandidates(CandidatePreds, BB, BlockFilter);
3291 if (CandidatePreds.size() == 0)
3292 return false;
3293 if (CandidatePreds.size() < BB->pred_size())
3294 CandidatePtr = &CandidatePreds;
3295 }
3296 TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred, &DuplicatedPreds,
3297 &RemovalCallbackRef, CandidatePtr);
3298
3299 // Update UnscheduledPredecessors to reflect tail-duplication.
3300 DuplicatedToLPred = false;
3301 for (MachineBasicBlock *Pred : DuplicatedPreds) {
3302 // We're only looking for unscheduled predecessors that match the filter.
3303 BlockChain *PredChain = BlockToChain[Pred];
3304 if (Pred == LPred)
3305 DuplicatedToLPred = true;
3306 if (Pred == LPred || (BlockFilter && !BlockFilter->count(Pred)) ||
3307 PredChain == &Chain)
3308 continue;
3309 for (MachineBasicBlock *NewSucc : Pred->successors()) {
3310 if (BlockFilter && !BlockFilter->count(NewSucc))
3311 continue;
3312 BlockChain *NewChain = BlockToChain[NewSucc];
3313 if (NewChain != &Chain && NewChain != PredChain)
3314 NewChain->UnscheduledPredecessors++;
3315 }
3316 }
3317 return Removed;
3318}
3319
3320// Count the number of actual machine instructions.
3322 uint64_t InstrCount = 0;
3323 for (MachineInstr &MI : *MBB) {
3324 if (!MI.isPHI() && !MI.isMetaInstruction())
3325 InstrCount += 1;
3326 }
3327 return InstrCount;
3328}
3329
3330// The size cost of duplication is the instruction size of the duplicated block.
3331// So we should scale the threshold accordingly. But the instruction size is not
3332// available on all targets, so we use the number of instructions instead.
3333BlockFrequency MachineBlockPlacement::scaleThreshold(MachineBasicBlock *BB) {
3334 return BlockFrequency(DupThreshold.getFrequency() * countMBBInstruction(BB));
3335}
3336
3337// Returns true if BB is Pred's best successor.
3338bool MachineBlockPlacement::isBestSuccessor(MachineBasicBlock *BB,
3339 MachineBasicBlock *Pred,
3340 BlockFilterSet *BlockFilter) {
3341 if (BB == Pred)
3342 return false;
3343 if (BlockFilter && !BlockFilter->count(Pred))
3344 return false;
3345 BlockChain *PredChain = BlockToChain[Pred];
3346 if (PredChain && (Pred != *std::prev(PredChain->end())))
3347 return false;
3348
3349 // Find the successor with largest probability excluding BB.
3350 BranchProbability BestProb = BranchProbability::getZero();
3351 for (MachineBasicBlock *Succ : Pred->successors())
3352 if (Succ != BB) {
3353 if (BlockFilter && !BlockFilter->count(Succ))
3354 continue;
3355 BlockChain *SuccChain = BlockToChain[Succ];
3356 if (SuccChain && (Succ != *SuccChain->begin()))
3357 continue;
3358 BranchProbability SuccProb = MBPI->getEdgeProbability(Pred, Succ);
3359 if (SuccProb > BestProb)
3360 BestProb = SuccProb;
3361 }
3362
3363 BranchProbability BBProb = MBPI->getEdgeProbability(Pred, BB);
3364 if (BBProb <= BestProb)
3365 return false;
3366
3367 // Compute the number of reduced taken branches if Pred falls through to BB
3368 // instead of another successor. Then compare it with threshold.
3369 BlockFrequency PredFreq = getBlockCountOrFrequency(Pred);
3370 BlockFrequency Gain = PredFreq * (BBProb - BestProb);
3371 return Gain > scaleThreshold(BB);
3372}
3373
3374// Find out the predecessors of BB and BB can be beneficially duplicated into
3375// them.
3376void MachineBlockPlacement::findDuplicateCandidates(
3377 SmallVectorImpl<MachineBasicBlock *> &Candidates, MachineBasicBlock *BB,
3378 BlockFilterSet *BlockFilter) {
3379 MachineBasicBlock *Fallthrough = nullptr;
3380 BranchProbability DefaultBranchProb = BranchProbability::getZero();
3381 BlockFrequency BBDupThreshold(scaleThreshold(BB));
3384
3385 // Sort for highest frequency.
3386 auto CmpSucc = [&](MachineBasicBlock *A, MachineBasicBlock *B) {
3387 return MBPI->getEdgeProbability(BB, A) > MBPI->getEdgeProbability(BB, B);
3388 };
3389 auto CmpPred = [&](MachineBasicBlock *A, MachineBasicBlock *B) {
3390 return MBFI->getBlockFreq(A) > MBFI->getBlockFreq(B);
3391 };
3392 llvm::stable_sort(Succs, CmpSucc);
3393 llvm::stable_sort(Preds, CmpPred);
3394
3395 auto SuccIt = Succs.begin();
3396 if (SuccIt != Succs.end()) {
3397 DefaultBranchProb = MBPI->getEdgeProbability(BB, *SuccIt).getCompl();
3398 }
3399
3400 // For each predecessors of BB, compute the benefit of duplicating BB,
3401 // if it is larger than the threshold, add it into Candidates.
3402 //
3403 // If we have following control flow.
3404 //
3405 // PB1 PB2 PB3 PB4
3406 // \ | / /\
3407 // \ | / / \
3408 // \ |/ / \
3409 // BB----/ OB
3410 // /\
3411 // / \
3412 // SB1 SB2
3413 //
3414 // And it can be partially duplicated as
3415 //
3416 // PB2+BB
3417 // | PB1 PB3 PB4
3418 // | | / /\
3419 // | | / / \
3420 // | |/ / \
3421 // | BB----/ OB
3422 // |\ /|
3423 // | X |
3424 // |/ \|
3425 // SB2 SB1
3426 //
3427 // The benefit of duplicating into a predecessor is defined as
3428 // Orig_taken_branch - Duplicated_taken_branch
3429 //
3430 // The Orig_taken_branch is computed with the assumption that predecessor
3431 // jumps to BB and the most possible successor is laid out after BB.
3432 //
3433 // The Duplicated_taken_branch is computed with the assumption that BB is
3434 // duplicated into PB, and one successor is layout after it (SB1 for PB1 and
3435 // SB2 for PB2 in our case). If there is no available successor, the combined
3436 // block jumps to all BB's successor, like PB3 in this example.
3437 //
3438 // If a predecessor has multiple successors, so BB can't be duplicated into
3439 // it. But it can beneficially fall through to BB, and duplicate BB into other
3440 // predecessors.
3441 for (MachineBasicBlock *Pred : Preds) {
3442 BlockFrequency PredFreq = getBlockCountOrFrequency(Pred);
3443
3444 if (!TailDup.canTailDuplicate(BB, Pred)) {
3445 // BB can't be duplicated into Pred, but it is possible to be layout
3446 // below Pred.
3447 if (!Fallthrough && isBestSuccessor(BB, Pred, BlockFilter)) {
3448 Fallthrough = Pred;
3449 if (SuccIt != Succs.end())
3450 SuccIt++;
3451 }
3452 continue;
3453 }
3454
3455 BlockFrequency OrigCost = PredFreq + PredFreq * DefaultBranchProb;
3456 BlockFrequency DupCost;
3457 if (SuccIt == Succs.end()) {
3458 // Jump to all successors;
3459 if (Succs.size() > 0)
3460 DupCost += PredFreq;
3461 } else {
3462 // Fallthrough to *SuccIt, jump to all other successors;
3463 DupCost += PredFreq;
3464 DupCost -= PredFreq * MBPI->getEdgeProbability(BB, *SuccIt);
3465 }
3466
3467 assert(OrigCost >= DupCost);
3468 OrigCost -= DupCost;
3469 if (OrigCost > BBDupThreshold) {
3470 Candidates.push_back(Pred);
3471 if (SuccIt != Succs.end())
3472 SuccIt++;
3473 }
3474 }
3475
3476 // No predecessors can optimally fallthrough to BB.
3477 // So we can change one duplication into fallthrough.
3478 if (!Fallthrough) {
3479 if ((Candidates.size() < Preds.size()) && (Candidates.size() > 0)) {
3480 Candidates[0] = Candidates.back();
3481 Candidates.pop_back();
3482 }
3483 }
3484}
3485
3486void MachineBlockPlacement::initTailDupThreshold() {
3487 DupThreshold = BlockFrequency(0);
3488 if (F->getFunction().hasProfileData()) {
3489 // We prefer to use prifile count.
3490 uint64_t HotThreshold = PSI->getOrCompHotCountThreshold();
3491 if (HotThreshold != UINT64_MAX) {
3492 UseProfileCount = true;
3493 DupThreshold =
3494 BlockFrequency(HotThreshold * TailDupProfilePercentThreshold / 100);
3495 } else {
3496 // Profile count is not available, we can use block frequency instead.
3497 BlockFrequency MaxFreq = BlockFrequency(0);
3498 for (MachineBasicBlock &MBB : *F) {
3499 BlockFrequency Freq = MBFI->getBlockFreq(&MBB);
3500 if (Freq > MaxFreq)
3501 MaxFreq = Freq;
3502 }
3503
3504 BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
3505 DupThreshold = BlockFrequency(MaxFreq * ThresholdProb);
3506 UseProfileCount = false;
3507 }
3508 }
3509
3510 TailDupSize = TailDupPlacementThreshold;
3511 // If only the aggressive threshold is explicitly set, use it.
3512 if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 &&
3513 TailDupPlacementThreshold.getNumOccurrences() == 0)
3515
3516 // For aggressive optimization, we can adjust some thresholds to be less
3517 // conservative.
3518 if (OptLevel >= CodeGenOptLevel::Aggressive) {
3519 // At O3 we should be more willing to copy blocks for tail duplication. This
3520 // increases size pressure, so we only do it at O3
3521 // Do this unless only the regular threshold is explicitly set.
3522 if (TailDupPlacementThreshold.getNumOccurrences() == 0 ||
3523 TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0)
3525 }
3526
3527 // If there's no threshold provided through options, query the target
3528 // information for a threshold instead.
3529 if (TailDupPlacementThreshold.getNumOccurrences() == 0 &&
3530 (OptLevel < CodeGenOptLevel::Aggressive ||
3531 TailDupPlacementAggressiveThreshold.getNumOccurrences() == 0))
3532 TailDupSize = TII->getTailDuplicateSize(OptLevel);
3533}
3534
3535PreservedAnalyses
3538 auto *MBPI = &MFAM.getResult<MachineBranchProbabilityAnalysis>(MF);
3539 auto MBFI = std::make_unique<MBFIWrapper>(
3541 auto *MLI = &MFAM.getResult<MachineLoopAnalysis>(MF);
3542 auto *MPDT = MachineBlockPlacement::allowTailDupPlacement(MF)
3544 : nullptr;
3546 .getCachedResult<ProfileSummaryAnalysis>(
3547 *MF.getFunction().getParent());
3548 if (!PSI)
3549 report_fatal_error("MachineBlockPlacement requires ProfileSummaryAnalysis",
3550 false);
3551 MachineBlockPlacement MBP(MBPI, MLI, PSI, std::move(MBFI), MPDT,
3552 AllowTailMerge);
3553
3554 if (MBP.run(MF))
3556
3557 return PreservedAnalyses::all();
3558}
3559
3561 raw_ostream &OS,
3562 function_ref<StringRef(StringRef)> MapClassName2PassName) const {
3563 OS << MapClassName2PassName(name());
3564 if (!AllowTailMerge)
3565 OS << "<no-tail-merge>";
3566}
3567
3568bool MachineBlockPlacement::run(MachineFunction &MF) {
3569
3570 // Check for single-block functions and skip them.
3571 if (std::next(MF.begin()) == MF.end())
3572 return false;
3573
3574 F = &MF;
3575 OptLevel = F->getTarget().getOptLevel();
3576
3577 TII = MF.getSubtarget().getInstrInfo();
3578 TLI = MF.getSubtarget().getTargetLowering();
3579
3580 // Initialize PreferredLoopExit to nullptr here since it may never be set if
3581 // there are no MachineLoops.
3582 PreferredLoopExit = nullptr;
3583
3584 assert(BlockToChain.empty() &&
3585 "BlockToChain map should be empty before starting placement.");
3586 assert(ComputedEdges.empty() &&
3587 "Computed Edge map should be empty before starting placement.");
3588
3589 // Initialize tail duplication thresholds.
3590 initTailDupThreshold();
3591
3592 const bool OptForSize =
3593 llvm::shouldOptimizeForSize(&MF, PSI, &MBFI->getMBFI());
3594 // Determine whether to use ext-tsp for perf/size optimization. The method
3595 // is beneficial only for instances with at least 3 basic blocks and it can be
3596 // disabled for huge functions (exceeding a certain size).
3597 bool UseExtTspForPerf = false;
3598 bool UseExtTspForSize = false;
3599 if (3 <= MF.size() && MF.size() <= ExtTspBlockPlacementMaxBlocks) {
3600 UseExtTspForSize = OptForSize && ApplyExtTspForSize;
3601 UseExtTspForPerf =
3602 !UseExtTspForSize && EnableExtTspBlockPlacement &&
3604 }
3605
3606 // Apply tail duplication.
3607 if (allowTailDupPlacement(*F)) {
3608 if (OptForSize)
3609 TailDupSize = 1;
3610 const bool PreRegAlloc = false;
3611 TailDup.initMF(MF, PreRegAlloc, MBPI, MBFI.get(), PSI,
3612 /* LayoutMode */ true, TailDupSize);
3613 if (!UseExtTspForSize)
3614 precomputeTriangleChains();
3615 }
3616
3617 // Run the main block placement.
3618 if (!UseExtTspForSize)
3619 buildCFGChains();
3620
3621 // Changing the layout can create new tail merging opportunities.
3622 // TailMerge can create jump into if branches that make CFG irreducible for
3623 // HW that requires structured CFG.
3624 const bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
3625 AllowTailMerge && BranchFoldPlacement &&
3626 MF.size() > 3;
3627 // No tail merging opportunities if the block number is less than four.
3628 if (EnableTailMerge) {
3629 const unsigned TailMergeSize = TailDupSize + 1;
3630 BranchFolder BF(/*DefaultEnableTailMerge=*/true, /*CommonHoist=*/false,
3631 *MBFI, *MBPI, PSI, TailMergeSize);
3632
3633 if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(), MLI,
3634 /*AfterPlacement=*/true)) {
3635 // Must redo the post-dominator tree if blocks were changed.
3636 if (MPDT)
3637 MPDT->recalculate(MF);
3638 if (!UseExtTspForSize) {
3639 // Redo the layout if tail merging creates/removes/moves blocks.
3640 BlockToChain.clear();
3641 ComputedEdges.clear();
3642 ChainAllocator.DestroyAll();
3643 buildCFGChains();
3644 }
3645 }
3646 }
3647
3648 // Apply a post-processing optimizing block placement:
3649 // - find a new placement and modify the layout of the blocks in the function;
3650 // - re-create CFG chains so that we can optimizeBranches and alignBlocks.
3651 if (UseExtTspForPerf || UseExtTspForSize) {
3652 assert(
3653 !(UseExtTspForPerf && UseExtTspForSize) &&
3654 "UseExtTspForPerf and UseExtTspForSize can not be set simultaneously");
3655 applyExtTsp(/*OptForSize=*/UseExtTspForSize);
3656 createCFGChainExtTsp();
3657 }
3658
3659 optimizeBranches();
3660 alignBlocks();
3661
3662 BlockToChain.clear();
3663 ComputedEdges.clear();
3664 ChainAllocator.DestroyAll();
3665
3666 // View the function.
3668 (ViewBlockFreqFuncName.empty() ||
3669 F->getFunction().getName() == ViewBlockFreqFuncName)) {
3671 MF.RenumberBlocks();
3672 MBFI->view("MBP." + MF.getName(), false);
3673 }
3674
3675 // We always return true as we have no way to track whether the final order
3676 // differs from the original order.
3677 return true;
3678}
3679
3680void MachineBlockPlacement::applyExtTsp(bool OptForSize) {
3681 // Prepare data; blocks are indexed by their index in the current ordering.
3682 DenseMap<const MachineBasicBlock *, uint64_t> BlockIndex;
3683 BlockIndex.reserve(F->size());
3684 std::vector<const MachineBasicBlock *> CurrentBlockOrder;
3685 CurrentBlockOrder.reserve(F->size());
3686 size_t NumBlocks = 0;
3687 for (const MachineBasicBlock &MBB : *F) {
3688 BlockIndex[&MBB] = NumBlocks++;
3689 CurrentBlockOrder.push_back(&MBB);
3690 }
3691
3692 SmallVector<uint64_t, 0> BlockCounts(F->size());
3693 SmallVector<uint64_t, 0> BlockSizes(F->size());
3695 SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
3697 for (MachineBasicBlock &MBB : *F) {
3698 // Getting the block frequency.
3699 BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
3700 BlockCounts[BlockIndex[&MBB]] = OptForSize ? 1 : BlockFreq.getFrequency();
3701 // Getting the block size:
3702 // - approximate the size of an instruction by 4 bytes, and
3703 // - ignore debug instructions.
3704 // Note: getting the exact size of each block is target-dependent and can be
3705 // done by extending the interface of MCCodeEmitter. Experimentally we do
3706 // not see a perf improvement with the exact block sizes.
3707 auto NonDbgInsts =
3709 size_t NumInsts = std::distance(NonDbgInsts.begin(), NonDbgInsts.end());
3710 BlockSizes[BlockIndex[&MBB]] = 4 * NumInsts;
3711
3712 // Getting jump frequencies.
3713 if (OptForSize) {
3714 Cond.clear();
3715 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
3716 if (TII->analyzeBranch(MBB, TBB, FBB, Cond))
3717 continue;
3718
3719 const MachineBasicBlock *FTB = MBB.getFallThrough();
3720 // Succs is a collection of distinct destinations of the block reachable
3721 // from MBB via a jump instruction; initialize the list using the three
3722 // (non-necessarily distinct) blocks, FTB, TBB, and FBB.
3723 Succs.clear();
3724 if (TBB && TBB != FTB)
3725 Succs.push_back(TBB);
3726 if (FBB && FBB != FTB)
3727 Succs.push_back(FBB);
3728 if (FTB)
3729 Succs.push_back(FTB);
3730 // Absolute magnitude of non-zero counts does not matter for the
3731 // optimization; prioritize slightly jumps with a single successor, since
3732 // the corresponding jump instruction will be removed from the binary.
3733 const uint64_t Freq = Succs.size() == 1 ? 110 : 100;
3734 for (const MachineBasicBlock *Succ : Succs)
3735 JumpCounts.push_back({BlockIndex[&MBB], BlockIndex[Succ], Freq});
3736 } else {
3737 for (MachineBasicBlock *Succ : MBB.successors()) {
3738 auto EP = MBPI->getEdgeProbability(&MBB, Succ);
3739 BlockFrequency JumpFreq = BlockFreq * EP;
3740 JumpCounts.push_back(
3741 {BlockIndex[&MBB], BlockIndex[Succ], JumpFreq.getFrequency()});
3742 }
3743 }
3744 }
3745
3746 LLVM_DEBUG(dbgs() << "Applying ext-tsp layout for |V| = " << F->size()
3747 << " with profile = " << F->getFunction().hasProfileData()
3748 << " (" << F->getName() << ")" << "\n");
3749
3750 const double OrgScore = calcExtTspScore(BlockSizes, JumpCounts);
3751 LLVM_DEBUG(dbgs() << format(" original layout score: %0.2f\n", OrgScore));
3752
3753 // Run the layout algorithm.
3754 auto NewOrder = computeExtTspLayout(BlockSizes, BlockCounts, JumpCounts);
3755 std::vector<const MachineBasicBlock *> NewBlockOrder;
3756 NewBlockOrder.reserve(F->size());
3757 for (uint64_t Node : NewOrder) {
3758 NewBlockOrder.push_back(CurrentBlockOrder[Node]);
3759 }
3760 const double OptScore = calcExtTspScore(NewOrder, BlockSizes, JumpCounts);
3761 LLVM_DEBUG(dbgs() << format(" optimized layout score: %0.2f\n", OptScore));
3762
3763 // If the optimization is unsuccessful, fall back to the original block order.
3764 if (OptForSize && OrgScore > OptScore)
3765 assignBlockOrder(CurrentBlockOrder);
3766 else
3767 assignBlockOrder(NewBlockOrder);
3768}
3769
3770void MachineBlockPlacement::assignBlockOrder(
3771 const std::vector<const MachineBasicBlock *> &NewBlockOrder) {
3772 assert(F->size() == NewBlockOrder.size() && "Incorrect size of block order");
3773 F->RenumberBlocks();
3774
3775 bool HasChanges = false;
3776 for (size_t I = 0; I < NewBlockOrder.size(); I++) {
3777 if (NewBlockOrder[I] != F->getBlockNumbered(I)) {
3778 HasChanges = true;
3779 break;
3780 }
3781 }
3782 // Stop early if the new block order is identical to the existing one.
3783 if (!HasChanges)
3784 return;
3785
3786 SmallVector<MachineBasicBlock *, 4> PrevFallThroughs(F->getNumBlockIDs());
3787 for (auto &MBB : *F) {
3788 PrevFallThroughs[MBB.getNumber()] = MBB.getFallThrough();
3789 }
3790
3791 // Sort basic blocks in the function according to the computed order.
3792 DenseMap<const MachineBasicBlock *, size_t> NewIndex;
3793 for (const MachineBasicBlock *MBB : NewBlockOrder) {
3794 NewIndex[MBB] = NewIndex.size();
3795 }
3796 F->sort([&](MachineBasicBlock &L, MachineBasicBlock &R) {
3797 return NewIndex[&L] < NewIndex[&R];
3798 });
3799
3800 // Update basic block branches by inserting explicit fallthrough branches
3801 // when required and re-optimize branches when possible.
3802 const TargetInstrInfo *TII = F->getSubtarget().getInstrInfo();
3804 for (auto &MBB : *F) {
3805 MachineFunction::iterator NextMBB = std::next(MBB.getIterator());
3807 auto *FTMBB = PrevFallThroughs[MBB.getNumber()];
3808 // If this block had a fallthrough before we need an explicit unconditional
3809 // branch to that block if the fallthrough block is not adjacent to the
3810 // block in the new order.
3811 if (FTMBB && (NextMBB == EndIt || &*NextMBB != FTMBB)) {
3812 TII->insertUnconditionalBranch(MBB, FTMBB, MBB.findBranchDebugLoc());
3813 }
3814
3815 // It might be possible to optimize branches by flipping the condition.
3816 Cond.clear();
3817 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
3818 if (TII->analyzeBranch(MBB, TBB, FBB, Cond))
3819 continue;
3820 MBB.updateTerminator(FTMBB);
3821 }
3822}
3823
3824void MachineBlockPlacement::createCFGChainExtTsp() {
3825 BlockToChain.clear();
3826 ComputedEdges.clear();
3827 ChainAllocator.DestroyAll();
3828
3829 MachineBasicBlock *HeadBB = &F->front();
3830 BlockChain *FunctionChain =
3831 new (ChainAllocator.Allocate()) BlockChain(BlockToChain, HeadBB);
3832
3833 for (MachineBasicBlock &MBB : *F) {
3834 if (HeadBB == &MBB)
3835 continue; // Ignore head of the chain
3836 FunctionChain->merge(&MBB, nullptr);
3837 }
3838}
3839
3840namespace {
3841
3842/// A pass to compute block placement statistics.
3843///
3844/// A separate pass to compute interesting statistics for evaluating block
3845/// placement. This is separate from the actual placement pass so that they can
3846/// be computed in the absence of any placement transformations or when using
3847/// alternative placement strategies.
3848class MachineBlockPlacementStats {
3849 /// A handle to the branch probability pass.
3850 const MachineBranchProbabilityInfo *MBPI;
3851
3852 /// A handle to the function-wide block frequency pass.
3853 const MachineBlockFrequencyInfo *MBFI;
3854
3855public:
3856 MachineBlockPlacementStats(const MachineBranchProbabilityInfo *MBPI,
3857 const MachineBlockFrequencyInfo *MBFI)
3858 : MBPI(MBPI), MBFI(MBFI) {}
3859 bool run(MachineFunction &MF);
3860};
3861
3862class MachineBlockPlacementStatsLegacy : public MachineFunctionPass {
3863public:
3864 static char ID; // Pass identification, replacement for typeid
3865
3866 MachineBlockPlacementStatsLegacy() : MachineFunctionPass(ID) {}
3867
3868 bool runOnMachineFunction(MachineFunction &F) override {
3869 auto *MBPI =
3870 &getAnalysis<MachineBranchProbabilityInfoWrapperPass>().getMBPI();
3871 auto *MBFI = &getAnalysis<MachineBlockFrequencyInfoWrapperPass>().getMBFI();
3872 return MachineBlockPlacementStats(MBPI, MBFI).run(F);
3873 }
3874
3875 void getAnalysisUsage(AnalysisUsage &AU) const override {
3876 AU.addRequired<MachineBranchProbabilityInfoWrapperPass>();
3877 AU.addRequired<MachineBlockFrequencyInfoWrapperPass>();
3878 AU.setPreservesAll();
3880 }
3881};
3882
3883} // end anonymous namespace
3884
3885char MachineBlockPlacementStatsLegacy::ID = 0;
3886
3887char &llvm::MachineBlockPlacementStatsID = MachineBlockPlacementStatsLegacy::ID;
3888
3889INITIALIZE_PASS_BEGIN(MachineBlockPlacementStatsLegacy, "block-placement-stats",
3890 "Basic Block Placement Stats", false, false)
3893INITIALIZE_PASS_END(MachineBlockPlacementStatsLegacy, "block-placement-stats",
3894 "Basic Block Placement Stats", false, false)
3895
3899 auto &MBPI = MFAM.getResult<MachineBranchProbabilityAnalysis>(MF);
3900 auto &MBFI = MFAM.getResult<MachineBlockFrequencyAnalysis>(MF);
3901
3902 MachineBlockPlacementStats(&MBPI, &MBFI).run(MF);
3903 return PreservedAnalyses::all();
3904}
3905
3906bool MachineBlockPlacementStats::run(MachineFunction &F) {
3907 // Check for single-block functions and skip them.
3908 if (std::next(F.begin()) == F.end())
3909 return false;
3910
3911 if (!isFunctionInPrintList(F.getName()))
3912 return false;
3913
3914 for (MachineBasicBlock &MBB : F) {
3915 BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
3916 Statistic &NumBranches =
3917 (MBB.succ_size() > 1) ? NumCondBranches : NumUncondBranches;
3918 Statistic &BranchTakenFreq =
3919 (MBB.succ_size() > 1) ? CondBranchTakenFreq : UncondBranchTakenFreq;
3920 for (MachineBasicBlock *Succ : MBB.successors()) {
3921 // Skip if this successor is a fallthrough.
3922 if (MBB.isLayoutSuccessor(Succ))
3923 continue;
3924
3925 BlockFrequency EdgeFreq =
3926 BlockFreq * MBPI->getEdgeProbability(&MBB, Succ);
3927 ++NumBranches;
3928 BranchTakenFreq += EdgeFreq.getFrequency();
3929 }
3930 }
3931
3932 return false;
3933}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock & MBB
This file defines the BumpPtrAllocator interface.
#define X(NUM, ENUM, NAME)
Definition ELF.h:851
static cl::opt< unsigned > TailMergeSize("tail-merge-size", cl::desc("Min number of instructions to consider tail merging"), cl::init(3), cl::Hidden)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Declares methods and data structures for code layout algorithms.
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:661
static unsigned InstrCount
This file defines the DenseMap class.
#define DEBUG_TYPE
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static BranchProbability getAdjustedProbability(BranchProbability OrigProb, BranchProbability AdjustedSumProb)
The helper function returns the branch probability that is adjusted or normalized over the new total ...
static cl::opt< bool > PreciseRotationCost("precise-rotation-cost", cl::desc("Model the cost of loop rotation more " "precisely by using profile data."), cl::init(false), cl::Hidden)
static cl::opt< unsigned > ExtTspBlockPlacementMaxBlocks("ext-tsp-block-placement-max-blocks", cl::desc("Maximum number of basic blocks in a function to run ext-TSP " "block placement."), cl::init(UINT_MAX), cl::Hidden)
static cl::opt< unsigned > AlignAllBlock("align-all-blocks", cl::desc("Force the alignment of all blocks in the function in log2 format " "(e.g 4 means align on 16B boundaries)."), cl::init(0), cl::Hidden)
static cl::opt< unsigned > PredecessorLimit("block-placement-predecessor-limit", cl::desc("For blocks with more predecessors, certain layout optimizations" "will be disabled to prevent quadratic compile time."), cl::init(1000), cl::Hidden)
static BranchProbability getLayoutSuccessorProbThreshold(const MachineBasicBlock *BB)
static cl::opt< bool > ForceLoopColdBlock("force-loop-cold-block", cl::desc("Force outlining cold blocks from loops."), cl::init(false), cl::Hidden)
static cl::opt< unsigned > ExitBlockBias("block-placement-exit-block-bias", cl::desc("Block frequency percentage a loop exit block needs " "over the original exit to be considered the new exit."), cl::init(0), cl::Hidden)
static cl::opt< unsigned > AlignAllNonFallThruBlocks("align-all-nofallthru-blocks", cl::desc("Force the alignment of all blocks that have no fall-through " "predecessors (i.e. don't add nops that are executed). In log2 " "format (e.g 4 means align on 16B boundaries)."), cl::init(0), cl::Hidden)
static cl::opt< unsigned > TailDupPlacementThreshold("tail-dup-placement-threshold", cl::desc("Instruction cutoff for tail duplication during layout. " "Tail merging during layout is forced to have a threshold " "that won't conflict."), cl::init(2), cl::Hidden)
static cl::opt< unsigned > JumpInstCost("jump-inst-cost", cl::desc("Cost of jump instructions."), cl::init(1), cl::Hidden)
static cl::opt< unsigned > TailDupPlacementPenalty("tail-dup-placement-penalty", cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. " "Copying can increase fallthrough, but it also increases icache " "pressure. This parameter controls the penalty to account for that. " "Percent as integer."), cl::init(2), cl::Hidden)
static bool greaterWithBias(BlockFrequency A, BlockFrequency B, BlockFrequency EntryFreq)
Compare 2 BlockFrequency's with a small penalty for A.
static cl::opt< unsigned > MisfetchCost("misfetch-cost", cl::desc("Cost that models the probabilistic risk of an instruction " "misfetch due to a jump comparing to falling through, whose cost " "is zero."), cl::init(1), cl::Hidden)
static cl::opt< unsigned > MaxBytesForAlignmentOverride("max-bytes-for-alignment", cl::desc("Forces the maximum bytes allowed to be emitted when padding for " "alignment"), cl::init(0), cl::Hidden)
static cl::opt< bool > BranchFoldPlacement("branch-fold-placement", cl::desc("Perform branch folding during placement. " "Reduces code size."), cl::init(true), cl::Hidden)
static cl::opt< unsigned > TailDupProfilePercentThreshold("tail-dup-profile-percent-threshold", cl::desc("If profile count information is used in tail duplication cost " "model, the gained fall through number from tail duplication " "should be at least this percent of hot count."), cl::init(50), cl::Hidden)
static cl::opt< unsigned > TriangleChainCount("triangle-chain-count", cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the " "triangle tail duplication heuristic to kick in. 0 to disable."), cl::init(2), cl::Hidden)
Branch Probability Basic Block static false std::string getBlockName(const MachineBasicBlock *BB)
Helper to print the name of a MBB.
static cl::opt< bool > ApplyExtTspForSize("apply-ext-tsp-for-size", cl::init(false), cl::Hidden, cl::desc("Use ext-tsp for size-aware block placement."))
static bool hasSameSuccessors(MachineBasicBlock &BB, SmallPtrSetImpl< const MachineBasicBlock * > &Successors)
Check if BB has exactly the successors in Successors.
static cl::opt< bool > TailDupPlacement("tail-dup-placement", cl::desc("Perform tail duplication during placement. " "Creates more fallthrough opportunities in " "outline branches."), cl::init(true), cl::Hidden)
static uint64_t countMBBInstruction(MachineBasicBlock *MBB)
static cl::opt< unsigned > LoopToColdBlockRatio("loop-to-cold-block-ratio", cl::desc("Outline loop blocks from loop chain if (frequency of loop) / " "(frequency of block) is greater than this ratio"), cl::init(5), cl::Hidden)
static cl::opt< bool > RenumberBlocksBeforeView("renumber-blocks-before-view", cl::desc("If true, basic blocks are re-numbered before MBP layout is printed " "into a dot graph. Only used when a function is being printed."), cl::init(false), cl::Hidden)
static cl::opt< unsigned > TailDupPlacementAggressiveThreshold("tail-dup-placement-aggressive-threshold", cl::desc("Instruction cutoff for aggressive tail duplication during " "layout. Used at -O3. Tail merging during layout is forced to " "have a threshold that won't conflict."), cl::init(4), cl::Hidden)
static cl::opt< bool > ForcePreciseRotationCost("force-precise-rotation-cost", cl::desc("Force the use of precise cost " "loop rotation strategy."), cl::init(false), cl::Hidden)
#define P(N)
static bool hasProfileData(const Function &F, const FunctionOutliningInfo &OI)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
static const char * name
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
This file describes how to lower LLVM code to machine code.
Target-Independent Code Generator Pass Configuration Options pass.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
AnalysisUsage & addRequired()
void setPreservesAll()
Set by analyses that do not transform their input at all.
static BlockFrequency max()
Returns the maximum possible frequency, the saturation value.
uint64_t getFrequency() const
Returns the frequency as a fixpoint number scaled by the entry frequency.
static BranchProbability getOne()
uint32_t getNumerator() const
BranchProbability getCompl() const
static BranchProbability getZero()
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
bool erase(const KeyT &Val)
Definition DenseMap.h:330
unsigned size() const
Definition DenseMap.h:110
bool empty() const
Definition DenseMap.h:109
iterator end()
Definition DenseMap.h:81
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:169
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
void reserve(size_type NumEntries)
Grow the densemap so that it can contain at least NumEntries items before resizing again.
Definition DenseMap.h:114
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
void recalculate(ParentType &Func)
recalculate - compute a dominator tree for the given function
bool hasProfileData(bool IncludeSynthetic=false) const
Return true if the function is annotated with profile data.
Definition Function.h:336
Module * getParent()
Get the module that this global value is contained inside of...
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
Remove the branching code at the end of the specific MBB.
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const override
Analyze the branching code at the end of MBB, returning true if it cannot be understood (e....
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
Reverses the branch condition of the specified condition list, returning false on success and true if...
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
Insert branch code into the end of the specified MachineBasicBlock.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1444
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1442
unsigned getNumOperands() const
Return number of MDNode operands.
Definition Metadata.h:1450
LLVM_ABI StringRef getString() const
Definition Metadata.cpp:632
bool isEHPad() const
Returns true if the block is a landing pad.
LLVM_ABI MachineBasicBlock * getFallThrough(bool JumpToFallThrough=true)
Return the fallthrough block if the block can implicitly transfer control to the block after it by fa...
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFuncti...
LLVM_ABI void updateTerminator(MachineBasicBlock *PreviousLayoutSuccessor)
Update the terminator instructions in block to account for changes to block layout which may have bee...
LLVM_ABI bool canFallThrough()
Return true if the block can implicitly transfer control to the block after it by falling off the end...
void setAlignment(Align A)
Set alignment of the basic block.
LLVM_ABI bool isEntryBlock() const
Returns true if this is the entry block of the function.
succ_reverse_iterator succ_rbegin()
LLVM_ABI bool isLayoutSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB will be emitted immediately after this block, such that if this bloc...
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
LLVM_ABI DebugLoc findBranchDebugLoc()
Find and return the merged DebugLoc of the branch instructions of the block.
iterator_range< succ_iterator > successors()
LLVM_ABI bool isSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB is a successor of this block.
iterator_range< pred_iterator > predecessors()
LLVM_ABI StringRef getName() const
Return the name of the corresponding LLVM basic block, or an empty string.
LLVM_ABI Result run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
void printPipeline(raw_ostream &OS, function_ref< StringRef(StringRef)> MapClassName2PassName) const
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
BranchProbability getEdgeProbability(const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
void RenumberBlocks(MachineBasicBlock *MBBFrom=nullptr)
RenumberBlocks - This discards all of the MachineBasicBlock numbers and recomputes them.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Representation of each machine instruction.
Analysis pass that exposes the MachineLoopInfo for a machine function.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
An analysis pass based on legacy pass manager to deliver ProfileSummaryInfo.
LLVM_ABI uint64_t getOrCompHotCountThreshold() const
Returns HotCountThreshold if set.
typename vector_type::const_iterator iterator
Definition SetVector.h:72
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
reference emplace_back(ArgTypes &&... Args)
iterator erase(const_iterator CI)
typename SuperClass::const_iterator const_iterator
typename SuperClass::iterator iterator
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
T * Allocate(size_t num=1)
Allocate space for an array of objects without constructing them.
Definition Allocator.h:440
void DestroyAll()
Call the destructor of each allocated object and deallocate all but the current slab and reset the cu...
Definition Allocator.h:411
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
void initMF(MachineFunction &MF, bool PreRegAlloc, const MachineBranchProbabilityInfo *MBPI, MBFIWrapper *MBFI, ProfileSummaryInfo *PSI, bool LayoutMode, unsigned TailDupSize=0)
Prepare to run on a specific machine function.
bool tailDuplicateAndUpdate(bool IsSimple, MachineBasicBlock *MBB, MachineBasicBlock *ForcedLayoutPred, SmallVectorImpl< MachineBasicBlock * > *DuplicatedPreds=nullptr, function_ref< void(MachineBasicBlock *)> *RemovalCallback=nullptr, SmallVectorImpl< MachineBasicBlock * > *CandidatePtr=nullptr)
Tail duplicate a single basic block into its predecessors, and then clean up.
static bool isSimpleBB(MachineBasicBlock *TailBB)
True if this BB has only one unconditional jump.
bool canTailDuplicate(MachineBasicBlock *TailBB, MachineBasicBlock *PredBB)
Returns true if TailBB can successfully be duplicated into PredBB.
bool shouldTailDuplicate(bool IsSimple, MachineBasicBlock &TailBB)
Determine if it is profitable to duplicate this block.
virtual unsigned getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const
Return the maximum amount of bytes allowed to be emitted when padding for alignment.
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
virtual bool alignLoopsWithOptSize() const
Should loops be aligned even when the function is marked OptSize (but not MinSize).
bool requiresStructuredCFG() const
virtual const TargetInstrInfo * getInstrInfo() const
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
virtual const TargetLowering * getTargetLowering() const
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
Definition ilist_node.h:123
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
A raw_ostream that writes to an std::string.
#define UINT64_MAX
Definition DataTypes.h:77
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
initializer< Ty > init(const Ty &Val)
LLVM_ABI double calcExtTspScore(ArrayRef< uint64_t > Order, ArrayRef< uint64_t > NodeSizes, ArrayRef< EdgeCount > EdgeCounts)
Estimate the "quality" of a given node order in CFG.
LLVM_ABI std::vector< uint64_t > computeExtTspLayout(ArrayRef< uint64_t > NodeSizes, ArrayRef< uint64_t > NodeCounts, ArrayRef< EdgeCount > EdgeCounts)
Find a layout of nodes (basic blocks) of a given CFG optimizing jump locality and thus processor I-ca...
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract(Y &&MD)
Extract a Value from Metadata.
Definition Metadata.h:668
LLVM_ABI void append(SmallVectorImpl< char > &path, const Twine &a, const Twine &b="", const Twine &c="", const Twine &d="")
Append to path.
Definition Path.cpp:457
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
void stable_sort(R &&Range)
Definition STLExtras.h:2116
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1765
OuterAnalysisManagerProxy< ModuleAnalysisManager, MachineFunction > ModuleAnalysisManagerMachineFunctionProxy
Provide the ModuleAnalysisManager to Function proxy.
InstructionCost Cost
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
cl::opt< bool > ApplyExtTspWithoutProfile
constexpr from_range_t from_range
LLVM_ABI bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
cl::opt< unsigned > ProfileLikelyProb
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition STLExtras.h:2200
cl::opt< std::string > ViewBlockFreqFuncName("view-bfi-func-name", cl::Hidden, cl::desc("The option to specify " "the name of the function " "whose CFG will be displayed."))
NoopStatistic Statistic
Definition Statistic.h:162
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
cl::opt< GVDAGType > ViewBlockLayoutWithBFI("view-block-layout-with-bfi", cl::Hidden, cl::desc("Pop up a window to show a dag displaying MBP layout and associated " "block frequencies of the CFG."), cl::values(clEnumValN(GVDT_None, "none", "do not display graphs."), clEnumValN(GVDT_Fraction, "fraction", "display a graph using the " "fractional block frequency representation."), clEnumValN(GVDT_Integer, "integer", "display a graph using the raw " "integer fractional block frequency representation."), clEnumValN(GVDT_Count, "count", "display a graph using the real " "profile count if available.")))
bool isFunctionInPrintList(StringRef FunctionName)
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
auto instructionsWithoutDebug(IterT It, IterT End, bool SkipPseudoOp=true)
Construct a range iterator which begins at It and moves forwards until End is reached,...
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:129
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2012
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1917
cl::opt< unsigned > StaticLikelyProb
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition STLExtras.h:2192
LLVM_ABI Printable printBlockFreq(const BlockFrequencyInfo &BFI, BlockFrequency Freq)
Print the block frequency Freq relative to the current functions entry frequency.
LLVM_ABI char & MachineBlockPlacementID
MachineBlockPlacement - This pass places basic blocks based on branch probabilities.
cl::opt< bool > EnableExtTspBlockPlacement
LLVM_ABI char & MachineBlockPlacementStatsID
MachineBlockPlacementStats - This pass collects statistics about the basic block placement using bran...
LLVM_ABI Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872