LLVM  15.0.0git
HotColdSplitting.cpp
Go to the documentation of this file.
1 //===- HotColdSplitting.cpp -- Outline Cold Regions -------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// The goal of hot/cold splitting is to improve the memory locality of code.
11 /// The splitting pass does this by identifying cold blocks and moving them into
12 /// separate functions.
13 ///
14 /// When the splitting pass finds a cold block (referred to as "the sink"), it
15 /// grows a maximal cold region around that block. The maximal region contains
16 /// all blocks (post-)dominated by the sink [*]. In theory, these blocks are as
17 /// cold as the sink. Once a region is found, it's split out of the original
18 /// function provided it's profitable to do so.
19 ///
20 /// [*] In practice, there is some added complexity because some blocks are not
21 /// safe to extract.
22 ///
23 /// TODO: Use the PM to get domtrees, and preserve BFI/BPI.
24 /// TODO: Reorder outlined functions.
25 ///
26 //===----------------------------------------------------------------------===//
27 
30 #include "llvm/ADT/SmallVector.h"
31 #include "llvm/ADT/Statistic.h"
38 #include "llvm/IR/BasicBlock.h"
39 #include "llvm/IR/CFG.h"
40 #include "llvm/IR/DiagnosticInfo.h"
41 #include "llvm/IR/Dominators.h"
42 #include "llvm/IR/Function.h"
43 #include "llvm/IR/Instruction.h"
44 #include "llvm/IR/Instructions.h"
45 #include "llvm/IR/Module.h"
46 #include "llvm/IR/PassManager.h"
47 #include "llvm/IR/User.h"
48 #include "llvm/IR/Value.h"
49 #include "llvm/InitializePasses.h"
50 #include "llvm/Pass.h"
52 #include "llvm/Support/Debug.h"
54 #include "llvm/Transforms/IPO.h"
56 #include <algorithm>
57 #include <cassert>
58 #include <limits>
59 #include <string>
60 
61 #define DEBUG_TYPE "hotcoldsplit"
62 
63 STATISTIC(NumColdRegionsFound, "Number of cold regions found.");
64 STATISTIC(NumColdRegionsOutlined, "Number of cold regions outlined.");
65 
66 using namespace llvm;
67 
68 static cl::opt<bool> EnableStaticAnalysis("hot-cold-static-analysis",
69  cl::init(true), cl::Hidden);
70 
71 static cl::opt<int>
72  SplittingThreshold("hotcoldsplit-threshold", cl::init(2), cl::Hidden,
73  cl::desc("Base penalty for splitting cold code (as a "
74  "multiple of TCC_Basic)"));
75 
77  "enable-cold-section", cl::init(false), cl::Hidden,
78  cl::desc("Enable placement of extracted cold functions"
79  " into a separate section after hot-cold splitting."));
80 
82  ColdSectionName("hotcoldsplit-cold-section-name", cl::init("__llvm_cold"),
83  cl::Hidden,
84  cl::desc("Name for the section containing cold functions "
85  "extracted by hot-cold splitting."));
86 
88  "hotcoldsplit-max-params", cl::init(4), cl::Hidden,
89  cl::desc("Maximum number of parameters for a split function"));
90 
91 namespace {
92 // Same as blockEndsInUnreachable in CodeGen/BranchFolding.cpp. Do not modify
93 // this function unless you modify the MBB version as well.
94 //
95 /// A no successor, non-return block probably ends in unreachable and is cold.
96 /// Also consider a block that ends in an indirect branch to be a return block,
97 /// since many targets use plain indirect branches to return.
99  if (!succ_empty(&BB))
100  return false;
101  if (BB.empty())
102  return true;
103  const Instruction *I = BB.getTerminator();
104  return !(isa<ReturnInst>(I) || isa<IndirectBrInst>(I));
105 }
106 
107 bool unlikelyExecuted(BasicBlock &BB) {
108  // Exception handling blocks are unlikely executed.
109  if (BB.isEHPad() || isa<ResumeInst>(BB.getTerminator()))
110  return true;
111 
112  // The block is cold if it calls/invokes a cold function. However, do not
113  // mark sanitizer traps as cold.
114  for (Instruction &I : BB)
115  if (auto *CB = dyn_cast<CallBase>(&I))
116  if (CB->hasFnAttr(Attribute::Cold) && !CB->getMetadata("nosanitize"))
117  return true;
118 
119  // The block is cold if it has an unreachable terminator, unless it's
120  // preceded by a call to a (possibly warm) noreturn call (e.g. longjmp).
121  if (blockEndsInUnreachable(BB)) {
122  if (auto *CI =
123  dyn_cast_or_null<CallInst>(BB.getTerminator()->getPrevNode()))
124  if (CI->hasFnAttr(Attribute::NoReturn))
125  return false;
126  return true;
127  }
128 
129  return false;
130 }
131 
132 /// Check whether it's safe to outline \p BB.
133 static bool mayExtractBlock(const BasicBlock &BB) {
134  // EH pads are unsafe to outline because doing so breaks EH type tables. It
135  // follows that invoke instructions cannot be extracted, because CodeExtractor
136  // requires unwind destinations to be within the extraction region.
137  //
138  // Resumes that are not reachable from a cleanup landing pad are considered to
139  // be unreachable. It’s not safe to split them out either.
140  if (BB.hasAddressTaken() || BB.isEHPad())
141  return false;
142  auto Term = BB.getTerminator();
143  return !isa<InvokeInst>(Term) && !isa<ResumeInst>(Term);
144 }
145 
146 /// Mark \p F cold. Based on this assumption, also optimize it for minimum size.
147 /// If \p UpdateEntryCount is true (set when this is a new split function and
148 /// module has profile data), set entry count to 0 to ensure treated as cold.
149 /// Return true if the function is changed.
150 static bool markFunctionCold(Function &F, bool UpdateEntryCount = false) {
151  assert(!F.hasOptNone() && "Can't mark this cold");
152  bool Changed = false;
153  if (!F.hasFnAttribute(Attribute::Cold)) {
154  F.addFnAttr(Attribute::Cold);
155  Changed = true;
156  }
157  if (!F.hasFnAttribute(Attribute::MinSize)) {
158  F.addFnAttr(Attribute::MinSize);
159  Changed = true;
160  }
161  if (UpdateEntryCount) {
162  // Set the entry count to 0 to ensure it is placed in the unlikely text
163  // section when function sections are enabled.
164  F.setEntryCount(0);
165  Changed = true;
166  }
167 
168  return Changed;
169 }
170 
171 class HotColdSplittingLegacyPass : public ModulePass {
172 public:
173  static char ID;
174  HotColdSplittingLegacyPass() : ModulePass(ID) {
176  }
177 
178  void getAnalysisUsage(AnalysisUsage &AU) const override {
183  }
184 
185  bool runOnModule(Module &M) override;
186 };
187 
188 } // end anonymous namespace
189 
190 /// Check whether \p F is inherently cold.
191 bool HotColdSplitting::isFunctionCold(const Function &F) const {
192  if (F.hasFnAttribute(Attribute::Cold))
193  return true;
194 
195  if (F.getCallingConv() == CallingConv::Cold)
196  return true;
197 
198  if (PSI->isFunctionEntryCold(&F))
199  return true;
200 
201  return false;
202 }
203 
204 // Returns false if the function should not be considered for hot-cold split
205 // optimization.
206 bool HotColdSplitting::shouldOutlineFrom(const Function &F) const {
207  if (F.hasFnAttribute(Attribute::AlwaysInline))
208  return false;
209 
210  if (F.hasFnAttribute(Attribute::NoInline))
211  return false;
212 
213  // A function marked `noreturn` may contain unreachable terminators: these
214  // should not be considered cold, as the function may be a trampoline.
215  if (F.hasFnAttribute(Attribute::NoReturn))
216  return false;
217 
218  if (F.hasFnAttribute(Attribute::SanitizeAddress) ||
219  F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
220  F.hasFnAttribute(Attribute::SanitizeThread) ||
221  F.hasFnAttribute(Attribute::SanitizeMemory))
222  return false;
223 
224  return true;
225 }
226 
227 /// Get the benefit score of outlining \p Region.
230  // Sum up the code size costs of non-terminator instructions. Tight coupling
231  // with \ref getOutliningPenalty is needed to model the costs of terminators.
232  InstructionCost Benefit = 0;
233  for (BasicBlock *BB : Region)
234  for (Instruction &I : BB->instructionsWithoutDebug())
235  if (&I != BB->getTerminator())
236  Benefit +=
238 
239  return Benefit;
240 }
241 
242 /// Get the penalty score for outlining \p Region.
244  unsigned NumInputs, unsigned NumOutputs) {
245  int Penalty = SplittingThreshold;
246  LLVM_DEBUG(dbgs() << "Applying penalty for splitting: " << Penalty << "\n");
247 
248  // If the splitting threshold is set at or below zero, skip the usual
249  // profitability check.
250  if (SplittingThreshold <= 0)
251  return Penalty;
252 
253  // Find the number of distinct exit blocks for the region. Use a conservative
254  // check to determine whether control returns from the region.
255  bool NoBlocksReturn = true;
256  SmallPtrSet<BasicBlock *, 2> SuccsOutsideRegion;
257  for (BasicBlock *BB : Region) {
258  // If a block has no successors, only assume it does not return if it's
259  // unreachable.
260  if (succ_empty(BB)) {
261  NoBlocksReturn &= isa<UnreachableInst>(BB->getTerminator());
262  continue;
263  }
264 
265  for (BasicBlock *SuccBB : successors(BB)) {
266  if (!is_contained(Region, SuccBB)) {
267  NoBlocksReturn = false;
268  SuccsOutsideRegion.insert(SuccBB);
269  }
270  }
271  }
272 
273  // Count the number of phis in exit blocks with >= 2 incoming values from the
274  // outlining region. These phis are split (\ref severSplitPHINodesOfExits),
275  // and new outputs are created to supply the split phis. CodeExtractor can't
276  // report these new outputs until extraction begins, but it's important to
277  // factor the cost of the outputs into the cost calculation.
278  unsigned NumSplitExitPhis = 0;
279  for (BasicBlock *ExitBB : SuccsOutsideRegion) {
280  for (PHINode &PN : ExitBB->phis()) {
281  // Find all incoming values from the outlining region.
282  int NumIncomingVals = 0;
283  for (unsigned i = 0; i < PN.getNumIncomingValues(); ++i)
284  if (llvm::is_contained(Region, PN.getIncomingBlock(i))) {
285  ++NumIncomingVals;
286  if (NumIncomingVals > 1) {
287  ++NumSplitExitPhis;
288  break;
289  }
290  }
291  }
292  }
293 
294  // Apply a penalty for calling the split function. Factor in the cost of
295  // materializing all of the parameters.
296  int NumOutputsAndSplitPhis = NumOutputs + NumSplitExitPhis;
297  int NumParams = NumInputs + NumOutputsAndSplitPhis;
298  if (NumParams > MaxParametersForSplit) {
299  LLVM_DEBUG(dbgs() << NumInputs << " inputs and " << NumOutputsAndSplitPhis
300  << " outputs exceeds parameter limit ("
301  << MaxParametersForSplit << ")\n");
303  }
304  const int CostForArgMaterialization = 2 * TargetTransformInfo::TCC_Basic;
305  LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumParams << " params\n");
306  Penalty += CostForArgMaterialization * NumParams;
307 
308  // Apply the typical code size cost for an output alloca and its associated
309  // reload in the caller. Also penalize the associated store in the callee.
310  LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumOutputsAndSplitPhis
311  << " outputs/split phis\n");
312  const int CostForRegionOutput = 3 * TargetTransformInfo::TCC_Basic;
313  Penalty += CostForRegionOutput * NumOutputsAndSplitPhis;
314 
315  // Apply a `noreturn` bonus.
316  if (NoBlocksReturn) {
317  LLVM_DEBUG(dbgs() << "Applying bonus for: " << Region.size()
318  << " non-returning terminators\n");
319  Penalty -= Region.size();
320  }
321 
322  // Apply a penalty for having more than one successor outside of the region.
323  // This penalty accounts for the switch needed in the caller.
324  if (SuccsOutsideRegion.size() > 1) {
325  LLVM_DEBUG(dbgs() << "Applying penalty for: " << SuccsOutsideRegion.size()
326  << " non-region successors\n");
327  Penalty += (SuccsOutsideRegion.size() - 1) * TargetTransformInfo::TCC_Basic;
328  }
329 
330  return Penalty;
331 }
332 
333 Function *HotColdSplitting::extractColdRegion(
334  const BlockSequence &Region, const CodeExtractorAnalysisCache &CEAC,
336  OptimizationRemarkEmitter &ORE, AssumptionCache *AC, unsigned Count) {
337  assert(!Region.empty());
338 
339  // TODO: Pass BFI and BPI to update profile information.
340  CodeExtractor CE(Region, &DT, /* AggregateArgs */ false, /* BFI */ nullptr,
341  /* BPI */ nullptr, AC, /* AllowVarArgs */ false,
342  /* AllowAlloca */ false, /* AllocaBlock */ nullptr,
343  /* Suffix */ "cold." + std::to_string(Count));
344 
345  // Perform a simple cost/benefit analysis to decide whether or not to permit
346  // splitting.
347  SetVector<Value *> Inputs, Outputs, Sinks;
348  CE.findInputsOutputs(Inputs, Outputs, Sinks);
349  InstructionCost OutliningBenefit = getOutliningBenefit(Region, TTI);
350  int OutliningPenalty =
351  getOutliningPenalty(Region, Inputs.size(), Outputs.size());
352  LLVM_DEBUG(dbgs() << "Split profitability: benefit = " << OutliningBenefit
353  << ", penalty = " << OutliningPenalty << "\n");
354  if (!OutliningBenefit.isValid() || OutliningBenefit <= OutliningPenalty)
355  return nullptr;
356 
357  Function *OrigF = Region[0]->getParent();
358  if (Function *OutF = CE.extractCodeRegion(CEAC)) {
359  User *U = *OutF->user_begin();
360  CallInst *CI = cast<CallInst>(U);
361  NumColdRegionsOutlined++;
362  if (TTI.useColdCCForColdCall(*OutF)) {
363  OutF->setCallingConv(CallingConv::Cold);
365  }
366  CI->setIsNoInline();
367 
368  if (EnableColdSection)
369  OutF->setSection(ColdSectionName);
370  else {
371  if (OrigF->hasSection())
372  OutF->setSection(OrigF->getSection());
373  }
374 
375  markFunctionCold(*OutF, BFI != nullptr);
376 
377  LLVM_DEBUG(llvm::dbgs() << "Outlined Region: " << *OutF);
378  ORE.emit([&]() {
379  return OptimizationRemark(DEBUG_TYPE, "HotColdSplit",
380  &*Region[0]->begin())
381  << ore::NV("Original", OrigF) << " split cold code into "
382  << ore::NV("Split", OutF);
383  });
384  return OutF;
385  }
386 
387  ORE.emit([&]() {
388  return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
389  &*Region[0]->begin())
390  << "Failed to extract region at block "
391  << ore::NV("Block", Region.front());
392  });
393  return nullptr;
394 }
395 
396 /// A pair of (basic block, score).
397 using BlockTy = std::pair<BasicBlock *, unsigned>;
398 
399 namespace {
400 /// A maximal outlining region. This contains all blocks post-dominated by a
401 /// sink block, the sink block itself, and all blocks dominated by the sink.
402 /// If sink-predecessors and sink-successors cannot be extracted in one region,
403 /// the static constructor returns a list of suitable extraction regions.
404 class OutliningRegion {
405  /// A list of (block, score) pairs. A block's score is non-zero iff it's a
406  /// viable sub-region entry point. Blocks with higher scores are better entry
407  /// points (i.e. they are more distant ancestors of the sink block).
408  SmallVector<BlockTy, 0> Blocks = {};
409 
410  /// The suggested entry point into the region. If the region has multiple
411  /// entry points, all blocks within the region may not be reachable from this
412  /// entry point.
413  BasicBlock *SuggestedEntryPoint = nullptr;
414 
415  /// Whether the entire function is cold.
416  bool EntireFunctionCold = false;
417 
418  /// If \p BB is a viable entry point, return \p Score. Return 0 otherwise.
419  static unsigned getEntryPointScore(BasicBlock &BB, unsigned Score) {
420  return mayExtractBlock(BB) ? Score : 0;
421  }
422 
423  /// These scores should be lower than the score for predecessor blocks,
424  /// because regions starting at predecessor blocks are typically larger.
425  static constexpr unsigned ScoreForSuccBlock = 1;
426  static constexpr unsigned ScoreForSinkBlock = 1;
427 
428  OutliningRegion(const OutliningRegion &) = delete;
429  OutliningRegion &operator=(const OutliningRegion &) = delete;
430 
431 public:
432  OutliningRegion() = default;
433  OutliningRegion(OutliningRegion &&) = default;
434  OutliningRegion &operator=(OutliningRegion &&) = default;
435 
436  static std::vector<OutliningRegion> create(BasicBlock &SinkBB,
437  const DominatorTree &DT,
438  const PostDominatorTree &PDT) {
439  std::vector<OutliningRegion> Regions;
440  SmallPtrSet<BasicBlock *, 4> RegionBlocks;
441 
442  Regions.emplace_back();
443  OutliningRegion *ColdRegion = &Regions.back();
444 
445  auto addBlockToRegion = [&](BasicBlock *BB, unsigned Score) {
446  RegionBlocks.insert(BB);
447  ColdRegion->Blocks.emplace_back(BB, Score);
448  };
449 
450  // The ancestor farthest-away from SinkBB, and also post-dominated by it.
451  unsigned SinkScore = getEntryPointScore(SinkBB, ScoreForSinkBlock);
452  ColdRegion->SuggestedEntryPoint = (SinkScore > 0) ? &SinkBB : nullptr;
453  unsigned BestScore = SinkScore;
454 
455  // Visit SinkBB's ancestors using inverse DFS.
456  auto PredIt = ++idf_begin(&SinkBB);
457  auto PredEnd = idf_end(&SinkBB);
458  while (PredIt != PredEnd) {
459  BasicBlock &PredBB = **PredIt;
460  bool SinkPostDom = PDT.dominates(&SinkBB, &PredBB);
461 
462  // If the predecessor is cold and has no predecessors, the entire
463  // function must be cold.
464  if (SinkPostDom && pred_empty(&PredBB)) {
465  ColdRegion->EntireFunctionCold = true;
466  return Regions;
467  }
468 
469  // If SinkBB does not post-dominate a predecessor, do not mark the
470  // predecessor (or any of its predecessors) cold.
471  if (!SinkPostDom || !mayExtractBlock(PredBB)) {
472  PredIt.skipChildren();
473  continue;
474  }
475 
476  // Keep track of the post-dominated ancestor farthest away from the sink.
477  // The path length is always >= 2, ensuring that predecessor blocks are
478  // considered as entry points before the sink block.
479  unsigned PredScore = getEntryPointScore(PredBB, PredIt.getPathLength());
480  if (PredScore > BestScore) {
481  ColdRegion->SuggestedEntryPoint = &PredBB;
482  BestScore = PredScore;
483  }
484 
485  addBlockToRegion(&PredBB, PredScore);
486  ++PredIt;
487  }
488 
489  // If the sink can be added to the cold region, do so. It's considered as
490  // an entry point before any sink-successor blocks.
491  //
492  // Otherwise, split cold sink-successor blocks using a separate region.
493  // This satisfies the requirement that all extraction blocks other than the
494  // first have predecessors within the extraction region.
495  if (mayExtractBlock(SinkBB)) {
496  addBlockToRegion(&SinkBB, SinkScore);
497  if (pred_empty(&SinkBB)) {
498  ColdRegion->EntireFunctionCold = true;
499  return Regions;
500  }
501  } else {
502  Regions.emplace_back();
503  ColdRegion = &Regions.back();
504  BestScore = 0;
505  }
506 
507  // Find all successors of SinkBB dominated by SinkBB using DFS.
508  auto SuccIt = ++df_begin(&SinkBB);
509  auto SuccEnd = df_end(&SinkBB);
510  while (SuccIt != SuccEnd) {
511  BasicBlock &SuccBB = **SuccIt;
512  bool SinkDom = DT.dominates(&SinkBB, &SuccBB);
513 
514  // Don't allow the backwards & forwards DFSes to mark the same block.
515  bool DuplicateBlock = RegionBlocks.count(&SuccBB);
516 
517  // If SinkBB does not dominate a successor, do not mark the successor (or
518  // any of its successors) cold.
519  if (DuplicateBlock || !SinkDom || !mayExtractBlock(SuccBB)) {
520  SuccIt.skipChildren();
521  continue;
522  }
523 
524  unsigned SuccScore = getEntryPointScore(SuccBB, ScoreForSuccBlock);
525  if (SuccScore > BestScore) {
526  ColdRegion->SuggestedEntryPoint = &SuccBB;
527  BestScore = SuccScore;
528  }
529 
530  addBlockToRegion(&SuccBB, SuccScore);
531  ++SuccIt;
532  }
533 
534  return Regions;
535  }
536 
537  /// Whether this region has nothing to extract.
538  bool empty() const { return !SuggestedEntryPoint; }
539 
540  /// The blocks in this region.
541  ArrayRef<std::pair<BasicBlock *, unsigned>> blocks() const { return Blocks; }
542 
543  /// Whether the entire function containing this region is cold.
544  bool isEntireFunctionCold() const { return EntireFunctionCold; }
545 
546  /// Remove a sub-region from this region and return it as a block sequence.
547  BlockSequence takeSingleEntrySubRegion(DominatorTree &DT) {
548  assert(!empty() && !isEntireFunctionCold() && "Nothing to extract");
549 
550  // Remove blocks dominated by the suggested entry point from this region.
551  // During the removal, identify the next best entry point into the region.
552  // Ensure that the first extracted block is the suggested entry point.
553  BlockSequence SubRegion = {SuggestedEntryPoint};
554  BasicBlock *NextEntryPoint = nullptr;
555  unsigned NextScore = 0;
556  auto RegionEndIt = Blocks.end();
557  auto RegionStartIt = remove_if(Blocks, [&](const BlockTy &Block) {
558  BasicBlock *BB = Block.first;
559  unsigned Score = Block.second;
560  bool InSubRegion =
561  BB == SuggestedEntryPoint || DT.dominates(SuggestedEntryPoint, BB);
562  if (!InSubRegion && Score > NextScore) {
563  NextEntryPoint = BB;
564  NextScore = Score;
565  }
566  if (InSubRegion && BB != SuggestedEntryPoint)
567  SubRegion.push_back(BB);
568  return InSubRegion;
569  });
570  Blocks.erase(RegionStartIt, RegionEndIt);
571 
572  // Update the suggested entry point.
573  SuggestedEntryPoint = NextEntryPoint;
574 
575  return SubRegion;
576  }
577 };
578 } // namespace
579 
580 bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) {
581  bool Changed = false;
582 
583  // The set of cold blocks.
584  SmallPtrSet<BasicBlock *, 4> ColdBlocks;
585 
586  // The worklist of non-intersecting regions left to outline.
587  SmallVector<OutliningRegion, 2> OutliningWorklist;
588 
589  // Set up an RPO traversal. Experimentally, this performs better (outlines
590  // more) than a PO traversal, because we prevent region overlap by keeping
591  // the first region to contain a block.
593 
594  // Calculate domtrees lazily. This reduces compile-time significantly.
595  std::unique_ptr<DominatorTree> DT;
596  std::unique_ptr<PostDominatorTree> PDT;
597 
598  // Calculate BFI lazily (it's only used to query ProfileSummaryInfo). This
599  // reduces compile-time significantly. TODO: When we *do* use BFI, we should
600  // be able to salvage its domtrees instead of recomputing them.
601  BlockFrequencyInfo *BFI = nullptr;
602  if (HasProfileSummary)
603  BFI = GetBFI(F);
604 
605  TargetTransformInfo &TTI = GetTTI(F);
606  OptimizationRemarkEmitter &ORE = (*GetORE)(F);
607  AssumptionCache *AC = LookupAC(F);
608 
609  // Find all cold regions.
610  for (BasicBlock *BB : RPOT) {
611  // This block is already part of some outlining region.
612  if (ColdBlocks.count(BB))
613  continue;
614 
615  bool Cold = (BFI && PSI->isColdBlock(BB, BFI)) ||
616  (EnableStaticAnalysis && unlikelyExecuted(*BB));
617  if (!Cold)
618  continue;
619 
620  LLVM_DEBUG({
621  dbgs() << "Found a cold block:\n";
622  BB->dump();
623  });
624 
625  if (!DT)
626  DT = std::make_unique<DominatorTree>(F);
627  if (!PDT)
628  PDT = std::make_unique<PostDominatorTree>(F);
629 
630  auto Regions = OutliningRegion::create(*BB, *DT, *PDT);
631  for (OutliningRegion &Region : Regions) {
632  if (Region.empty())
633  continue;
634 
635  if (Region.isEntireFunctionCold()) {
636  LLVM_DEBUG(dbgs() << "Entire function is cold\n");
637  return markFunctionCold(F);
638  }
639 
640  // If this outlining region intersects with another, drop the new region.
641  //
642  // TODO: It's theoretically possible to outline more by only keeping the
643  // largest region which contains a block, but the extra bookkeeping to do
644  // this is tricky/expensive.
645  bool RegionsOverlap = any_of(Region.blocks(), [&](const BlockTy &Block) {
646  return !ColdBlocks.insert(Block.first).second;
647  });
648  if (RegionsOverlap)
649  continue;
650 
651  OutliningWorklist.emplace_back(std::move(Region));
652  ++NumColdRegionsFound;
653  }
654  }
655 
656  if (OutliningWorklist.empty())
657  return Changed;
658 
659  // Outline single-entry cold regions, splitting up larger regions as needed.
660  unsigned OutlinedFunctionID = 1;
661  // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
663  do {
664  OutliningRegion Region = OutliningWorklist.pop_back_val();
665  assert(!Region.empty() && "Empty outlining region in worklist");
666  do {
667  BlockSequence SubRegion = Region.takeSingleEntrySubRegion(*DT);
668  LLVM_DEBUG({
669  dbgs() << "Hot/cold splitting attempting to outline these blocks:\n";
670  for (BasicBlock *BB : SubRegion)
671  BB->dump();
672  });
673 
674  Function *Outlined = extractColdRegion(SubRegion, CEAC, *DT, BFI, TTI,
675  ORE, AC, OutlinedFunctionID);
676  if (Outlined) {
677  ++OutlinedFunctionID;
678  Changed = true;
679  }
680  } while (!Region.empty());
681  } while (!OutliningWorklist.empty());
682 
683  return Changed;
684 }
685 
687  bool Changed = false;
688  bool HasProfileSummary = (M.getProfileSummary(/* IsCS */ false) != nullptr);
689  for (Function &F : M) {
690  // Do not touch declarations.
691  if (F.isDeclaration())
692  continue;
693 
694  // Do not modify `optnone` functions.
695  if (F.hasOptNone())
696  continue;
697 
698  // Detect inherently cold functions and mark them as such.
699  if (isFunctionCold(F)) {
700  Changed |= markFunctionCold(F);
701  continue;
702  }
703 
704  if (!shouldOutlineFrom(F)) {
705  LLVM_DEBUG(llvm::dbgs() << "Skipping " << F.getName() << "\n");
706  continue;
707  }
708 
709  LLVM_DEBUG(llvm::dbgs() << "Outlining in " << F.getName() << "\n");
710  Changed |= outlineColdRegions(F, HasProfileSummary);
711  }
712  return Changed;
713 }
714 
715 bool HotColdSplittingLegacyPass::runOnModule(Module &M) {
716  if (skipModule(M))
717  return false;
718  ProfileSummaryInfo *PSI =
719  &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
720  auto GTTI = [this](Function &F) -> TargetTransformInfo & {
721  return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
722  };
723  auto GBFI = [this](Function &F) {
724  return &this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
725  };
726  std::unique_ptr<OptimizationRemarkEmitter> ORE;
728  [&ORE](Function &F) -> OptimizationRemarkEmitter & {
729  ORE.reset(new OptimizationRemarkEmitter(&F));
730  return *ORE;
731  };
732  auto LookupAC = [this](Function &F) -> AssumptionCache * {
733  if (auto *ACT = getAnalysisIfAvailable<AssumptionCacheTracker>())
734  return ACT->lookupAssumptionCache(F);
735  return nullptr;
736  };
737 
738  return HotColdSplitting(PSI, GBFI, GTTI, &GetORE, LookupAC).run(M);
739 }
740 
743  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
744 
745  auto LookupAC = [&FAM](Function &F) -> AssumptionCache * {
747  };
748 
749  auto GBFI = [&FAM](Function &F) {
751  };
752 
754  [&FAM](Function &F) -> TargetTransformInfo & {
755  return FAM.getResult<TargetIRAnalysis>(F);
756  };
757 
758  std::unique_ptr<OptimizationRemarkEmitter> ORE;
760  [&ORE](Function &F) -> OptimizationRemarkEmitter & {
761  ORE.reset(new OptimizationRemarkEmitter(&F));
762  return *ORE;
763  };
764 
766 
767  if (HotColdSplitting(PSI, GBFI, GTTI, &GetORE, LookupAC).run(M))
768  return PreservedAnalyses::none();
769  return PreservedAnalyses::all();
770 }
771 
773 INITIALIZE_PASS_BEGIN(HotColdSplittingLegacyPass, "hotcoldsplit",
774  "Hot Cold Splitting", false, false)
777 INITIALIZE_PASS_END(HotColdSplittingLegacyPass, "hotcoldsplit",
779 
781  return new HotColdSplittingLegacyPass();
782 }
i
i
Definition: README.txt:29
llvm::InstructionCost
Definition: InstructionCost.h:29
llvm::PreservedAnalyses
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:152
AssumptionCache.h
llvm::TargetIRAnalysis
Analysis pass providing the TargetTransformInfo.
Definition: TargetTransformInfo.h:2458
llvm::OptimizationRemarkMissed
Diagnostic information for missed-optimization remarks.
Definition: DiagnosticInfo.h:735
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:17
llvm::SmallVectorImpl::erase
iterator erase(const_iterator CI)
Definition: SmallVector.h:724
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
llvm::ModulePass
ModulePass class - This class is used to implement unstructured interprocedural optimizations and ana...
Definition: Pass.h:248
llvm::AnalysisManager::getResult
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:780
llvm::Function
Definition: Function.h:60
BlockTy
std::pair< BasicBlock *, unsigned > BlockTy
A pair of (basic block, score).
Definition: HotColdSplitting.cpp:397
Pass.h
llvm::BlockFrequencyInfoWrapperPass
Legacy analysis pass which computes BlockFrequencyInfo.
Definition: BlockFrequencyInfo.h:138
llvm::SetVector::size
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:77
llvm::GlobalObject::getSection
StringRef getSection() const
Get the custom section of this global if it has one.
Definition: GlobalObject.h:111
llvm::CallBase::setIsNoInline
void setIsNoInline()
Definition: InstrTypes.h:1849
ColdSectionName
static cl::opt< std::string > ColdSectionName("hotcoldsplit-cold-section-name", cl::init("__llvm_cold"), cl::Hidden, cl::desc("Name for the section containing cold functions " "extracted by hot-cold splitting."))
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1185
Statistic.h
llvm::HotColdSplittingPass::run
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
Definition: HotColdSplitting.cpp:742
llvm::TargetTransformInfo
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Definition: TargetTransformInfo.h:167
llvm::df_end
df_iterator< T > df_end(const T &G)
Definition: DepthFirstIterator.h:224
OptimizationRemarkEmitter.h
llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:166
FAM
FunctionAnalysisManager FAM
Definition: PassBuilderBindings.cpp:59
llvm::TargetTransformInfo::TCK_CodeSize
@ TCK_CodeSize
Instruction code size.
Definition: TargetTransformInfo.h:213
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:139
llvm::PreservedAnalyses::none
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition: PassManager.h:155
llvm::TargetTransformInfo::useColdCCForColdCall
bool useColdCCForColdCall(Function &F) const
Return true if the input function which is cold at all call sites, should use coldcc calling conventi...
Definition: TargetTransformInfo.cpp:493
Module.h
llvm::sys::path::begin
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:226
llvm::ProfileSummaryInfo::isFunctionEntryCold
bool isFunctionEntryCold(const Function *F) const
Returns true if F has cold function entry.
Definition: ProfileSummaryInfo.cpp:221
llvm::HotColdSplitting
Definition: HotColdSplitting.h:33
HotColdSplitting.h
llvm::SmallPtrSet
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:450
llvm::ore::NV
DiagnosticInfoOptimizationBase::Argument NV
Definition: OptimizationRemarkEmitter.h:136
llvm::Value::user_begin
user_iterator user_begin()
Definition: Value.h:397
CodeExtractor.h
llvm::successors
auto successors(MachineBasicBlock *BB)
Definition: MachineSSAContext.h:29
llvm::CallingConv::Cold
@ Cold
Definition: CallingConv.h:48
llvm::HotColdSplitting::run
bool run(Module &M)
Definition: HotColdSplitting.cpp:686
llvm::SmallVectorImpl::pop_back_val
LLVM_NODISCARD T pop_back_val()
Definition: SmallVector.h:654
llvm::succ_empty
bool succ_empty(const Instruction *I)
Definition: CFG.h:255
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
F
#define F(x, y, z)
Definition: MD5.cpp:55
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:55
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
llvm::DominatorTree::dominates
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
llvm::idf_begin
idf_iterator< T > idf_begin(const T &G)
Definition: DepthFirstIterator.h:268
llvm::remove_if
auto remove_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::remove_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1644
Instruction.h
CommandLine.h
SplittingThreshold
static cl::opt< int > SplittingThreshold("hotcoldsplit-threshold", cl::init(2), cl::Hidden, cl::desc("Base penalty for splitting cold code (as a " "multiple of TCC_Basic)"))
llvm::idf_end
idf_iterator< T > idf_end(const T &G)
Definition: DepthFirstIterator.h:273
llvm::BlockFrequencyInfo
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Definition: BlockFrequencyInfo.h:37
llvm::PassRegistry::getPassRegistry
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Definition: PassRegistry.cpp:31
PostDominators.h
llvm::User
Definition: User.h:44
DEBUG_TYPE
#define DEBUG_TYPE
Definition: HotColdSplitting.cpp:61
hotcoldsplit
hotcoldsplit
Definition: HotColdSplitting.cpp:777
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
false
Definition: StackSlotColoring.cpp:141
EnableStaticAnalysis
static cl::opt< bool > EnableStaticAnalysis("hot-cold-static-analysis", cl::init(true), cl::Hidden)
llvm::Instruction
Definition: Instruction.h:42
llvm::BlockFrequencyAnalysis
Analysis pass which computes BlockFrequencyInfo.
Definition: BlockFrequencyInfo.h:112
llvm::CallingConv::ID
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
llvm::createHotColdSplittingPass
ModulePass * createHotColdSplittingPass()
createHotColdSplittingPass - This pass outlines cold blocks into a separate function(s).
Definition: HotColdSplitting.cpp:780
INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:58
CFG.h
llvm::ProfileSummaryInfo
Analysis providing profile information.
Definition: ProfileSummaryInfo.h:39
Splitting
Hot Cold Splitting
Definition: HotColdSplitting.cpp:778
llvm::RegionBase::blocks
block_range blocks()
Returns a range view of the basic blocks in the region.
Definition: RegionInfo.h:622
BasicBlock.h
llvm::cl::opt< bool >
llvm::GlobalObject::hasSection
bool hasSection() const
Check if this global has a custom object file section.
Definition: GlobalObject.h:103
getOutliningBenefit
static InstructionCost getOutliningBenefit(ArrayRef< BasicBlock * > Region, TargetTransformInfo &TTI)
Get the benefit score of outlining Region.
Definition: HotColdSplitting.cpp:228
llvm::initializeHotColdSplittingLegacyPassPass
void initializeHotColdSplittingLegacyPassPass(PassRegistry &)
ProfileSummaryInfo.h
llvm::TargetTransformInfoWrapperPass
Wrapper pass for TargetTransformInfo.
Definition: TargetTransformInfo.h:2514
llvm::AssumptionAnalysis
A function analysis which provides an AssumptionCache.
Definition: AssumptionCache.h:173
INITIALIZE_PASS_DEPENDENCY
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
IPO.h
EnableColdSection
static cl::opt< bool > EnableColdSection("enable-cold-section", cl::init(false), cl::Hidden, cl::desc("Enable placement of extracted cold functions" " into a separate section after hot-cold splitting."))
move
compiles ldr LCPI1_0 ldr ldr mov lsr tst moveq r1 ldr LCPI1_1 and r0 bx lr It would be better to do something like to fold the shift into the conditional move
Definition: README.txt:546
I
#define I(x, y, z)
Definition: MD5.cpp:58
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:432
INITIALIZE_PASS_BEGIN
INITIALIZE_PASS_BEGIN(HotColdSplittingLegacyPass, "hotcoldsplit", "Hot Cold Splitting", false, false) INITIALIZE_PASS_END(HotColdSplittingLegacyPass
llvm::ProfileSummaryInfoWrapperPass
An analysis pass based on legacy pass manager to deliver ProfileSummaryInfo.
Definition: ProfileSummaryInfo.h:193
llvm::is_contained
bool is_contained(R &&Range, const E &Element)
Wrapper function around std::find to detect if an element exists in a container.
Definition: STLExtras.h:1670
llvm::df_begin
df_iterator< T > df_begin(const T &G)
Definition: DepthFirstIterator.h:219
getOutliningPenalty
static int getOutliningPenalty(ArrayRef< BasicBlock * > Region, unsigned NumInputs, unsigned NumOutputs)
Get the penalty score for outlining Region.
Definition: HotColdSplitting.cpp:243
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::OptimizationRemarkEmitter::emit
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Definition: OptimizationRemarkEmitter.cpp:77
function
print Print MemDeps of function
Definition: MemDepPrinter.cpp:82
llvm::WinEH::EncodingType::CE
@ CE
Windows NT (Windows on ARM)
llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
llvm::SmallPtrSetImpl::count
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:383
llvm::ProfileSummaryAnalysis
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Definition: ProfileSummaryInfo.h:211
llvm::AssumptionCacheTracker
An immutable pass that tracks lazily created AssumptionCache objects.
Definition: AssumptionCache.h:202
llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: APInt.h:32
llvm::OptimizationRemarkEmitter
The optimization diagnostic interface.
Definition: OptimizationRemarkEmitter.h:33
llvm::any_of
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1612
llvm::AssumptionCache
A cache of @llvm.assume calls within a function.
Definition: AssumptionCache.h:42
llvm::PostDominatorTree
PostDominatorTree Class - Concrete subclass of DominatorTree that is used to compute the post-dominat...
Definition: PostDominators.h:28
llvm::CodeExtractor
Utility class for extracting code into a new function.
Definition: CodeExtractor.h:85
llvm::pred_empty
bool pred_empty(const BasicBlock *BB)
Definition: CFG.h:118
llvm::InstructionCost::isValid
bool isValid() const
Definition: InstructionCost.h:79
BlockFrequencyInfo.h
llvm::AMDGPUISD::BFI
@ BFI
Definition: AMDGPUISelLowering.h:429
STATISTIC
STATISTIC(NumColdRegionsFound, "Number of cold regions found.")
llvm::empty
constexpr bool empty(const T &RangeOrContainer)
Test whether RangeOrContainer is empty. Similar to C++17 std::empty.
Definition: STLExtras.h:268
llvm::M68kBeads::Term
@ Term
Definition: M68kBaseInfo.h:71
llvm::PreservedAnalyses::all
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:158
llvm::AnalysisManager::getCachedResult
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
Definition: PassManager.h:799
DiagnosticInfo.h
Function.h
PassManager.h
llvm::TargetTransformInfo::getInstructionCost
InstructionCost getInstructionCost(const Instruction *I, enum TargetCostKind kind) const
Query the cost of a specified instruction.
Definition: TargetTransformInfo.h:224
llvm::CodeExtractorAnalysisCache
A cache for the CodeExtractor analysis.
Definition: CodeExtractor.h:46
llvm::ReversePostOrderTraversal
Definition: PostOrderIterator.h:291
llvm::ProfileSummaryInfo::isColdBlock
bool isColdBlock(const BasicBlock *BB, BlockFrequencyInfo *BFI) const
Returns true if BasicBlock BB is considered cold.
Definition: ProfileSummaryInfo.cpp:331
llvm::OptimizationRemark
Diagnostic information for applied optimization remarks.
Definition: DiagnosticInfo.h:690
Instructions.h
MaxParametersForSplit
static cl::opt< int > MaxParametersForSplit("hotcoldsplit-max-params", cl::init(4), cl::Hidden, cl::desc("Maximum number of parameters for a split function"))
PostOrderIterator.h
SmallVector.h
User.h
Dominators.h
llvm::to_string
std::string to_string(const T &Value)
Definition: ScopedPrinter.h:86
llvm::max
Align max(MaybeAlign Lhs, Align Rhs)
Definition: Alignment.h:340
TargetTransformInfo.h
llvm::PHINode
Definition: Instructions.h:2664
blockEndsInUnreachable
static bool blockEndsInUnreachable(const MachineBasicBlock *MBB)
A no successor, non-return block probably ends in unreachable and is cold.
Definition: BranchFolding.cpp:514
llvm::AnalysisManager
A container for analyses that lazily runs them and caches their results.
Definition: InstructionSimplify.h:42
llvm::AnalysisUsage::addUsedIfAvailable
AnalysisUsage & addUsedIfAvailable()
Add the specified Pass class to the set of analyses used by this pass.
Definition: PassAnalysisSupport.h:117
llvm::InnerAnalysisManagerProxy
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
Definition: PassManager.h:937
llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition: Instructions.h:1474
BB
Common register allocation spilling lr str ldr sxth r3 ldr mla r4 can lr mov lr str ldr sxth r3 mla r4 and then merge mul and lr str ldr sxth r3 mla r4 It also increase the likelihood the store may become dead bb27 Successors according to LLVM BB
Definition: README.txt:39
llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition: PassAnalysisSupport.h:75
llvm::TargetTransformInfo::TCC_Basic
@ TCC_Basic
The cost of a typical 'add' instruction.
Definition: TargetTransformInfo.h:262
llvm::cl::desc
Definition: CommandLine.h:405
llvm::Region
Definition: RegionInfo.h:889
raw_ostream.h
llvm::SetVector< Value * >
llvm::PostDominatorTree::dominates
bool dominates(const Instruction *I1, const Instruction *I2) const
Return true if I1 dominates I2.
Definition: PostDominators.cpp:54
Value.h
InitializePasses.h
Debug.h
llvm::CallBase::setCallingConv
void setCallingConv(CallingConv::ID CC)
Definition: InstrTypes.h:1459
llvm::SmallVectorImpl::emplace_back
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:927
llvm::RegionBase::getParent
RegionT * getParent() const
Get the parent of the Region.
Definition: RegionInfo.h:364
llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:365
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:37