LLVM  14.0.0git
HotColdSplitting.cpp
Go to the documentation of this file.
1 //===- HotColdSplitting.cpp -- Outline Cold Regions -------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// The goal of hot/cold splitting is to improve the memory locality of code.
11 /// The splitting pass does this by identifying cold blocks and moving them into
12 /// separate functions.
13 ///
14 /// When the splitting pass finds a cold block (referred to as "the sink"), it
15 /// grows a maximal cold region around that block. The maximal region contains
16 /// all blocks (post-)dominated by the sink [*]. In theory, these blocks are as
17 /// cold as the sink. Once a region is found, it's split out of the original
18 /// function provided it's profitable to do so.
19 ///
20 /// [*] In practice, there is some added complexity because some blocks are not
21 /// safe to extract.
22 ///
23 /// TODO: Use the PM to get domtrees, and preserve BFI/BPI.
24 /// TODO: Reorder outlined functions.
25 ///
26 //===----------------------------------------------------------------------===//
27 
30 #include "llvm/ADT/SmallVector.h"
31 #include "llvm/ADT/Statistic.h"
34 #include "llvm/Analysis/CFG.h"
39 #include "llvm/IR/BasicBlock.h"
40 #include "llvm/IR/CFG.h"
41 #include "llvm/IR/DataLayout.h"
42 #include "llvm/IR/DiagnosticInfo.h"
43 #include "llvm/IR/Dominators.h"
44 #include "llvm/IR/Function.h"
45 #include "llvm/IR/Instruction.h"
46 #include "llvm/IR/Instructions.h"
47 #include "llvm/IR/IntrinsicInst.h"
48 #include "llvm/IR/Metadata.h"
49 #include "llvm/IR/Module.h"
50 #include "llvm/IR/PassManager.h"
51 #include "llvm/IR/Type.h"
52 #include "llvm/IR/Use.h"
53 #include "llvm/IR/User.h"
54 #include "llvm/IR/Value.h"
55 #include "llvm/InitializePasses.h"
56 #include "llvm/Pass.h"
60 #include "llvm/Support/Debug.h"
62 #include "llvm/Transforms/IPO.h"
63 #include "llvm/Transforms/Scalar.h"
69 #include <algorithm>
70 #include <limits>
71 #include <cassert>
72 #include <string>
73 
74 #define DEBUG_TYPE "hotcoldsplit"
75 
76 STATISTIC(NumColdRegionsFound, "Number of cold regions found.");
77 STATISTIC(NumColdRegionsOutlined, "Number of cold regions outlined.");
78 
79 using namespace llvm;
80 
81 static cl::opt<bool> EnableStaticAnalysis("hot-cold-static-analysis",
82  cl::init(true), cl::Hidden);
83 
84 static cl::opt<int>
85  SplittingThreshold("hotcoldsplit-threshold", cl::init(2), cl::Hidden,
86  cl::desc("Base penalty for splitting cold code (as a "
87  "multiple of TCC_Basic)"));
88 
90  "enable-cold-section", cl::init(false), cl::Hidden,
91  cl::desc("Enable placement of extracted cold functions"
92  " into a separate section after hot-cold splitting."));
93 
95  ColdSectionName("hotcoldsplit-cold-section-name", cl::init("__llvm_cold"),
96  cl::Hidden,
97  cl::desc("Name for the section containing cold functions "
98  "extracted by hot-cold splitting."));
99 
101  "hotcoldsplit-max-params", cl::init(4), cl::Hidden,
102  cl::desc("Maximum number of parameters for a split function"));
103 
104 namespace {
105 // Same as blockEndsInUnreachable in CodeGen/BranchFolding.cpp. Do not modify
106 // this function unless you modify the MBB version as well.
107 //
108 /// A no successor, non-return block probably ends in unreachable and is cold.
109 /// Also consider a block that ends in an indirect branch to be a return block,
110 /// since many targets use plain indirect branches to return.
111 bool blockEndsInUnreachable(const BasicBlock &BB) {
112  if (!succ_empty(&BB))
113  return false;
114  if (BB.empty())
115  return true;
116  const Instruction *I = BB.getTerminator();
117  return !(isa<ReturnInst>(I) || isa<IndirectBrInst>(I));
118 }
119 
120 bool unlikelyExecuted(BasicBlock &BB) {
121  // Exception handling blocks are unlikely executed.
122  if (BB.isEHPad() || isa<ResumeInst>(BB.getTerminator()))
123  return true;
124 
125  // The block is cold if it calls/invokes a cold function. However, do not
126  // mark sanitizer traps as cold.
127  for (Instruction &I : BB)
128  if (auto *CB = dyn_cast<CallBase>(&I))
129  if (CB->hasFnAttr(Attribute::Cold) && !CB->getMetadata("nosanitize"))
130  return true;
131 
132  // The block is cold if it has an unreachable terminator, unless it's
133  // preceded by a call to a (possibly warm) noreturn call (e.g. longjmp).
134  if (blockEndsInUnreachable(BB)) {
135  if (auto *CI =
136  dyn_cast_or_null<CallInst>(BB.getTerminator()->getPrevNode()))
137  if (CI->hasFnAttr(Attribute::NoReturn))
138  return false;
139  return true;
140  }
141 
142  return false;
143 }
144 
145 /// Check whether it's safe to outline \p BB.
146 static bool mayExtractBlock(const BasicBlock &BB) {
147  // EH pads are unsafe to outline because doing so breaks EH type tables. It
148  // follows that invoke instructions cannot be extracted, because CodeExtractor
149  // requires unwind destinations to be within the extraction region.
150  //
151  // Resumes that are not reachable from a cleanup landing pad are considered to
152  // be unreachable. It’s not safe to split them out either.
153  if (BB.hasAddressTaken() || BB.isEHPad())
154  return false;
155  auto Term = BB.getTerminator();
156  return !isa<InvokeInst>(Term) && !isa<ResumeInst>(Term);
157 }
158 
159 /// Mark \p F cold. Based on this assumption, also optimize it for minimum size.
160 /// If \p UpdateEntryCount is true (set when this is a new split function and
161 /// module has profile data), set entry count to 0 to ensure treated as cold.
162 /// Return true if the function is changed.
163 static bool markFunctionCold(Function &F, bool UpdateEntryCount = false) {
164  assert(!F.hasOptNone() && "Can't mark this cold");
165  bool Changed = false;
166  if (!F.hasFnAttribute(Attribute::Cold)) {
167  F.addFnAttr(Attribute::Cold);
168  Changed = true;
169  }
170  if (!F.hasFnAttribute(Attribute::MinSize)) {
171  F.addFnAttr(Attribute::MinSize);
172  Changed = true;
173  }
174  if (UpdateEntryCount) {
175  // Set the entry count to 0 to ensure it is placed in the unlikely text
176  // section when function sections are enabled.
177  F.setEntryCount(0);
178  Changed = true;
179  }
180 
181  return Changed;
182 }
183 
184 class HotColdSplittingLegacyPass : public ModulePass {
185 public:
186  static char ID;
187  HotColdSplittingLegacyPass() : ModulePass(ID) {
189  }
190 
191  void getAnalysisUsage(AnalysisUsage &AU) const override {
196  }
197 
198  bool runOnModule(Module &M) override;
199 };
200 
201 } // end anonymous namespace
202 
203 /// Check whether \p F is inherently cold.
204 bool HotColdSplitting::isFunctionCold(const Function &F) const {
205  if (F.hasFnAttribute(Attribute::Cold))
206  return true;
207 
208  if (F.getCallingConv() == CallingConv::Cold)
209  return true;
210 
211  if (PSI->isFunctionEntryCold(&F))
212  return true;
213 
214  return false;
215 }
216 
217 // Returns false if the function should not be considered for hot-cold split
218 // optimization.
219 bool HotColdSplitting::shouldOutlineFrom(const Function &F) const {
220  if (F.hasFnAttribute(Attribute::AlwaysInline))
221  return false;
222 
223  if (F.hasFnAttribute(Attribute::NoInline))
224  return false;
225 
226  // A function marked `noreturn` may contain unreachable terminators: these
227  // should not be considered cold, as the function may be a trampoline.
228  if (F.hasFnAttribute(Attribute::NoReturn))
229  return false;
230 
231  if (F.hasFnAttribute(Attribute::SanitizeAddress) ||
232  F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
233  F.hasFnAttribute(Attribute::SanitizeThread) ||
234  F.hasFnAttribute(Attribute::SanitizeMemory))
235  return false;
236 
237  return true;
238 }
239 
240 /// Get the benefit score of outlining \p Region.
243  // Sum up the code size costs of non-terminator instructions. Tight coupling
244  // with \ref getOutliningPenalty is needed to model the costs of terminators.
245  InstructionCost Benefit = 0;
246  for (BasicBlock *BB : Region)
247  for (Instruction &I : BB->instructionsWithoutDebug())
248  if (&I != BB->getTerminator())
249  Benefit +=
251 
252  return Benefit;
253 }
254 
255 /// Get the penalty score for outlining \p Region.
257  unsigned NumInputs, unsigned NumOutputs) {
258  int Penalty = SplittingThreshold;
259  LLVM_DEBUG(dbgs() << "Applying penalty for splitting: " << Penalty << "\n");
260 
261  // If the splitting threshold is set at or below zero, skip the usual
262  // profitability check.
263  if (SplittingThreshold <= 0)
264  return Penalty;
265 
266  // Find the number of distinct exit blocks for the region. Use a conservative
267  // check to determine whether control returns from the region.
268  bool NoBlocksReturn = true;
269  SmallPtrSet<BasicBlock *, 2> SuccsOutsideRegion;
270  for (BasicBlock *BB : Region) {
271  // If a block has no successors, only assume it does not return if it's
272  // unreachable.
273  if (succ_empty(BB)) {
274  NoBlocksReturn &= isa<UnreachableInst>(BB->getTerminator());
275  continue;
276  }
277 
278  for (BasicBlock *SuccBB : successors(BB)) {
279  if (!is_contained(Region, SuccBB)) {
280  NoBlocksReturn = false;
281  SuccsOutsideRegion.insert(SuccBB);
282  }
283  }
284  }
285 
286  // Count the number of phis in exit blocks with >= 2 incoming values from the
287  // outlining region. These phis are split (\ref severSplitPHINodesOfExits),
288  // and new outputs are created to supply the split phis. CodeExtractor can't
289  // report these new outputs until extraction begins, but it's important to
290  // factor the cost of the outputs into the cost calculation.
291  unsigned NumSplitExitPhis = 0;
292  for (BasicBlock *ExitBB : SuccsOutsideRegion) {
293  for (PHINode &PN : ExitBB->phis()) {
294  // Find all incoming values from the outlining region.
295  int NumIncomingVals = 0;
296  for (unsigned i = 0; i < PN.getNumIncomingValues(); ++i)
297  if (find(Region, PN.getIncomingBlock(i)) != Region.end()) {
298  ++NumIncomingVals;
299  if (NumIncomingVals > 1) {
300  ++NumSplitExitPhis;
301  break;
302  }
303  }
304  }
305  }
306 
307  // Apply a penalty for calling the split function. Factor in the cost of
308  // materializing all of the parameters.
309  int NumOutputsAndSplitPhis = NumOutputs + NumSplitExitPhis;
310  int NumParams = NumInputs + NumOutputsAndSplitPhis;
311  if (NumParams > MaxParametersForSplit) {
312  LLVM_DEBUG(dbgs() << NumInputs << " inputs and " << NumOutputsAndSplitPhis
313  << " outputs exceeds parameter limit ("
314  << MaxParametersForSplit << ")\n");
316  }
317  const int CostForArgMaterialization = 2 * TargetTransformInfo::TCC_Basic;
318  LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumParams << " params\n");
319  Penalty += CostForArgMaterialization * NumParams;
320 
321  // Apply the typical code size cost for an output alloca and its associated
322  // reload in the caller. Also penalize the associated store in the callee.
323  LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumOutputsAndSplitPhis
324  << " outputs/split phis\n");
325  const int CostForRegionOutput = 3 * TargetTransformInfo::TCC_Basic;
326  Penalty += CostForRegionOutput * NumOutputsAndSplitPhis;
327 
328  // Apply a `noreturn` bonus.
329  if (NoBlocksReturn) {
330  LLVM_DEBUG(dbgs() << "Applying bonus for: " << Region.size()
331  << " non-returning terminators\n");
332  Penalty -= Region.size();
333  }
334 
335  // Apply a penalty for having more than one successor outside of the region.
336  // This penalty accounts for the switch needed in the caller.
337  if (SuccsOutsideRegion.size() > 1) {
338  LLVM_DEBUG(dbgs() << "Applying penalty for: " << SuccsOutsideRegion.size()
339  << " non-region successors\n");
340  Penalty += (SuccsOutsideRegion.size() - 1) * TargetTransformInfo::TCC_Basic;
341  }
342 
343  return Penalty;
344 }
345 
346 Function *HotColdSplitting::extractColdRegion(
347  const BlockSequence &Region, const CodeExtractorAnalysisCache &CEAC,
349  OptimizationRemarkEmitter &ORE, AssumptionCache *AC, unsigned Count) {
350  assert(!Region.empty());
351 
352  // TODO: Pass BFI and BPI to update profile information.
353  CodeExtractor CE(Region, &DT, /* AggregateArgs */ false, /* BFI */ nullptr,
354  /* BPI */ nullptr, AC, /* AllowVarArgs */ false,
355  /* AllowAlloca */ false,
356  /* Suffix */ "cold." + std::to_string(Count));
357 
358  // Perform a simple cost/benefit analysis to decide whether or not to permit
359  // splitting.
360  SetVector<Value *> Inputs, Outputs, Sinks;
361  CE.findInputsOutputs(Inputs, Outputs, Sinks);
362  InstructionCost OutliningBenefit = getOutliningBenefit(Region, TTI);
363  int OutliningPenalty =
364  getOutliningPenalty(Region, Inputs.size(), Outputs.size());
365  LLVM_DEBUG(dbgs() << "Split profitability: benefit = " << OutliningBenefit
366  << ", penalty = " << OutliningPenalty << "\n");
367  if (!OutliningBenefit.isValid() || OutliningBenefit <= OutliningPenalty)
368  return nullptr;
369 
370  Function *OrigF = Region[0]->getParent();
371  if (Function *OutF = CE.extractCodeRegion(CEAC)) {
372  User *U = *OutF->user_begin();
373  CallInst *CI = cast<CallInst>(U);
374  NumColdRegionsOutlined++;
375  if (TTI.useColdCCForColdCall(*OutF)) {
376  OutF->setCallingConv(CallingConv::Cold);
378  }
379  CI->setIsNoInline();
380 
381  if (EnableColdSection)
382  OutF->setSection(ColdSectionName);
383  else {
384  if (OrigF->hasSection())
385  OutF->setSection(OrigF->getSection());
386  }
387 
388  markFunctionCold(*OutF, BFI != nullptr);
389 
390  LLVM_DEBUG(llvm::dbgs() << "Outlined Region: " << *OutF);
391  ORE.emit([&]() {
392  return OptimizationRemark(DEBUG_TYPE, "HotColdSplit",
393  &*Region[0]->begin())
394  << ore::NV("Original", OrigF) << " split cold code into "
395  << ore::NV("Split", OutF);
396  });
397  return OutF;
398  }
399 
400  ORE.emit([&]() {
401  return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
402  &*Region[0]->begin())
403  << "Failed to extract region at block "
404  << ore::NV("Block", Region.front());
405  });
406  return nullptr;
407 }
408 
409 /// A pair of (basic block, score).
410 using BlockTy = std::pair<BasicBlock *, unsigned>;
411 
412 namespace {
413 /// A maximal outlining region. This contains all blocks post-dominated by a
414 /// sink block, the sink block itself, and all blocks dominated by the sink.
415 /// If sink-predecessors and sink-successors cannot be extracted in one region,
416 /// the static constructor returns a list of suitable extraction regions.
417 class OutliningRegion {
418  /// A list of (block, score) pairs. A block's score is non-zero iff it's a
419  /// viable sub-region entry point. Blocks with higher scores are better entry
420  /// points (i.e. they are more distant ancestors of the sink block).
421  SmallVector<BlockTy, 0> Blocks = {};
422 
423  /// The suggested entry point into the region. If the region has multiple
424  /// entry points, all blocks within the region may not be reachable from this
425  /// entry point.
426  BasicBlock *SuggestedEntryPoint = nullptr;
427 
428  /// Whether the entire function is cold.
429  bool EntireFunctionCold = false;
430 
431  /// If \p BB is a viable entry point, return \p Score. Return 0 otherwise.
432  static unsigned getEntryPointScore(BasicBlock &BB, unsigned Score) {
433  return mayExtractBlock(BB) ? Score : 0;
434  }
435 
436  /// These scores should be lower than the score for predecessor blocks,
437  /// because regions starting at predecessor blocks are typically larger.
438  static constexpr unsigned ScoreForSuccBlock = 1;
439  static constexpr unsigned ScoreForSinkBlock = 1;
440 
441  OutliningRegion(const OutliningRegion &) = delete;
442  OutliningRegion &operator=(const OutliningRegion &) = delete;
443 
444 public:
445  OutliningRegion() = default;
446  OutliningRegion(OutliningRegion &&) = default;
447  OutliningRegion &operator=(OutliningRegion &&) = default;
448 
449  static std::vector<OutliningRegion> create(BasicBlock &SinkBB,
450  const DominatorTree &DT,
451  const PostDominatorTree &PDT) {
452  std::vector<OutliningRegion> Regions;
453  SmallPtrSet<BasicBlock *, 4> RegionBlocks;
454 
455  Regions.emplace_back();
456  OutliningRegion *ColdRegion = &Regions.back();
457 
458  auto addBlockToRegion = [&](BasicBlock *BB, unsigned Score) {
459  RegionBlocks.insert(BB);
460  ColdRegion->Blocks.emplace_back(BB, Score);
461  };
462 
463  // The ancestor farthest-away from SinkBB, and also post-dominated by it.
464  unsigned SinkScore = getEntryPointScore(SinkBB, ScoreForSinkBlock);
465  ColdRegion->SuggestedEntryPoint = (SinkScore > 0) ? &SinkBB : nullptr;
466  unsigned BestScore = SinkScore;
467 
468  // Visit SinkBB's ancestors using inverse DFS.
469  auto PredIt = ++idf_begin(&SinkBB);
470  auto PredEnd = idf_end(&SinkBB);
471  while (PredIt != PredEnd) {
472  BasicBlock &PredBB = **PredIt;
473  bool SinkPostDom = PDT.dominates(&SinkBB, &PredBB);
474 
475  // If the predecessor is cold and has no predecessors, the entire
476  // function must be cold.
477  if (SinkPostDom && pred_empty(&PredBB)) {
478  ColdRegion->EntireFunctionCold = true;
479  return Regions;
480  }
481 
482  // If SinkBB does not post-dominate a predecessor, do not mark the
483  // predecessor (or any of its predecessors) cold.
484  if (!SinkPostDom || !mayExtractBlock(PredBB)) {
485  PredIt.skipChildren();
486  continue;
487  }
488 
489  // Keep track of the post-dominated ancestor farthest away from the sink.
490  // The path length is always >= 2, ensuring that predecessor blocks are
491  // considered as entry points before the sink block.
492  unsigned PredScore = getEntryPointScore(PredBB, PredIt.getPathLength());
493  if (PredScore > BestScore) {
494  ColdRegion->SuggestedEntryPoint = &PredBB;
495  BestScore = PredScore;
496  }
497 
498  addBlockToRegion(&PredBB, PredScore);
499  ++PredIt;
500  }
501 
502  // If the sink can be added to the cold region, do so. It's considered as
503  // an entry point before any sink-successor blocks.
504  //
505  // Otherwise, split cold sink-successor blocks using a separate region.
506  // This satisfies the requirement that all extraction blocks other than the
507  // first have predecessors within the extraction region.
508  if (mayExtractBlock(SinkBB)) {
509  addBlockToRegion(&SinkBB, SinkScore);
510  if (pred_empty(&SinkBB)) {
511  ColdRegion->EntireFunctionCold = true;
512  return Regions;
513  }
514  } else {
515  Regions.emplace_back();
516  ColdRegion = &Regions.back();
517  BestScore = 0;
518  }
519 
520  // Find all successors of SinkBB dominated by SinkBB using DFS.
521  auto SuccIt = ++df_begin(&SinkBB);
522  auto SuccEnd = df_end(&SinkBB);
523  while (SuccIt != SuccEnd) {
524  BasicBlock &SuccBB = **SuccIt;
525  bool SinkDom = DT.dominates(&SinkBB, &SuccBB);
526 
527  // Don't allow the backwards & forwards DFSes to mark the same block.
528  bool DuplicateBlock = RegionBlocks.count(&SuccBB);
529 
530  // If SinkBB does not dominate a successor, do not mark the successor (or
531  // any of its successors) cold.
532  if (DuplicateBlock || !SinkDom || !mayExtractBlock(SuccBB)) {
533  SuccIt.skipChildren();
534  continue;
535  }
536 
537  unsigned SuccScore = getEntryPointScore(SuccBB, ScoreForSuccBlock);
538  if (SuccScore > BestScore) {
539  ColdRegion->SuggestedEntryPoint = &SuccBB;
540  BestScore = SuccScore;
541  }
542 
543  addBlockToRegion(&SuccBB, SuccScore);
544  ++SuccIt;
545  }
546 
547  return Regions;
548  }
549 
550  /// Whether this region has nothing to extract.
551  bool empty() const { return !SuggestedEntryPoint; }
552 
553  /// The blocks in this region.
554  ArrayRef<std::pair<BasicBlock *, unsigned>> blocks() const { return Blocks; }
555 
556  /// Whether the entire function containing this region is cold.
557  bool isEntireFunctionCold() const { return EntireFunctionCold; }
558 
559  /// Remove a sub-region from this region and return it as a block sequence.
560  BlockSequence takeSingleEntrySubRegion(DominatorTree &DT) {
561  assert(!empty() && !isEntireFunctionCold() && "Nothing to extract");
562 
563  // Remove blocks dominated by the suggested entry point from this region.
564  // During the removal, identify the next best entry point into the region.
565  // Ensure that the first extracted block is the suggested entry point.
566  BlockSequence SubRegion = {SuggestedEntryPoint};
567  BasicBlock *NextEntryPoint = nullptr;
568  unsigned NextScore = 0;
569  auto RegionEndIt = Blocks.end();
570  auto RegionStartIt = remove_if(Blocks, [&](const BlockTy &Block) {
571  BasicBlock *BB = Block.first;
572  unsigned Score = Block.second;
573  bool InSubRegion =
574  BB == SuggestedEntryPoint || DT.dominates(SuggestedEntryPoint, BB);
575  if (!InSubRegion && Score > NextScore) {
576  NextEntryPoint = BB;
577  NextScore = Score;
578  }
579  if (InSubRegion && BB != SuggestedEntryPoint)
580  SubRegion.push_back(BB);
581  return InSubRegion;
582  });
583  Blocks.erase(RegionStartIt, RegionEndIt);
584 
585  // Update the suggested entry point.
586  SuggestedEntryPoint = NextEntryPoint;
587 
588  return SubRegion;
589  }
590 };
591 } // namespace
592 
593 bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) {
594  bool Changed = false;
595 
596  // The set of cold blocks.
597  SmallPtrSet<BasicBlock *, 4> ColdBlocks;
598 
599  // The worklist of non-intersecting regions left to outline.
600  SmallVector<OutliningRegion, 2> OutliningWorklist;
601 
602  // Set up an RPO traversal. Experimentally, this performs better (outlines
603  // more) than a PO traversal, because we prevent region overlap by keeping
604  // the first region to contain a block.
606 
607  // Calculate domtrees lazily. This reduces compile-time significantly.
608  std::unique_ptr<DominatorTree> DT;
609  std::unique_ptr<PostDominatorTree> PDT;
610 
611  // Calculate BFI lazily (it's only used to query ProfileSummaryInfo). This
612  // reduces compile-time significantly. TODO: When we *do* use BFI, we should
613  // be able to salvage its domtrees instead of recomputing them.
614  BlockFrequencyInfo *BFI = nullptr;
615  if (HasProfileSummary)
616  BFI = GetBFI(F);
617 
618  TargetTransformInfo &TTI = GetTTI(F);
619  OptimizationRemarkEmitter &ORE = (*GetORE)(F);
620  AssumptionCache *AC = LookupAC(F);
621 
622  // Find all cold regions.
623  for (BasicBlock *BB : RPOT) {
624  // This block is already part of some outlining region.
625  if (ColdBlocks.count(BB))
626  continue;
627 
628  bool Cold = (BFI && PSI->isColdBlock(BB, BFI)) ||
629  (EnableStaticAnalysis && unlikelyExecuted(*BB));
630  if (!Cold)
631  continue;
632 
633  LLVM_DEBUG({
634  dbgs() << "Found a cold block:\n";
635  BB->dump();
636  });
637 
638  if (!DT)
639  DT = std::make_unique<DominatorTree>(F);
640  if (!PDT)
641  PDT = std::make_unique<PostDominatorTree>(F);
642 
643  auto Regions = OutliningRegion::create(*BB, *DT, *PDT);
644  for (OutliningRegion &Region : Regions) {
645  if (Region.empty())
646  continue;
647 
648  if (Region.isEntireFunctionCold()) {
649  LLVM_DEBUG(dbgs() << "Entire function is cold\n");
650  return markFunctionCold(F);
651  }
652 
653  // If this outlining region intersects with another, drop the new region.
654  //
655  // TODO: It's theoretically possible to outline more by only keeping the
656  // largest region which contains a block, but the extra bookkeeping to do
657  // this is tricky/expensive.
658  bool RegionsOverlap = any_of(Region.blocks(), [&](const BlockTy &Block) {
659  return !ColdBlocks.insert(Block.first).second;
660  });
661  if (RegionsOverlap)
662  continue;
663 
664  OutliningWorklist.emplace_back(std::move(Region));
665  ++NumColdRegionsFound;
666  }
667  }
668 
669  if (OutliningWorklist.empty())
670  return Changed;
671 
672  // Outline single-entry cold regions, splitting up larger regions as needed.
673  unsigned OutlinedFunctionID = 1;
674  // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
676  do {
677  OutliningRegion Region = OutliningWorklist.pop_back_val();
678  assert(!Region.empty() && "Empty outlining region in worklist");
679  do {
680  BlockSequence SubRegion = Region.takeSingleEntrySubRegion(*DT);
681  LLVM_DEBUG({
682  dbgs() << "Hot/cold splitting attempting to outline these blocks:\n";
683  for (BasicBlock *BB : SubRegion)
684  BB->dump();
685  });
686 
687  Function *Outlined = extractColdRegion(SubRegion, CEAC, *DT, BFI, TTI,
688  ORE, AC, OutlinedFunctionID);
689  if (Outlined) {
690  ++OutlinedFunctionID;
691  Changed = true;
692  }
693  } while (!Region.empty());
694  } while (!OutliningWorklist.empty());
695 
696  return Changed;
697 }
698 
700  bool Changed = false;
701  bool HasProfileSummary = (M.getProfileSummary(/* IsCS */ false) != nullptr);
702  for (Function &F : M) {
703  // Do not touch declarations.
704  if (F.isDeclaration())
705  continue;
706 
707  // Do not modify `optnone` functions.
708  if (F.hasOptNone())
709  continue;
710 
711  // Detect inherently cold functions and mark them as such.
712  if (isFunctionCold(F)) {
713  Changed |= markFunctionCold(F);
714  continue;
715  }
716 
717  if (!shouldOutlineFrom(F)) {
718  LLVM_DEBUG(llvm::dbgs() << "Skipping " << F.getName() << "\n");
719  continue;
720  }
721 
722  LLVM_DEBUG(llvm::dbgs() << "Outlining in " << F.getName() << "\n");
723  Changed |= outlineColdRegions(F, HasProfileSummary);
724  }
725  return Changed;
726 }
727 
728 bool HotColdSplittingLegacyPass::runOnModule(Module &M) {
729  if (skipModule(M))
730  return false;
731  ProfileSummaryInfo *PSI =
732  &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
733  auto GTTI = [this](Function &F) -> TargetTransformInfo & {
734  return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
735  };
736  auto GBFI = [this](Function &F) {
737  return &this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
738  };
739  std::unique_ptr<OptimizationRemarkEmitter> ORE;
741  [&ORE](Function &F) -> OptimizationRemarkEmitter & {
742  ORE.reset(new OptimizationRemarkEmitter(&F));
743  return *ORE.get();
744  };
745  auto LookupAC = [this](Function &F) -> AssumptionCache * {
746  if (auto *ACT = getAnalysisIfAvailable<AssumptionCacheTracker>())
747  return ACT->lookupAssumptionCache(F);
748  return nullptr;
749  };
750 
751  return HotColdSplitting(PSI, GBFI, GTTI, &GetORE, LookupAC).run(M);
752 }
753 
756  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
757 
758  auto LookupAC = [&FAM](Function &F) -> AssumptionCache * {
760  };
761 
762  auto GBFI = [&FAM](Function &F) {
764  };
765 
767  [&FAM](Function &F) -> TargetTransformInfo & {
768  return FAM.getResult<TargetIRAnalysis>(F);
769  };
770 
771  std::unique_ptr<OptimizationRemarkEmitter> ORE;
773  [&ORE](Function &F) -> OptimizationRemarkEmitter & {
774  ORE.reset(new OptimizationRemarkEmitter(&F));
775  return *ORE.get();
776  };
777 
779 
780  if (HotColdSplitting(PSI, GBFI, GTTI, &GetORE, LookupAC).run(M))
781  return PreservedAnalyses::none();
782  return PreservedAnalyses::all();
783 }
784 
786 INITIALIZE_PASS_BEGIN(HotColdSplittingLegacyPass, "hotcoldsplit",
787  "Hot Cold Splitting", false, false)
790 INITIALIZE_PASS_END(HotColdSplittingLegacyPass, "hotcoldsplit",
792 
794  return new HotColdSplittingLegacyPass();
795 }
i
i
Definition: README.txt:29
llvm::InstructionCost
Definition: InstructionCost.h:29
llvm::PreservedAnalyses
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:155
llvm::TargetIRAnalysis
Analysis pass providing the TargetTransformInfo.
Definition: TargetTransformInfo.h:2331
llvm::OptimizationRemarkMissed
Diagnostic information for missed-optimization remarks.
Definition: DiagnosticInfo.h:729
llvm
---------------------— PointerInfo ------------------------------------—
Definition: AllocatorList.h:23
llvm::SmallVectorImpl::erase
iterator erase(const_iterator CI)
Definition: SmallVector.h:705
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
ValueMapper.h
llvm::RegionBase::end
iterator end()
Definition: RegionInfo.h:561
Metadata.h
llvm::ModulePass
ModulePass class - This class is used to implement unstructured interprocedural optimizations and ana...
Definition: Pass.h:238
IntrinsicInst.h
llvm::AnalysisManager::getResult
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:779
Scalar.h
llvm::Function
Definition: Function.h:61
BlockTy
std::pair< BasicBlock *, unsigned > BlockTy
A pair of (basic block, score).
Definition: HotColdSplitting.cpp:410
Pass.h
llvm::BlockFrequencyInfoWrapperPass
Legacy analysis pass which computes BlockFrequencyInfo.
Definition: BlockFrequencyInfo.h:138
llvm::SetVector::size
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:77
llvm::GlobalObject::getSection
StringRef getSection() const
Get the custom section of this global if it has one.
Definition: GlobalObject.h:112
llvm::CallBase::setIsNoInline
void setIsNoInline()
Definition: InstrTypes.h:1809
ColdSectionName
static cl::opt< std::string > ColdSectionName("hotcoldsplit-cold-section-name", cl::init("__llvm_cold"), cl::Hidden, cl::desc("Name for the section containing cold functions " "extracted by hot-cold splitting."))
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1168
Statistic.h
llvm::HotColdSplittingPass::run
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
Definition: HotColdSplitting.cpp:755
llvm::TargetTransformInfo
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Definition: TargetTransformInfo.h:168
llvm::df_end
df_iterator< T > df_end(const T &G)
Definition: DepthFirstIterator.h:223
Local.h
OptimizationRemarkEmitter.h
llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:151
FAM
FunctionAnalysisManager FAM
Definition: PassBuilderBindings.cpp:59
llvm::TargetTransformInfo::TCK_CodeSize
@ TCK_CodeSize
Instruction code size.
Definition: TargetTransformInfo.h:214
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:143
llvm::PreservedAnalyses::none
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition: PassManager.h:158
BlockFrequency.h
llvm::TargetTransformInfo::useColdCCForColdCall
bool useColdCCForColdCall(Function &F) const
Return true if the input function which is cold at all call sites, should use coldcc calling conventi...
Definition: TargetTransformInfo.cpp:475
Module.h
llvm::sys::path::begin
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:224
llvm::ProfileSummaryInfo::isFunctionEntryCold
bool isFunctionEntryCold(const Function *F) const
Returns true if F has cold function entry.
Definition: ProfileSummaryInfo.cpp:222
llvm::HotColdSplitting
Definition: HotColdSplitting.h:33
HotColdSplitting.h
llvm::CallingConv::Cold
@ Cold
Definition: CallingConv.h:48
llvm::SmallPtrSet
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:449
llvm::ore::NV
DiagnosticInfoOptimizationBase::Argument NV
Definition: OptimizationRemarkEmitter.h:136
llvm::Value::user_begin
user_iterator user_begin()
Definition: Value.h:398
CodeExtractor.h
llvm::HotColdSplitting::run
bool run(Module &M)
Definition: HotColdSplitting.cpp:699
llvm::SmallVectorImpl::pop_back_val
LLVM_NODISCARD T pop_back_val()
Definition: SmallVector.h:635
llvm::successors
succ_range successors(Instruction *I)
Definition: CFG.h:262
llvm::succ_empty
bool succ_empty(const Instruction *I)
Definition: CFG.h:256
Use.h
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
F
#define F(x, y, z)
Definition: MD5.cpp:56
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:58
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
llvm::DominatorTree::dominates
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:115
llvm::idf_begin
idf_iterator< T > idf_begin(const T &G)
Definition: DepthFirstIterator.h:267
llvm::remove_if
auto remove_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::remove_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1590
Instruction.h
CommandLine.h
SplittingThreshold
static cl::opt< int > SplittingThreshold("hotcoldsplit-threshold", cl::init(2), cl::Hidden, cl::desc("Base penalty for splitting cold code (as a " "multiple of TCC_Basic)"))
llvm::idf_end
idf_iterator< T > idf_end(const T &G)
Definition: DepthFirstIterator.h:272
llvm::BlockFrequencyInfo
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Definition: BlockFrequencyInfo.h:37
llvm::PassRegistry::getPassRegistry
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Definition: PassRegistry.cpp:31
PostDominators.h
llvm::User
Definition: User.h:44
DEBUG_TYPE
#define DEBUG_TYPE
Definition: HotColdSplitting.cpp:74
hotcoldsplit
hotcoldsplit
Definition: HotColdSplitting.cpp:790
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
false
Definition: StackSlotColoring.cpp:142
EnableStaticAnalysis
static cl::opt< bool > EnableStaticAnalysis("hot-cold-static-analysis", cl::init(true), cl::Hidden)
llvm::Instruction
Definition: Instruction.h:45
llvm::BlockFrequencyAnalysis
Analysis pass which computes BlockFrequencyInfo.
Definition: BlockFrequencyInfo.h:112
llvm::M68kBeads::Term
@ Term
Definition: M68kBaseInfo.h:71
Type.h
BranchProbability.h
llvm::createHotColdSplittingPass
ModulePass * createHotColdSplittingPass()
createHotColdSplittingPass - This pass outlines cold blocks into a separate function(s).
Definition: HotColdSplitting.cpp:793
INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:58
CFG.h
llvm::ProfileSummaryInfo
Analysis providing profile information.
Definition: ProfileSummaryInfo.h:39
Splitting
Hot Cold Splitting
Definition: HotColdSplitting.cpp:791
llvm::RegionBase::blocks
block_range blocks()
Returns a range view of the basic blocks in the region.
Definition: RegionInfo.h:622
BasicBlock.h
llvm::cl::opt< bool >
llvm::GlobalObject::hasSection
bool hasSection() const
Check if this global has a custom object file section.
Definition: GlobalObject.h:104
getOutliningBenefit
static InstructionCost getOutliningBenefit(ArrayRef< BasicBlock * > Region, TargetTransformInfo &TTI)
Get the benefit score of outlining Region.
Definition: HotColdSplitting.cpp:241
BranchProbabilityInfo.h
llvm::initializeHotColdSplittingLegacyPassPass
void initializeHotColdSplittingLegacyPassPass(PassRegistry &)
ProfileSummaryInfo.h
llvm::TargetTransformInfoWrapperPass
Wrapper pass for TargetTransformInfo.
Definition: TargetTransformInfo.h:2387
llvm::find
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1571
llvm::AssumptionAnalysis
A function analysis which provides an AssumptionCache.
Definition: AssumptionCache.h:169
INITIALIZE_PASS_DEPENDENCY
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
IPO.h
EnableColdSection
static cl::opt< bool > EnableColdSection("enable-cold-section", cl::init(false), cl::Hidden, cl::desc("Enable placement of extracted cold functions" " into a separate section after hot-cold splitting."))
move
compiles ldr LCPI1_0 ldr ldr mov lsr tst moveq r1 ldr LCPI1_1 and r0 bx lr It would be better to do something like to fold the shift into the conditional move
Definition: README.txt:546
I
#define I(x, y, z)
Definition: MD5.cpp:59
Cloning.h
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
INITIALIZE_PASS_BEGIN
INITIALIZE_PASS_BEGIN(HotColdSplittingLegacyPass, "hotcoldsplit", "Hot Cold Splitting", false, false) INITIALIZE_PASS_END(HotColdSplittingLegacyPass
llvm::ProfileSummaryInfoWrapperPass
An analysis pass based on legacy pass manager to deliver ProfileSummaryInfo.
Definition: ProfileSummaryInfo.h:185
llvm::is_contained
bool is_contained(R &&Range, const E &Element)
Wrapper function around std::find to detect if an element exists in a container.
Definition: STLExtras.h:1616
llvm::df_begin
df_iterator< T > df_begin(const T &G)
Definition: DepthFirstIterator.h:218
getOutliningPenalty
static int getOutliningPenalty(ArrayRef< BasicBlock * > Region, unsigned NumInputs, unsigned NumOutputs)
Get the penalty score for outlining Region.
Definition: HotColdSplitting.cpp:256
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::OptimizationRemarkEmitter::emit
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Definition: OptimizationRemarkEmitter.cpp:77
function
print Print MemDeps of function
Definition: MemDepPrinter.cpp:83
llvm::WinEH::EncodingType::CE
@ CE
Windows NT (Windows on ARM)
llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:67
llvm::SmallPtrSetImpl::count
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:382
CFG.h
llvm::ProfileSummaryAnalysis
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Definition: ProfileSummaryInfo.h:203
llvm::AssumptionCacheTracker
An immutable pass that tracks lazily created AssumptionCache objects.
Definition: AssumptionCache.h:200
llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: APInt.h:32
llvm::OptimizationRemarkEmitter
The optimization diagnostic interface.
Definition: OptimizationRemarkEmitter.h:33
llvm::any_of
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1558
DataLayout.h
llvm::AssumptionCache
A cache of @llvm.assume calls within a function.
Definition: AssumptionCache.h:41
llvm::PostDominatorTree
PostDominatorTree Class - Concrete subclass of DominatorTree that is used to compute the post-dominat...
Definition: PostDominators.h:28
llvm::CodeExtractor
Utility class for extracting code into a new function.
Definition: CodeExtractor.h:85
llvm::pred_empty
bool pred_empty(const BasicBlock *BB)
Definition: CFG.h:119
llvm::InstructionCost::isValid
bool isValid() const
Definition: InstructionCost.h:79
BlockFrequencyInfo.h
llvm::AMDGPUISD::BFI
@ BFI
Definition: AMDGPUISelLowering.h:421
STATISTIC
STATISTIC(NumColdRegionsFound, "Number of cold regions found.")
llvm::empty
constexpr bool empty(const T &RangeOrContainer)
Test whether RangeOrContainer is empty. Similar to C++17 std::empty.
Definition: STLExtras.h:254
llvm::PreservedAnalyses::all
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:161
llvm::AnalysisManager::getCachedResult
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
Definition: PassManager.h:798
DiagnosticInfo.h
Function.h
PassManager.h
llvm::TargetTransformInfo::getInstructionCost
InstructionCost getInstructionCost(const Instruction *I, enum TargetCostKind kind) const
Query the cost of a specified instruction.
Definition: TargetTransformInfo.h:225
llvm::CodeExtractorAnalysisCache
A cache for the CodeExtractor analysis.
Definition: CodeExtractor.h:46
llvm::ReversePostOrderTraversal
Definition: PostOrderIterator.h:290
llvm::ProfileSummaryInfo::isColdBlock
bool isColdBlock(const BasicBlock *BB, BlockFrequencyInfo *BFI) const
Returns true if BasicBlock BB is considered cold.
Definition: ProfileSummaryInfo.cpp:332
llvm::OptimizationRemark
Diagnostic information for applied optimization remarks.
Definition: DiagnosticInfo.h:684
Instructions.h
MaxParametersForSplit
static cl::opt< int > MaxParametersForSplit("hotcoldsplit-max-params", cl::init(4), cl::Hidden, cl::desc("Maximum number of parameters for a split function"))
PostOrderIterator.h
SmallVector.h
User.h
Dominators.h
llvm::to_string
std::string to_string(const T &Value)
Definition: ScopedPrinter.h:63
llvm::max
Align max(MaybeAlign Lhs, Align Rhs)
Definition: Alignment.h:340
TargetTransformInfo.h
llvm::PHINode
Definition: Instructions.h:2633
blockEndsInUnreachable
static bool blockEndsInUnreachable(const MachineBasicBlock *MBB)
A no successor, non-return block probably ends in unreachable and is cold.
Definition: BranchFolding.cpp:515
llvm::AnalysisManager
A container for analyses that lazily runs them and caches their results.
Definition: InstructionSimplify.h:44
llvm::AnalysisUsage::addUsedIfAvailable
AnalysisUsage & addUsedIfAvailable()
Add the specified Pass class to the set of analyses used by this pass.
Definition: PassAnalysisSupport.h:117
llvm::InnerAnalysisManagerProxy
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
Definition: PassManager.h:936
llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition: Instructions.h:1475
BB
Common register allocation spilling lr str ldr sxth r3 ldr mla r4 can lr mov lr str ldr sxth r3 mla r4 and then merge mul and lr str ldr sxth r3 mla r4 It also increase the likelihood the store may become dead bb27 Successors according to LLVM BB
Definition: README.txt:39
llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition: PassAnalysisSupport.h:75
llvm::TargetTransformInfo::TCC_Basic
@ TCC_Basic
The cost of a typical 'add' instruction.
Definition: TargetTransformInfo.h:263
llvm::cl::desc
Definition: CommandLine.h:414
llvm::Region
Definition: RegionInfo.h:889
raw_ostream.h
llvm::SetVector< Value * >
BasicBlockUtils.h
llvm::PostDominatorTree::dominates
bool dominates(const Instruction *I1, const Instruction *I2) const
Return true if I1 dominates I2.
Definition: PostDominators.cpp:54
Value.h
InitializePasses.h
Debug.h
llvm::CallBase::setCallingConv
void setCallingConv(CallingConv::ID CC)
Definition: InstrTypes.h:1453
llvm::SmallVectorImpl::emplace_back
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:908
llvm::RegionBase::getParent
RegionT * getParent() const
Get the parent of the Region.
Definition: RegionInfo.h:364
llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:364
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:37