LLVM  16.0.0git
PartialInlining.cpp
Go to the documentation of this file.
1 //===- PartialInlining.cpp - Inline parts of functions --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass performs partial inlining, typically by inlining an if statement
10 // that surrounds the body of the function.
11 //
12 //===----------------------------------------------------------------------===//
13 
15 #include "llvm/ADT/DenseMap.h"
16 #include "llvm/ADT/DenseSet.h"
17 #include "llvm/ADT/STLExtras.h"
18 #include "llvm/ADT/SmallVector.h"
19 #include "llvm/ADT/Statistic.h"
23 #include "llvm/Analysis/LoopInfo.h"
28 #include "llvm/IR/Attributes.h"
29 #include "llvm/IR/BasicBlock.h"
30 #include "llvm/IR/CFG.h"
31 #include "llvm/IR/DebugLoc.h"
32 #include "llvm/IR/DiagnosticInfo.h"
33 #include "llvm/IR/Dominators.h"
34 #include "llvm/IR/Function.h"
35 #include "llvm/IR/InstrTypes.h"
36 #include "llvm/IR/Instruction.h"
37 #include "llvm/IR/Instructions.h"
38 #include "llvm/IR/IntrinsicInst.h"
39 #include "llvm/IR/Intrinsics.h"
40 #include "llvm/IR/Module.h"
41 #include "llvm/IR/Operator.h"
42 #include "llvm/IR/ProfDataUtils.h"
43 #include "llvm/IR/User.h"
44 #include "llvm/InitializePasses.h"
45 #include "llvm/Pass.h"
48 #include "llvm/Support/Casting.h"
51 #include "llvm/Transforms/IPO.h"
55 #include <algorithm>
56 #include <cassert>
57 #include <cstdint>
58 #include <memory>
59 #include <tuple>
60 #include <vector>
61 
62 using namespace llvm;
63 
64 #define DEBUG_TYPE "partial-inlining"
65 
66 STATISTIC(NumPartialInlined,
67  "Number of callsites functions partially inlined into.");
68 STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with "
69  "cold outlined regions were partially "
70  "inlined into its caller(s).");
71 STATISTIC(NumColdRegionsFound,
72  "Number of cold single entry/exit regions found.");
73 STATISTIC(NumColdRegionsOutlined,
74  "Number of cold single entry/exit regions outlined.");
75 
76 // Command line option to disable partial-inlining. The default is false:
77 static cl::opt<bool>
78  DisablePartialInlining("disable-partial-inlining", cl::init(false),
79  cl::Hidden, cl::desc("Disable partial inlining"));
80 // Command line option to disable multi-region partial-inlining. The default is
81 // false:
83  "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
84  cl::desc("Disable multi-region partial inlining"));
85 
86 // Command line option to force outlining in regions with live exit variables.
87 // The default is false:
88 static cl::opt<bool>
89  ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
90  cl::desc("Force outline regions with live exits"));
91 
92 // Command line option to enable marking outline functions with Cold Calling
93 // Convention. The default is false:
94 static cl::opt<bool>
95  MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
96  cl::desc("Mark outline function calls with ColdCC"));
97 
98 // This is an option used by testing:
99 static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
100 
102  cl::desc("Skip Cost Analysis"));
103 // Used to determine if a cold region is worth outlining based on
104 // its inlining cost compared to the original function. Default is set at 10%.
105 // ie. if the cold region reduces the inlining cost of the original function by
106 // at least 10%.
108  "min-region-size-ratio", cl::init(0.1), cl::Hidden,
109  cl::desc("Minimum ratio comparing relative sizes of each "
110  "outline candidate and original function"));
111 // Used to tune the minimum number of execution counts needed in the predecessor
112 // block to the cold edge. ie. confidence interval.
113 static cl::opt<unsigned>
114  MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
115  cl::desc("Minimum block executions to consider "
116  "its BranchProbabilityInfo valid"));
117 // Used to determine when an edge is considered cold. Default is set to 10%. ie.
118 // if the branch probability is 10% or less, then it is deemed as 'cold'.
120  "cold-branch-ratio", cl::init(0.1), cl::Hidden,
121  cl::desc("Minimum BranchProbability to consider a region cold."));
122 
124  "max-num-inline-blocks", cl::init(5), cl::Hidden,
125  cl::desc("Max number of blocks to be partially inlined"));
126 
127 // Command line option to set the maximum number of partial inlining allowed
128 // for the module. The default value of -1 means no limit.
130  "max-partial-inlining", cl::init(-1), cl::Hidden,
131  cl::desc("Max number of partial inlining. The default is unlimited"));
132 
133 // Used only when PGO or user annotated branch data is absent. It is
134 // the least value that is used to weigh the outline region. If BFI
135 // produces larger value, the BFI value will be used.
136 static cl::opt<int>
137  OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
138  cl::Hidden,
139  cl::desc("Relative frequency of outline region to "
140  "the entry block"));
141 
143  "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
144  cl::desc("A debug option to add additional penalty to the computed one."));
145 
146 namespace {
147 
148 struct FunctionOutliningInfo {
149  FunctionOutliningInfo() = default;
150 
151  // Returns the number of blocks to be inlined including all blocks
152  // in Entries and one return block.
153  unsigned getNumInlinedBlocks() const { return Entries.size() + 1; }
154 
155  // A set of blocks including the function entry that guard
156  // the region to be outlined.
158 
159  // The return block that is not included in the outlined region.
160  BasicBlock *ReturnBlock = nullptr;
161 
162  // The dominating block of the region to be outlined.
163  BasicBlock *NonReturnBlock = nullptr;
164 
165  // The set of blocks in Entries that that are predecessors to ReturnBlock
166  SmallVector<BasicBlock *, 4> ReturnBlockPreds;
167 };
168 
169 struct FunctionOutliningMultiRegionInfo {
170  FunctionOutliningMultiRegionInfo() = default;
171 
172  // Container for outline regions
173  struct OutlineRegionInfo {
174  OutlineRegionInfo(ArrayRef<BasicBlock *> Region,
175  BasicBlock *EntryBlock, BasicBlock *ExitBlock,
176  BasicBlock *ReturnBlock)
177  : Region(Region.begin(), Region.end()), EntryBlock(EntryBlock),
178  ExitBlock(ExitBlock), ReturnBlock(ReturnBlock) {}
180  BasicBlock *EntryBlock;
181  BasicBlock *ExitBlock;
182  BasicBlock *ReturnBlock;
183  };
184 
186 };
187 
188 struct PartialInlinerImpl {
189 
190  PartialInlinerImpl(
192  function_ref<AssumptionCache *(Function &)> LookupAC,
194  function_ref<const TargetLibraryInfo &(Function &)> GTLI,
195  ProfileSummaryInfo &ProfSI,
196  function_ref<BlockFrequencyInfo &(Function &)> GBFI = nullptr)
197  : GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC),
198  GetTTI(GTTI), GetBFI(GBFI), GetTLI(GTLI), PSI(ProfSI) {}
199 
200  bool run(Module &M);
201  // Main part of the transformation that calls helper functions to find
202  // outlining candidates, clone & outline the function, and attempt to
203  // partially inline the resulting function. Returns true if
204  // inlining was successful, false otherwise. Also returns the outline
205  // function (only if we partially inlined early returns) as there is a
206  // possibility to further "peel" early return statements that were left in the
207  // outline function due to code size.
208  std::pair<bool, Function *> unswitchFunction(Function &F);
209 
210  // This class speculatively clones the function to be partial inlined.
211  // At the end of partial inlining, the remaining callsites to the cloned
212  // function that are not partially inlined will be fixed up to reference
213  // the original function, and the cloned function will be erased.
214  struct FunctionCloner {
215  // Two constructors, one for single region outlining, the other for
216  // multi-region outlining.
217  FunctionCloner(Function *F, FunctionOutliningInfo *OI,
219  function_ref<AssumptionCache *(Function &)> LookupAC,
221  FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,
223  function_ref<AssumptionCache *(Function &)> LookupAC,
225 
226  ~FunctionCloner();
227 
228  // Prepare for function outlining: making sure there is only
229  // one incoming edge from the extracted/outlined region to
230  // the return block.
231  void normalizeReturnBlock() const;
232 
233  // Do function outlining for cold regions.
234  bool doMultiRegionFunctionOutlining();
235  // Do function outlining for region after early return block(s).
236  // NOTE: For vararg functions that do the vararg handling in the outlined
237  // function, we temporarily generate IR that does not properly
238  // forward varargs to the outlined function. Calling InlineFunction
239  // will update calls to the outlined functions to properly forward
240  // the varargs.
241  Function *doSingleRegionFunctionOutlining();
242 
243  Function *OrigFunc = nullptr;
244  Function *ClonedFunc = nullptr;
245 
246  typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;
247  // Keep track of Outlined Functions and the basic block they're called from.
248  SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions;
249 
250  // ClonedFunc is inlined in one of its callers after function
251  // outlining.
252  bool IsFunctionInlined = false;
253  // The cost of the region to be outlined.
254  InstructionCost OutlinedRegionCost = 0;
255  // ClonedOI is specific to outlining non-early return blocks.
256  std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
257  // ClonedOMRI is specific to outlining cold regions.
258  std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr;
259  std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
261  function_ref<AssumptionCache *(Function &)> LookupAC;
263  };
264 
265 private:
266  int NumPartialInlining = 0;
267  function_ref<AssumptionCache &(Function &)> GetAssumptionCache;
268  function_ref<AssumptionCache *(Function &)> LookupAssumptionCache;
271  function_ref<const TargetLibraryInfo &(Function &)> GetTLI;
272  ProfileSummaryInfo &PSI;
273 
274  // Return the frequency of the OutlininingBB relative to F's entry point.
275  // The result is no larger than 1 and is represented using BP.
276  // (Note that the outlined region's 'head' block can only have incoming
277  // edges from the guarding entry blocks).
279  getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) const;
280 
281  // Return true if the callee of CB should be partially inlined with
282  // profit.
283  bool shouldPartialInline(CallBase &CB, FunctionCloner &Cloner,
284  BlockFrequency WeightedOutliningRcost,
285  OptimizationRemarkEmitter &ORE) const;
286 
287  // Try to inline DuplicateFunction (cloned from F with call to
288  // the OutlinedFunction into its callers. Return true
289  // if there is any successful inlining.
290  bool tryPartialInline(FunctionCloner &Cloner);
291 
292  // Compute the mapping from use site of DuplicationFunction to the enclosing
293  // BB's profile count.
294  void
295  computeCallsiteToProfCountMap(Function *DuplicateFunction,
296  DenseMap<User *, uint64_t> &SiteCountMap) const;
297 
298  bool isLimitReached() const {
299  return (MaxNumPartialInlining != -1 &&
300  NumPartialInlining >= MaxNumPartialInlining);
301  }
302 
303  static CallBase *getSupportedCallBase(User *U) {
304  if (isa<CallInst>(U) || isa<InvokeInst>(U))
305  return cast<CallBase>(U);
306  llvm_unreachable("All uses must be calls");
307  return nullptr;
308  }
309 
310  static CallBase *getOneCallSiteTo(Function &F) {
311  User *User = *F.user_begin();
312  return getSupportedCallBase(User);
313  }
314 
315  std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function &F) const {
316  CallBase *CB = getOneCallSiteTo(F);
317  DebugLoc DLoc = CB->getDebugLoc();
318  BasicBlock *Block = CB->getParent();
319  return std::make_tuple(DLoc, Block);
320  }
321 
322  // Returns the costs associated with function outlining:
323  // - The first value is the non-weighted runtime cost for making the call
324  // to the outlined function, including the addtional setup cost in the
325  // outlined function itself;
326  // - The second value is the estimated size of the new call sequence in
327  // basic block Cloner.OutliningCallBB;
328  std::tuple<InstructionCost, InstructionCost>
329  computeOutliningCosts(FunctionCloner &Cloner) const;
330 
331  // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
332  // approximate both the size and runtime cost (Note that in the current
333  // inline cost analysis, there is no clear distinction there either).
334  static InstructionCost computeBBInlineCost(BasicBlock *BB,
336 
337  std::unique_ptr<FunctionOutliningInfo>
338  computeOutliningInfo(Function &F) const;
339 
340  std::unique_ptr<FunctionOutliningMultiRegionInfo>
341  computeOutliningColdRegionsInfo(Function &F,
342  OptimizationRemarkEmitter &ORE) const;
343 };
344 
345 struct PartialInlinerLegacyPass : public ModulePass {
346  static char ID; // Pass identification, replacement for typeid
347 
348  PartialInlinerLegacyPass() : ModulePass(ID) {
350  }
351 
352  void getAnalysisUsage(AnalysisUsage &AU) const override {
357  }
358 
359  bool runOnModule(Module &M) override {
360  if (skipModule(M))
361  return false;
362 
363  AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
365  &getAnalysis<TargetTransformInfoWrapperPass>();
366  ProfileSummaryInfo &PSI =
367  getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
368 
369  auto GetAssumptionCache = [&ACT](Function &F) -> AssumptionCache & {
370  return ACT->getAssumptionCache(F);
371  };
372 
373  auto LookupAssumptionCache = [ACT](Function &F) -> AssumptionCache * {
374  return ACT->lookupAssumptionCache(F);
375  };
376 
377  auto GetTTI = [&TTIWP](Function &F) -> TargetTransformInfo & {
378  return TTIWP->getTTI(F);
379  };
380 
381  auto GetTLI = [this](Function &F) -> TargetLibraryInfo & {
382  return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
383  };
384 
385  return PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
386  GetTLI, PSI)
387  .run(M);
388  }
389 };
390 
391 } // end anonymous namespace
392 
393 std::unique_ptr<FunctionOutliningMultiRegionInfo>
394 PartialInlinerImpl::computeOutliningColdRegionsInfo(
395  Function &F, OptimizationRemarkEmitter &ORE) const {
396  BasicBlock *EntryBlock = &F.front();
397 
398  DominatorTree DT(F);
399  LoopInfo LI(DT);
400  BranchProbabilityInfo BPI(F, LI);
401  std::unique_ptr<BlockFrequencyInfo> ScopedBFI;
403  if (!GetBFI) {
404  ScopedBFI.reset(new BlockFrequencyInfo(F, BPI, LI));
405  BFI = ScopedBFI.get();
406  } else
407  BFI = &(GetBFI(F));
408 
409  // Return if we don't have profiling information.
410  if (!PSI.hasInstrumentationProfile())
411  return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
412 
413  std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
414  std::make_unique<FunctionOutliningMultiRegionInfo>();
415 
416  auto IsSingleExit =
417  [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
418  BasicBlock *ExitBlock = nullptr;
419  for (auto *Block : BlockList) {
420  for (BasicBlock *Succ : successors(Block)) {
421  if (!is_contained(BlockList, Succ)) {
422  if (ExitBlock) {
423  ORE.emit([&]() {
424  return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion",
425  &Succ->front())
426  << "Region dominated by "
427  << ore::NV("Block", BlockList.front()->getName())
428  << " has more than one region exit edge.";
429  });
430  return nullptr;
431  }
432 
433  ExitBlock = Block;
434  }
435  }
436  }
437  return ExitBlock;
438  };
439 
440  auto BBProfileCount = [BFI](BasicBlock *BB) {
441  return BFI->getBlockProfileCount(BB).value_or(0);
442  };
443 
444  // Use the same computeBBInlineCost function to compute the cost savings of
445  // the outlining the candidate region.
446  TargetTransformInfo *FTTI = &GetTTI(F);
447  InstructionCost OverallFunctionCost = 0;
448  for (auto &BB : F)
449  OverallFunctionCost += computeBBInlineCost(&BB, FTTI);
450 
451  LLVM_DEBUG(dbgs() << "OverallFunctionCost = " << OverallFunctionCost
452  << "\n";);
453 
454  InstructionCost MinOutlineRegionCost = OverallFunctionCost.map(
455  [&](auto Cost) { return Cost * MinRegionSizeRatio; });
456 
457  BranchProbability MinBranchProbability(
458  static_cast<int>(ColdBranchRatio * MinBlockCounterExecution),
460  bool ColdCandidateFound = false;
461  BasicBlock *CurrEntry = EntryBlock;
462  std::vector<BasicBlock *> DFS;
463  DenseMap<BasicBlock *, bool> VisitedMap;
464  DFS.push_back(CurrEntry);
465  VisitedMap[CurrEntry] = true;
466 
467  // Use Depth First Search on the basic blocks to find CFG edges that are
468  // considered cold.
469  // Cold regions considered must also have its inline cost compared to the
470  // overall inline cost of the original function. The region is outlined only
471  // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or
472  // more.
473  while (!DFS.empty()) {
474  auto *ThisBB = DFS.back();
475  DFS.pop_back();
476  // Only consider regions with predecessor blocks that are considered
477  // not-cold (default: part of the top 99.99% of all block counters)
478  // AND greater than our minimum block execution count (default: 100).
479  if (PSI.isColdBlock(ThisBB, BFI) ||
480  BBProfileCount(ThisBB) < MinBlockCounterExecution)
481  continue;
482  for (auto SI = succ_begin(ThisBB); SI != succ_end(ThisBB); ++SI) {
483  if (VisitedMap[*SI])
484  continue;
485  VisitedMap[*SI] = true;
486  DFS.push_back(*SI);
487  // If branch isn't cold, we skip to the next one.
488  BranchProbability SuccProb = BPI.getEdgeProbability(ThisBB, *SI);
489  if (SuccProb > MinBranchProbability)
490  continue;
491 
492  LLVM_DEBUG(dbgs() << "Found cold edge: " << ThisBB->getName() << "->"
493  << SI->getName()
494  << "\nBranch Probability = " << SuccProb << "\n";);
495 
496  SmallVector<BasicBlock *, 8> DominateVector;
497  DT.getDescendants(*SI, DominateVector);
498  assert(!DominateVector.empty() &&
499  "SI should be reachable and have at least itself as descendant");
500 
501  // We can only outline single entry regions (for now).
502  if (!DominateVector.front()->hasNPredecessors(1)) {
503  LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName()
504  << " doesn't have a single predecessor in the "
505  "dominator tree\n";);
506  continue;
507  }
508 
509  BasicBlock *ExitBlock = nullptr;
510  // We can only outline single exit regions (for now).
511  if (!(ExitBlock = IsSingleExit(DominateVector))) {
512  LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName()
513  << " doesn't have a unique successor\n";);
514  continue;
515  }
516 
517  InstructionCost OutlineRegionCost = 0;
518  for (auto *BB : DominateVector)
519  OutlineRegionCost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
520 
521  LLVM_DEBUG(dbgs() << "OutlineRegionCost = " << OutlineRegionCost
522  << "\n";);
523 
524  if (!SkipCostAnalysis && OutlineRegionCost < MinOutlineRegionCost) {
525  ORE.emit([&]() {
526  return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly",
527  &SI->front())
528  << ore::NV("Callee", &F)
529  << " inline cost-savings smaller than "
530  << ore::NV("Cost", MinOutlineRegionCost);
531  });
532 
533  LLVM_DEBUG(dbgs() << "ABORT: Outline region cost is smaller than "
534  << MinOutlineRegionCost << "\n";);
535  continue;
536  }
537 
538  // For now, ignore blocks that belong to a SISE region that is a
539  // candidate for outlining. In the future, we may want to look
540  // at inner regions because the outer region may have live-exit
541  // variables.
542  for (auto *BB : DominateVector)
543  VisitedMap[BB] = true;
544 
545  // ReturnBlock here means the block after the outline call
546  BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor();
547  FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(
548  DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);
549  OutliningInfo->ORI.push_back(RegInfo);
550  LLVM_DEBUG(dbgs() << "Found Cold Candidate starting at block: "
551  << DominateVector.front()->getName() << "\n";);
552  ColdCandidateFound = true;
553  NumColdRegionsFound++;
554  }
555  }
556 
557  if (ColdCandidateFound)
558  return OutliningInfo;
559 
560  return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
561 }
562 
563 std::unique_ptr<FunctionOutliningInfo>
564 PartialInlinerImpl::computeOutliningInfo(Function &F) const {
565  BasicBlock *EntryBlock = &F.front();
566  BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
567  if (!BR || BR->isUnconditional())
568  return std::unique_ptr<FunctionOutliningInfo>();
569 
570  // Returns true if Succ is BB's successor
571  auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
572  return is_contained(successors(BB), Succ);
573  };
574 
575  auto IsReturnBlock = [](BasicBlock *BB) {
576  Instruction *TI = BB->getTerminator();
577  return isa<ReturnInst>(TI);
578  };
579 
580  auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
581  if (IsReturnBlock(Succ1))
582  return std::make_tuple(Succ1, Succ2);
583  if (IsReturnBlock(Succ2))
584  return std::make_tuple(Succ2, Succ1);
585 
586  return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
587  };
588 
589  // Detect a triangular shape:
590  auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
591  if (IsSuccessor(Succ1, Succ2))
592  return std::make_tuple(Succ1, Succ2);
593  if (IsSuccessor(Succ2, Succ1))
594  return std::make_tuple(Succ2, Succ1);
595 
596  return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
597  };
598 
599  std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
600  std::make_unique<FunctionOutliningInfo>();
601 
602  BasicBlock *CurrEntry = EntryBlock;
603  bool CandidateFound = false;
604  do {
605  // The number of blocks to be inlined has already reached
606  // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
607  // disables partial inlining for the function.
608  if (OutliningInfo->getNumInlinedBlocks() >= MaxNumInlineBlocks)
609  break;
610 
611  if (succ_size(CurrEntry) != 2)
612  break;
613 
614  BasicBlock *Succ1 = *succ_begin(CurrEntry);
615  BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);
616 
617  BasicBlock *ReturnBlock, *NonReturnBlock;
618  std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
619 
620  if (ReturnBlock) {
621  OutliningInfo->Entries.push_back(CurrEntry);
622  OutliningInfo->ReturnBlock = ReturnBlock;
623  OutliningInfo->NonReturnBlock = NonReturnBlock;
624  CandidateFound = true;
625  break;
626  }
627 
628  BasicBlock *CommSucc, *OtherSucc;
629  std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);
630 
631  if (!CommSucc)
632  break;
633 
634  OutliningInfo->Entries.push_back(CurrEntry);
635  CurrEntry = OtherSucc;
636  } while (true);
637 
638  if (!CandidateFound)
639  return std::unique_ptr<FunctionOutliningInfo>();
640 
641  // There should not be any successors (not in the entry set) other than
642  // {ReturnBlock, NonReturnBlock}
643  assert(OutliningInfo->Entries[0] == &F.front() &&
644  "Function Entry must be the first in Entries vector");
645  DenseSet<BasicBlock *> Entries;
646  for (BasicBlock *E : OutliningInfo->Entries)
647  Entries.insert(E);
648 
649  // Returns true of BB has Predecessor which is not
650  // in Entries set.
651  auto HasNonEntryPred = [Entries](BasicBlock *BB) {
652  for (auto *Pred : predecessors(BB)) {
653  if (!Entries.count(Pred))
654  return true;
655  }
656  return false;
657  };
658  auto CheckAndNormalizeCandidate =
659  [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
660  for (BasicBlock *E : OutliningInfo->Entries) {
661  for (auto *Succ : successors(E)) {
662  if (Entries.count(Succ))
663  continue;
664  if (Succ == OutliningInfo->ReturnBlock)
665  OutliningInfo->ReturnBlockPreds.push_back(E);
666  else if (Succ != OutliningInfo->NonReturnBlock)
667  return false;
668  }
669  // There should not be any outside incoming edges either:
670  if (HasNonEntryPred(E))
671  return false;
672  }
673  return true;
674  };
675 
676  if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
677  return std::unique_ptr<FunctionOutliningInfo>();
678 
679  // Now further growing the candidate's inlining region by
680  // peeling off dominating blocks from the outlining region:
681  while (OutliningInfo->getNumInlinedBlocks() < MaxNumInlineBlocks) {
682  BasicBlock *Cand = OutliningInfo->NonReturnBlock;
683  if (succ_size(Cand) != 2)
684  break;
685 
686  if (HasNonEntryPred(Cand))
687  break;
688 
689  BasicBlock *Succ1 = *succ_begin(Cand);
690  BasicBlock *Succ2 = *(succ_begin(Cand) + 1);
691 
692  BasicBlock *ReturnBlock, *NonReturnBlock;
693  std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
694  if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
695  break;
696 
697  if (NonReturnBlock->getSinglePredecessor() != Cand)
698  break;
699 
700  // Now grow and update OutlininigInfo:
701  OutliningInfo->Entries.push_back(Cand);
702  OutliningInfo->NonReturnBlock = NonReturnBlock;
703  OutliningInfo->ReturnBlockPreds.push_back(Cand);
704  Entries.insert(Cand);
705  }
706 
707  return OutliningInfo;
708 }
709 
710 // Check if there is PGO data or user annotated branch data:
711 static bool hasProfileData(const Function &F, const FunctionOutliningInfo &OI) {
712  if (F.hasProfileData())
713  return true;
714  // Now check if any of the entry block has MD_prof data:
715  for (auto *E : OI.Entries) {
716  BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
717  if (!BR || BR->isUnconditional())
718  continue;
719  uint64_t T, F;
720  if (extractBranchWeights(*BR, T, F))
721  return true;
722  }
723  return false;
724 }
725 
726 BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(
727  FunctionCloner &Cloner) const {
728  BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
729  auto EntryFreq =
730  Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
731  auto OutliningCallFreq =
732  Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
733  // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE
734  // we outlined any regions, so we may encounter situations where the
735  // OutliningCallFreq is *slightly* bigger than the EntryFreq.
736  if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency())
737  OutliningCallFreq = EntryFreq;
738 
739  auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(
740  OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());
741 
742  if (hasProfileData(*Cloner.OrigFunc, *Cloner.ClonedOI))
743  return OutlineRegionRelFreq;
744 
745  // When profile data is not available, we need to be conservative in
746  // estimating the overall savings. Static branch prediction can usually
747  // guess the branch direction right (taken/non-taken), but the guessed
748  // branch probability is usually not biased enough. In case when the
749  // outlined region is predicted to be likely, its probability needs
750  // to be made higher (more biased) to not under-estimate the cost of
751  // function outlining. On the other hand, if the outlined region
752  // is predicted to be less likely, the predicted probablity is usually
753  // higher than the actual. For instance, the actual probability of the
754  // less likely target is only 5%, but the guessed probablity can be
755  // 40%. In the latter case, there is no need for further adjustment.
756  // FIXME: add an option for this.
757  if (OutlineRegionRelFreq < BranchProbability(45, 100))
758  return OutlineRegionRelFreq;
759 
760  OutlineRegionRelFreq = std::max(
761  OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));
762 
763  return OutlineRegionRelFreq;
764 }
765 
766 bool PartialInlinerImpl::shouldPartialInline(
767  CallBase &CB, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
768  OptimizationRemarkEmitter &ORE) const {
769  using namespace ore;
770 
772  assert(Callee == Cloner.ClonedFunc);
773 
774  if (SkipCostAnalysis)
775  return isInlineViable(*Callee).isSuccess();
776 
777  Function *Caller = CB.getCaller();
778  auto &CalleeTTI = GetTTI(*Callee);
779  bool RemarksEnabled =
780  Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(
781  DEBUG_TYPE);
782  InlineCost IC =
783  getInlineCost(CB, getInlineParams(), CalleeTTI, GetAssumptionCache,
784  GetTLI, GetBFI, &PSI, RemarksEnabled ? &ORE : nullptr);
785 
786  if (IC.isAlways()) {
787  ORE.emit([&]() {
788  return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", &CB)
789  << NV("Callee", Cloner.OrigFunc)
790  << " should always be fully inlined, not partially";
791  });
792  return false;
793  }
794 
795  if (IC.isNever()) {
796  ORE.emit([&]() {
797  return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", &CB)
798  << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
799  << NV("Caller", Caller)
800  << " because it should never be inlined (cost=never)";
801  });
802  return false;
803  }
804 
805  if (!IC) {
806  ORE.emit([&]() {
807  return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", &CB)
808  << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
809  << NV("Caller", Caller) << " because too costly to inline (cost="
810  << NV("Cost", IC.getCost()) << ", threshold="
811  << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
812  });
813  return false;
814  }
815  const DataLayout &DL = Caller->getParent()->getDataLayout();
816 
817  // The savings of eliminating the call:
818  int NonWeightedSavings = getCallsiteCost(CB, DL);
819  BlockFrequency NormWeightedSavings(NonWeightedSavings);
820 
821  // Weighted saving is smaller than weighted cost, return false
822  if (NormWeightedSavings < WeightedOutliningRcost) {
823  ORE.emit([&]() {
824  return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
825  &CB)
826  << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
827  << NV("Caller", Caller) << " runtime overhead (overhead="
828  << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
829  << ", savings="
830  << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
831  << ")"
832  << " of making the outlined call is too high";
833  });
834 
835  return false;
836  }
837 
838  ORE.emit([&]() {
839  return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", &CB)
840  << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
841  << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
842  << " (threshold="
843  << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
844  });
845  return true;
846 }
847 
848 // TODO: Ideally we should share Inliner's InlineCost Analysis code.
849 // For now use a simplified version. The returned 'InlineCost' will be used
850 // to esimate the size cost as well as runtime cost of the BB.
852 PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB,
855  const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
857  for (Instruction &I : BB->instructionsWithoutDebug()) {
858  // Skip free instructions.
859  switch (I.getOpcode()) {
860  case Instruction::BitCast:
861  case Instruction::PtrToInt:
862  case Instruction::IntToPtr:
863  case Instruction::Alloca:
864  case Instruction::PHI:
865  continue;
866  case Instruction::GetElementPtr:
867  if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices())
868  continue;
869  break;
870  default:
871  break;
872  }
873 
874  if (I.isLifetimeStartOrEnd())
875  continue;
876 
877  if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
878  Intrinsic::ID IID = II->getIntrinsicID();
880  FastMathFlags FMF;
881  for (Value *Val : II->args())
882  Tys.push_back(Val->getType());
883 
884  if (auto *FPMO = dyn_cast<FPMathOperator>(II))
885  FMF = FPMO->getFastMathFlags();
886 
887  IntrinsicCostAttributes ICA(IID, II->getType(), Tys, FMF);
889  continue;
890  }
891 
892  if (CallInst *CI = dyn_cast<CallInst>(&I)) {
893  InlineCost += getCallsiteCost(*CI, DL);
894  continue;
895  }
896 
897  if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
898  InlineCost += getCallsiteCost(*II, DL);
899  continue;
900  }
901 
902  if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
903  InlineCost += (SI->getNumCases() + 1) * InstrCost;
904  continue;
905  }
907  }
908 
909  return InlineCost;
910 }
911 
912 std::tuple<InstructionCost, InstructionCost>
913 PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) const {
914  InstructionCost OutliningFuncCallCost = 0, OutlinedFunctionCost = 0;
915  for (auto FuncBBPair : Cloner.OutlinedFunctions) {
916  Function *OutlinedFunc = FuncBBPair.first;
917  BasicBlock* OutliningCallBB = FuncBBPair.second;
918  // Now compute the cost of the call sequence to the outlined function
919  // 'OutlinedFunction' in BB 'OutliningCallBB':
920  auto *OutlinedFuncTTI = &GetTTI(*OutlinedFunc);
921  OutliningFuncCallCost +=
922  computeBBInlineCost(OutliningCallBB, OutlinedFuncTTI);
923 
924  // Now compute the cost of the extracted/outlined function itself:
925  for (BasicBlock &BB : *OutlinedFunc)
926  OutlinedFunctionCost += computeBBInlineCost(&BB, OutlinedFuncTTI);
927  }
928  assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
929  "Outlined function cost should be no less than the outlined region");
930 
931  // The code extractor introduces a new root and exit stub blocks with
932  // additional unconditional branches. Those branches will be eliminated
933  // later with bb layout. The cost should be adjusted accordingly:
934  OutlinedFunctionCost -=
935  2 * InlineConstants::getInstrCost() * Cloner.OutlinedFunctions.size();
936 
937  InstructionCost OutliningRuntimeOverhead =
938  OutliningFuncCallCost +
939  (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
940  ExtraOutliningPenalty.getValue();
941 
942  return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
943 }
944 
945 // Create the callsite to profile count map which is
946 // used to update the original function's entry count,
947 // after the function is partially inlined into the callsite.
948 void PartialInlinerImpl::computeCallsiteToProfCountMap(
949  Function *DuplicateFunction,
950  DenseMap<User *, uint64_t> &CallSiteToProfCountMap) const {
951  std::vector<User *> Users(DuplicateFunction->user_begin(),
952  DuplicateFunction->user_end());
953  Function *CurrentCaller = nullptr;
954  std::unique_ptr<BlockFrequencyInfo> TempBFI;
955  BlockFrequencyInfo *CurrentCallerBFI = nullptr;
956 
957  auto ComputeCurrBFI = [&,this](Function *Caller) {
958  // For the old pass manager:
959  if (!GetBFI) {
960  DominatorTree DT(*Caller);
961  LoopInfo LI(DT);
962  BranchProbabilityInfo BPI(*Caller, LI);
963  TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
964  CurrentCallerBFI = TempBFI.get();
965  } else {
966  // New pass manager:
967  CurrentCallerBFI = &(GetBFI(*Caller));
968  }
969  };
970 
971  for (User *User : Users) {
972  // Don't bother with BlockAddress used by CallBr for asm goto.
973  if (isa<BlockAddress>(User))
974  continue;
975  CallBase *CB = getSupportedCallBase(User);
976  Function *Caller = CB->getCaller();
977  if (CurrentCaller != Caller) {
978  CurrentCaller = Caller;
979  ComputeCurrBFI(Caller);
980  } else {
981  assert(CurrentCallerBFI && "CallerBFI is not set");
982  }
983  BasicBlock *CallBB = CB->getParent();
984  auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
985  if (Count)
986  CallSiteToProfCountMap[User] = *Count;
987  else
988  CallSiteToProfCountMap[User] = 0;
989  }
990 }
991 
992 PartialInlinerImpl::FunctionCloner::FunctionCloner(
993  Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE,
994  function_ref<AssumptionCache *(Function &)> LookupAC,
996  : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
997  ClonedOI = std::make_unique<FunctionOutliningInfo>();
998 
999  // Clone the function, so that we can hack away on it.
1000  ValueToValueMapTy VMap;
1001  ClonedFunc = CloneFunction(F, VMap);
1002 
1003  ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
1004  ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
1005  for (BasicBlock *BB : OI->Entries)
1006  ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
1007 
1008  for (BasicBlock *E : OI->ReturnBlockPreds) {
1009  BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
1010  ClonedOI->ReturnBlockPreds.push_back(NewE);
1011  }
1012  // Go ahead and update all uses to the duplicate, so that we can just
1013  // use the inliner functionality when we're done hacking.
1014  F->replaceAllUsesWith(ClonedFunc);
1015 }
1016 
1017 PartialInlinerImpl::FunctionCloner::FunctionCloner(
1018  Function *F, FunctionOutliningMultiRegionInfo *OI,
1020  function_ref<AssumptionCache *(Function &)> LookupAC,
1022  : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
1023  ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>();
1024 
1025  // Clone the function, so that we can hack away on it.
1026  ValueToValueMapTy VMap;
1027  ClonedFunc = CloneFunction(F, VMap);
1028 
1029  // Go through all Outline Candidate Regions and update all BasicBlock
1030  // information.
1031  for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1032  OI->ORI) {
1034  for (BasicBlock *BB : RegionInfo.Region)
1035  Region.push_back(cast<BasicBlock>(VMap[BB]));
1036 
1037  BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]);
1038  BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]);
1039  BasicBlock *NewReturnBlock = nullptr;
1040  if (RegionInfo.ReturnBlock)
1041  NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]);
1042  FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(
1043  Region, NewEntryBlock, NewExitBlock, NewReturnBlock);
1044  ClonedOMRI->ORI.push_back(MappedRegionInfo);
1045  }
1046  // Go ahead and update all uses to the duplicate, so that we can just
1047  // use the inliner functionality when we're done hacking.
1048  F->replaceAllUsesWith(ClonedFunc);
1049 }
1050 
1051 void PartialInlinerImpl::FunctionCloner::normalizeReturnBlock() const {
1052  auto GetFirstPHI = [](BasicBlock *BB) {
1053  BasicBlock::iterator I = BB->begin();
1054  PHINode *FirstPhi = nullptr;
1055  while (I != BB->end()) {
1056  PHINode *Phi = dyn_cast<PHINode>(I);
1057  if (!Phi)
1058  break;
1059  if (!FirstPhi) {
1060  FirstPhi = Phi;
1061  break;
1062  }
1063  }
1064  return FirstPhi;
1065  };
1066 
1067  // Shouldn't need to normalize PHIs if we're not outlining non-early return
1068  // blocks.
1069  if (!ClonedOI)
1070  return;
1071 
1072  // Special hackery is needed with PHI nodes that have inputs from more than
1073  // one extracted block. For simplicity, just split the PHIs into a two-level
1074  // sequence of PHIs, some of which will go in the extracted region, and some
1075  // of which will go outside.
1076  BasicBlock *PreReturn = ClonedOI->ReturnBlock;
1077  // only split block when necessary:
1078  PHINode *FirstPhi = GetFirstPHI(PreReturn);
1079  unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();
1080 
1081  if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
1082  return;
1083 
1084  auto IsTrivialPhi = [](PHINode *PN) -> Value * {
1085  if (llvm::all_equal(PN->incoming_values()))
1086  return PN->getIncomingValue(0);
1087  return nullptr;
1088  };
1089 
1090  ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
1091  ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
1092  BasicBlock::iterator I = PreReturn->begin();
1093  Instruction *Ins = &ClonedOI->ReturnBlock->front();
1095  while (I != PreReturn->end()) {
1096  PHINode *OldPhi = dyn_cast<PHINode>(I);
1097  if (!OldPhi)
1098  break;
1099 
1100  PHINode *RetPhi =
1101  PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
1102  OldPhi->replaceAllUsesWith(RetPhi);
1103  Ins = ClonedOI->ReturnBlock->getFirstNonPHI();
1104 
1105  RetPhi->addIncoming(&*I, PreReturn);
1106  for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
1107  RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
1108  OldPhi->removeIncomingValue(E);
1109  }
1110 
1111  // After incoming values splitting, the old phi may become trivial.
1112  // Keeping the trivial phi can introduce definition inside the outline
1113  // region which is live-out, causing necessary overhead (load, store
1114  // arg passing etc).
1115  if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
1116  OldPhi->replaceAllUsesWith(OldPhiVal);
1117  DeadPhis.push_back(OldPhi);
1118  }
1119  ++I;
1120  }
1121  for (auto *DP : DeadPhis)
1122  DP->eraseFromParent();
1123 
1124  for (auto *E : ClonedOI->ReturnBlockPreds)
1125  E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
1126 }
1127 
1128 bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {
1129 
1130  auto ComputeRegionCost =
1132  InstructionCost Cost = 0;
1133  for (BasicBlock* BB : Region)
1134  Cost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
1135  return Cost;
1136  };
1137 
1138  assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline");
1139 
1140  if (ClonedOMRI->ORI.empty())
1141  return false;
1142 
1143  // The CodeExtractor needs a dominator tree.
1144  DominatorTree DT;
1145  DT.recalculate(*ClonedFunc);
1146 
1147  // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1148  LoopInfo LI(DT);
1149  BranchProbabilityInfo BPI(*ClonedFunc, LI);
1150  ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1151 
1152  // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
1153  CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1154 
1155  SetVector<Value *> Inputs, Outputs, Sinks;
1156  for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1157  ClonedOMRI->ORI) {
1158  InstructionCost CurrentOutlinedRegionCost =
1159  ComputeRegionCost(RegionInfo.Region);
1160 
1161  CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false,
1162  ClonedFuncBFI.get(), &BPI,
1163  LookupAC(*RegionInfo.EntryBlock->getParent()),
1164  /* AllowVarargs */ false);
1165 
1166  CE.findInputsOutputs(Inputs, Outputs, Sinks);
1167 
1168  LLVM_DEBUG({
1169  dbgs() << "inputs: " << Inputs.size() << "\n";
1170  dbgs() << "outputs: " << Outputs.size() << "\n";
1171  for (Value *value : Inputs)
1172  dbgs() << "value used in func: " << *value << "\n";
1173  for (Value *output : Outputs)
1174  dbgs() << "instr used in func: " << *output << "\n";
1175  });
1176 
1177  // Do not extract regions that have live exit variables.
1178  if (Outputs.size() > 0 && !ForceLiveExit)
1179  continue;
1180 
1181  if (Function *OutlinedFunc = CE.extractCodeRegion(CEAC)) {
1182  CallBase *OCS = PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc);
1183  BasicBlock *OutliningCallBB = OCS->getParent();
1184  assert(OutliningCallBB->getParent() == ClonedFunc);
1185  OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB));
1186  NumColdRegionsOutlined++;
1187  OutlinedRegionCost += CurrentOutlinedRegionCost;
1188 
1189  if (MarkOutlinedColdCC) {
1190  OutlinedFunc->setCallingConv(CallingConv::Cold);
1192  }
1193  } else
1194  ORE.emit([&]() {
1195  return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1196  &RegionInfo.Region.front()->front())
1197  << "Failed to extract region at block "
1198  << ore::NV("Block", RegionInfo.Region.front());
1199  });
1200  }
1201 
1202  return !OutlinedFunctions.empty();
1203 }
1204 
1205 Function *
1206 PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
1207  // Returns true if the block is to be partial inlined into the caller
1208  // (i.e. not to be extracted to the out of line function)
1209  auto ToBeInlined = [&, this](BasicBlock *BB) {
1210  return BB == ClonedOI->ReturnBlock ||
1211  llvm::is_contained(ClonedOI->Entries, BB);
1212  };
1213 
1214  assert(ClonedOI && "Expecting OutlineInfo for single region outline");
1215  // The CodeExtractor needs a dominator tree.
1216  DominatorTree DT;
1217  DT.recalculate(*ClonedFunc);
1218 
1219  // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1220  LoopInfo LI(DT);
1221  BranchProbabilityInfo BPI(*ClonedFunc, LI);
1222  ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1223 
1224  // Gather up the blocks that we're going to extract.
1225  std::vector<BasicBlock *> ToExtract;
1226  auto *ClonedFuncTTI = &GetTTI(*ClonedFunc);
1227  ToExtract.push_back(ClonedOI->NonReturnBlock);
1228  OutlinedRegionCost += PartialInlinerImpl::computeBBInlineCost(
1229  ClonedOI->NonReturnBlock, ClonedFuncTTI);
1230  for (BasicBlock &BB : *ClonedFunc)
1231  if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
1232  ToExtract.push_back(&BB);
1233  // FIXME: the code extractor may hoist/sink more code
1234  // into the outlined function which may make the outlining
1235  // overhead (the difference of the outlined function cost
1236  // and OutliningRegionCost) look larger.
1237  OutlinedRegionCost += computeBBInlineCost(&BB, ClonedFuncTTI);
1238  }
1239 
1240  // Extract the body of the if.
1241  CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1242  Function *OutlinedFunc =
1243  CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
1244  ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc),
1245  /* AllowVarargs */ true)
1246  .extractCodeRegion(CEAC);
1247 
1248  if (OutlinedFunc) {
1249  BasicBlock *OutliningCallBB =
1250  PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc)->getParent();
1251  assert(OutliningCallBB->getParent() == ClonedFunc);
1252  OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
1253  } else
1254  ORE.emit([&]() {
1255  return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1256  &ToExtract.front()->front())
1257  << "Failed to extract region at block "
1258  << ore::NV("Block", ToExtract.front());
1259  });
1260 
1261  return OutlinedFunc;
1262 }
1263 
1264 PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
1265  // Ditch the duplicate, since we're done with it, and rewrite all remaining
1266  // users (function pointers, etc.) back to the original function.
1267  ClonedFunc->replaceAllUsesWith(OrigFunc);
1268  ClonedFunc->eraseFromParent();
1269  if (!IsFunctionInlined) {
1270  // Remove each function that was speculatively created if there is no
1271  // reference.
1272  for (auto FuncBBPair : OutlinedFunctions) {
1273  Function *Func = FuncBBPair.first;
1274  Func->eraseFromParent();
1275  }
1276  }
1277 }
1278 
1279 std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function &F) {
1280  if (F.hasAddressTaken())
1281  return {false, nullptr};
1282 
1283  // Let inliner handle it
1284  if (F.hasFnAttribute(Attribute::AlwaysInline))
1285  return {false, nullptr};
1286 
1287  if (F.hasFnAttribute(Attribute::NoInline))
1288  return {false, nullptr};
1289 
1290  if (PSI.isFunctionEntryCold(&F))
1291  return {false, nullptr};
1292 
1293  if (F.users().empty())
1294  return {false, nullptr};
1295 
1297 
1298  // Only try to outline cold regions if we have a profile summary, which
1299  // implies we have profiling information.
1300  if (PSI.hasProfileSummary() && F.hasProfileData() &&
1302  std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
1303  computeOutliningColdRegionsInfo(F, ORE);
1304  if (OMRI) {
1305  FunctionCloner Cloner(&F, OMRI.get(), ORE, LookupAssumptionCache, GetTTI);
1306 
1307  LLVM_DEBUG({
1308  dbgs() << "HotCountThreshold = " << PSI.getHotCountThreshold() << "\n";
1309  dbgs() << "ColdCountThreshold = " << PSI.getColdCountThreshold()
1310  << "\n";
1311  });
1312 
1313  bool DidOutline = Cloner.doMultiRegionFunctionOutlining();
1314 
1315  if (DidOutline) {
1316  LLVM_DEBUG({
1317  dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";
1318  Cloner.ClonedFunc->print(dbgs());
1319  dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";
1320  });
1321 
1322  if (tryPartialInline(Cloner))
1323  return {true, nullptr};
1324  }
1325  }
1326  }
1327 
1328  // Fall-thru to regular partial inlining if we:
1329  // i) can't find any cold regions to outline, or
1330  // ii) can't inline the outlined function anywhere.
1331  std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
1332  if (!OI)
1333  return {false, nullptr};
1334 
1335  FunctionCloner Cloner(&F, OI.get(), ORE, LookupAssumptionCache, GetTTI);
1336  Cloner.normalizeReturnBlock();
1337 
1338  Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();
1339 
1340  if (!OutlinedFunction)
1341  return {false, nullptr};
1342 
1343  if (tryPartialInline(Cloner))
1344  return {true, OutlinedFunction};
1345 
1346  return {false, nullptr};
1347 }
1348 
1349 bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
1350  if (Cloner.OutlinedFunctions.empty())
1351  return false;
1352 
1353  auto OutliningCosts = computeOutliningCosts(Cloner);
1354 
1355  InstructionCost SizeCost = std::get<0>(OutliningCosts);
1356  InstructionCost NonWeightedRcost = std::get<1>(OutliningCosts);
1357 
1358  assert(SizeCost.isValid() && NonWeightedRcost.isValid() &&
1359  "Expected valid costs");
1360 
1361  // Only calculate RelativeToEntryFreq when we are doing single region
1362  // outlining.
1363  BranchProbability RelativeToEntryFreq;
1364  if (Cloner.ClonedOI)
1365  RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
1366  else
1367  // RelativeToEntryFreq doesn't make sense when we have more than one
1368  // outlined call because each call will have a different relative frequency
1369  // to the entry block. We can consider using the average, but the
1370  // usefulness of that information is questionable. For now, assume we never
1371  // execute the calls to outlined functions.
1372  RelativeToEntryFreq = BranchProbability(0, 1);
1373 
1374  BlockFrequency WeightedRcost =
1375  BlockFrequency(*NonWeightedRcost.getValue()) * RelativeToEntryFreq;
1376 
1377  // The call sequence(s) to the outlined function(s) are larger than the sum of
1378  // the original outlined region size(s), it does not increase the chances of
1379  // inlining the function with outlining (The inliner uses the size increase to
1380  // model the cost of inlining a callee).
1381  if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
1382  OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
1383  DebugLoc DLoc;
1384  BasicBlock *Block;
1385  std::tie(DLoc, Block) = getOneDebugLoc(*Cloner.ClonedFunc);
1386  OrigFuncORE.emit([&]() {
1387  return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
1388  DLoc, Block)
1389  << ore::NV("Function", Cloner.OrigFunc)
1390  << " not partially inlined into callers (Original Size = "
1391  << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
1392  << ", Size of call sequence to outlined function = "
1393  << ore::NV("NewSize", SizeCost) << ")";
1394  });
1395  return false;
1396  }
1397 
1398  assert(Cloner.OrigFunc->users().empty() &&
1399  "F's users should all be replaced!");
1400 
1401  std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
1402  Cloner.ClonedFunc->user_end());
1403 
1404  DenseMap<User *, uint64_t> CallSiteToProfCountMap;
1405  auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
1406  if (CalleeEntryCount)
1407  computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);
1408 
1409  uint64_t CalleeEntryCountV =
1410  (CalleeEntryCount ? CalleeEntryCount->getCount() : 0);
1411 
1412  bool AnyInline = false;
1413  for (User *User : Users) {
1414  // Don't bother with BlockAddress used by CallBr for asm goto.
1415  if (isa<BlockAddress>(User))
1416  continue;
1417 
1418  CallBase *CB = getSupportedCallBase(User);
1419 
1420  if (isLimitReached())
1421  continue;
1422 
1423  OptimizationRemarkEmitter CallerORE(CB->getCaller());
1424  if (!shouldPartialInline(*CB, Cloner, WeightedRcost, CallerORE))
1425  continue;
1426 
1427  // Construct remark before doing the inlining, as after successful inlining
1428  // the callsite is removed.
1429  OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CB);
1430  OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
1431  << ore::NV("Caller", CB->getCaller());
1432 
1433  InlineFunctionInfo IFI(nullptr, GetAssumptionCache, &PSI);
1434  // We can only forward varargs when we outlined a single region, else we
1435  // bail on vararg functions.
1436  if (!InlineFunction(*CB, IFI, /*MergeAttributes=*/false, nullptr, true,
1437  (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
1438  : nullptr))
1439  .isSuccess())
1440  continue;
1441 
1442  CallerORE.emit(OR);
1443 
1444  // Now update the entry count:
1445  if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
1446  uint64_t CallSiteCount = CallSiteToProfCountMap[User];
1447  CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
1448  }
1449 
1450  AnyInline = true;
1451  NumPartialInlining++;
1452  // Update the stats
1453  if (Cloner.ClonedOI)
1454  NumPartialInlined++;
1455  else
1456  NumColdOutlinePartialInlined++;
1457  }
1458 
1459  if (AnyInline) {
1460  Cloner.IsFunctionInlined = true;
1461  if (CalleeEntryCount)
1462  Cloner.OrigFunc->setEntryCount(Function::ProfileCount(
1463  CalleeEntryCountV, CalleeEntryCount->getType()));
1464  OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
1465  OrigFuncORE.emit([&]() {
1466  return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
1467  << "Partially inlined into at least one caller";
1468  });
1469  }
1470 
1471  return AnyInline;
1472 }
1473 
1476  return false;
1477 
1478  std::vector<Function *> Worklist;
1479  Worklist.reserve(M.size());
1480  for (Function &F : M)
1481  if (!F.use_empty() && !F.isDeclaration())
1482  Worklist.push_back(&F);
1483 
1484  bool Changed = false;
1485  while (!Worklist.empty()) {
1486  Function *CurrFunc = Worklist.back();
1487  Worklist.pop_back();
1488 
1489  if (CurrFunc->use_empty())
1490  continue;
1491 
1492  bool Recursive = false;
1493  for (User *U : CurrFunc->users())
1494  if (Instruction *I = dyn_cast<Instruction>(U))
1495  if (I->getParent()->getParent() == CurrFunc) {
1496  Recursive = true;
1497  break;
1498  }
1499  if (Recursive)
1500  continue;
1501 
1502  std::pair<bool, Function *> Result = unswitchFunction(*CurrFunc);
1503  if (Result.second)
1504  Worklist.push_back(Result.second);
1505  Changed |= Result.first;
1506  }
1507 
1508  return Changed;
1509 }
1510 
1512 
1513 INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
1514  "Partial Inliner", false, false)
1519 INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
1520  "Partial Inliner", false, false)
1521 
1523  return new PartialInlinerLegacyPass();
1524 }
1525 
1527  ModuleAnalysisManager &AM) {
1528  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
1529 
1530  auto GetAssumptionCache = [&FAM](Function &F) -> AssumptionCache & {
1531  return FAM.getResult<AssumptionAnalysis>(F);
1532  };
1533 
1534  auto LookupAssumptionCache = [&FAM](Function &F) -> AssumptionCache * {
1536  };
1537 
1538  auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & {
1540  };
1541 
1542  auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & {
1543  return FAM.getResult<TargetIRAnalysis>(F);
1544  };
1545 
1546  auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
1548  };
1549 
1551 
1552  if (PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
1553  GetTLI, PSI, GetBFI)
1554  .run(M))
1555  return PreservedAnalyses::none();
1556  return PreservedAnalyses::all();
1557 }
llvm::InstructionCost
Definition: InstructionCost.h:30
llvm::PreservedAnalyses
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:152
llvm::InlineCost::isAlways
bool isAlways() const
Definition: InlineCost.h:139
llvm::InlineCost::getCost
int getCost() const
Get the inline cost estimate.
Definition: InlineCost.h:145
llvm::BasicBlock::end
iterator end()
Definition: BasicBlock.h:308
llvm::TargetIRAnalysis
Analysis pass providing the TargetTransformInfo.
Definition: TargetTransformInfo.h:2592
llvm::OptimizationRemarkMissed
Diagnostic information for missed-optimization remarks.
Definition: DiagnosticInfo.h:734
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
ValueMapper.h
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:113
llvm::ISD::OR
@ OR
Definition: ISDOpcodes.h:667
llvm::AArch64PACKey::ID
ID
Definition: AArch64BaseInfo.h:818
llvm::BasicBlock::iterator
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:87
PHI
Rewrite undef for PHI
Definition: AMDGPURewriteUndefForPHI.cpp:101
llvm::ModulePass
ModulePass class - This class is used to implement unstructured interprocedural optimizations and ana...
Definition: Pass.h:248
llvm::InstructionCost::map
auto map(const Function &F) const -> InstructionCost
Definition: InstructionCost.h:246
llvm::BasicBlock::getParent
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:104
IntrinsicInst.h
llvm::AnalysisManager::getResult
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:774
PartialInlining.h
llvm::succ_end
Interval::succ_iterator succ_end(Interval *I)
Definition: Interval.h:102
T
llvm::Function
Definition: Function.h:60
DEBUG_TYPE
#define DEBUG_TYPE
Definition: PartialInlining.cpp:64
Pass.h
llvm::CodeExtractor::extractCodeRegion
Function * extractCodeRegion(const CodeExtractorAnalysisCache &CEAC)
Perform the extraction, returning the new function.
Definition: CodeExtractor.cpp:1629
llvm::SetVector::size
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:77
llvm::TargetTransformInfoWrapperPass::getTTI
TargetTransformInfo & getTTI(const Function &F)
Definition: TargetTransformInfo.cpp:1227
llvm::ProfileSummaryInfo::hasProfileSummary
bool hasProfileSummary() const
Returns true if profile summary is available.
Definition: ProfileSummaryInfo.h:69
llvm::InlineFunction
InlineResult InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, bool MergeAttributes=false, AAResults *CalleeAAR=nullptr, bool InsertLifetime=true, Function *ForwardVarArgsTo=nullptr)
This function inlines the called function into the basic block of the caller.
Definition: InlineFunction.cpp:2048
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1199
Statistic.h
output
Current output
Definition: README.txt:1350
ErrorHandling.h
llvm::TargetTransformInfo
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Definition: TargetTransformInfo.h:173
llvm::TargetTransformInfo::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
Definition: TargetTransformInfo.cpp:973
llvm::PHINode::removeIncomingValue
Value * removeIncomingValue(unsigned Idx, bool DeletePHIIfEmpty=true)
Remove an incoming value.
Definition: Instructions.cpp:116
llvm::CallingConv::Cold
@ Cold
Attempts to make code in the caller as efficient as possible under the assumption that the call is no...
Definition: CallingConv.h:47
OptimizationRemarkEmitter.h
llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:166
FAM
FunctionAnalysisManager FAM
Definition: PassBuilderBindings.cpp:59
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:139
MinBlockCounterExecution
static cl::opt< unsigned > MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden, cl::desc("Minimum block executions to consider " "its BranchProbabilityInfo valid"))
llvm::PreservedAnalyses::none
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition: PassManager.h:155
BlockFrequency.h
DenseMap.h
Module.h
llvm::sys::path::end
const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:235
llvm::sys::path::begin
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:226
llvm::BasicBlock::getSingleSuccessor
const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
Definition: BasicBlock.cpp:315
llvm::ProfileSummaryInfo::isFunctionEntryCold
bool isFunctionEntryCold(const Function *F) const
Returns true if F has cold function entry.
Definition: ProfileSummaryInfo.cpp:223
llvm::isInlineViable
InlineResult isInlineViable(Function &Callee)
Minimal filter to detect invalid constructs for inlining.
Definition: InlineCost.cpp:2989
llvm::DenseMapBase< DenseMap< KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::count
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:145
llvm::BranchProbability::getBranchProbability
static BranchProbability getBranchProbability(uint64_t Numerator, uint64_t Denominator)
Definition: BranchProbability.cpp:53
llvm::ore::NV
DiagnosticInfoOptimizationBase::Argument NV
Definition: OptimizationRemarkEmitter.h:136
Operator.h
llvm::Value::user_begin
user_iterator user_begin()
Definition: Value.h:397
CodeExtractor.h
llvm::successors
auto successors(MachineBasicBlock *BB)
Definition: MachineSSAContext.h:29
llvm::max
Expected< ExpressionValue > max(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:337
llvm::BasicBlock::getSinglePredecessor
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:285
STLExtras.h
llvm::InlineCost::isNever
bool isNever() const
Definition: InlineCost.h:140
llvm::FastMathFlags
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:21
llvm::cl::ReallyHidden
@ ReallyHidden
Definition: CommandLine.h:140
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
F
#define F(x, y, z)
Definition: MD5.cpp:55
ColdBranchRatio
static cl::opt< float > ColdBranchRatio("cold-branch-ratio", cl::init(0.1), cl::Hidden, cl::desc("Minimum BranchProbability to consider a region cold."))
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:55
InstrCost
static cl::opt< int > InstrCost("inline-instr-cost", cl::Hidden, cl::init(5), cl::desc("Cost of a single instruction when inlining"))
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
Instruction.h
CommandLine.h
llvm::BlockFrequencyInfo
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Definition: BlockFrequencyInfo.h:37
llvm::PassRegistry::getPassRegistry
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Definition: PassRegistry.cpp:24
llvm::getInlineCost
InlineCost getInlineCost(CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI, function_ref< AssumptionCache &(Function &)> GetAssumptionCache, function_ref< const TargetLibraryInfo &(Function &)> GetTLI, function_ref< BlockFrequencyInfo &(Function &)> GetBFI=nullptr, ProfileSummaryInfo *PSI=nullptr, OptimizationRemarkEmitter *ORE=nullptr)
Get an InlineCost object representing the cost of inlining this callsite.
Definition: InlineCost.cpp:2822
InlinePriorityMode::Cost
@ Cost
E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
llvm::User
Definition: User.h:44
Intrinsics.h
InstrTypes.h
llvm::CallBase::getCalledFunction
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1396
llvm::BasicBlock::begin
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:306
llvm::BranchProbabilityInfo
Analysis providing branch probability information.
Definition: BranchProbabilityInfo.h:113
SI
@ SI
Definition: SIInstrInfo.cpp:7966
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
llvm::InlineCost
Represents the cost of inlining a function.
Definition: InlineCost.h:90
llvm::ms_demangle::QualifierMangleMode::Result
@ Result
TargetLibraryInfo.h
DenseSet.h
false
Definition: StackSlotColoring.cpp:141
llvm::InlineConstants::getInstrCost
int getInstrCost()
Definition: InlineCost.cpp:183
llvm::succ_size
unsigned succ_size(MachineBasicBlock *BB)
Definition: MachineSSAContext.h:31
llvm::AssumptionCacheTracker::lookupAssumptionCache
AssumptionCache * lookupAssumptionCache(Function &F)
Return the cached assumptions for a function if it has already been scanned.
Definition: AssumptionCache.cpp:305
llvm::PHINode::getIncomingValueForBlock
Value * getIncomingValueForBlock(const BasicBlock *BB) const
Definition: Instructions.h:2882
llvm::pdb::PDB_SymType::Caller
@ Caller
llvm::Instruction
Definition: Instruction.h:42
llvm::STATISTIC
STATISTIC(NumFunctions, "Total number of functions")
llvm::predecessors
auto predecessors(MachineBasicBlock *BB)
Definition: MachineSSAContext.h:30
llvm::InlineResult::isSuccess
bool isSuccess() const
Definition: InlineCost.h:189
DebugLoc.h
llvm::getInlineParams
InlineParams getInlineParams()
Generate the parameters to tune the inline cost analysis based only on the commandline options.
Definition: InlineCost.cpp:3101
llvm::BlockFrequencyAnalysis
Analysis pass which computes BlockFrequencyInfo.
Definition: BlockFrequencyInfo.h:112
llvm::PHINode::getNumIncomingValues
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Definition: Instructions.h:2789
MinRegionSizeRatio
static cl::opt< float > MinRegionSizeRatio("min-region-size-ratio", cl::init(0.1), cl::Hidden, cl::desc("Minimum ratio comparing relative sizes of each " "outline candidate and original function"))
llvm::Value::use_empty
bool use_empty() const
Definition: Value.h:344
llvm::CallingConv::ID
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
llvm::IntrinsicCostAttributes
Definition: TargetTransformInfo.h:120
BranchProbability.h
llvm::CallBase::getCaller
Function * getCaller()
Helper to get the caller (the parent function).
Definition: Instructions.cpp:285
INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:58
CFG.h
llvm::dxil::PointerTypeAnalysis::run
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
Definition: PointerTypeAnalysis.cpp:189
LoopInfo.h
llvm::ProfileSummaryInfo
Analysis providing profile information.
Definition: ProfileSummaryInfo.h:40
llvm::InvokeInst
Invoke instruction.
Definition: Instructions.h:3809
llvm::all_equal
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list.
Definition: STLExtras.h:1985
OutlineRegionFreqPercent
static cl::opt< int > OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75), cl::Hidden, cl::desc("Relative frequency of outline region to " "the entry block"))
llvm::function_ref
An efficient, type-erasing, non-owning reference to a callable.
Definition: STLFunctionalExtras.h:36
llvm::Value::user_end
user_iterator user_end()
Definition: Value.h:405
llvm::DenseSet
Implements a dense probed hash-table based set.
Definition: DenseSet.h:268
BasicBlock.h
llvm::cl::opt< bool >
llvm::BlockFrequency
Definition: BlockFrequency.h:23
BranchProbabilityInfo.h
llvm::TargetLibraryInfoWrapperPass
Definition: TargetLibraryInfo.h:475
uint64_t
ProfileSummaryInfo.h
llvm::TargetTransformInfoWrapperPass
Wrapper pass for TargetTransformInfo.
Definition: TargetTransformInfo.h:2648
llvm::AssumptionAnalysis
A function analysis which provides an AssumptionCache.
Definition: AssumptionCache.h:173
INITIALIZE_PASS_DEPENDENCY
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
IPO.h
llvm::PHINode::addIncoming
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Definition: Instructions.h:2847
ProfDataUtils.h
llvm::DenseMap
Definition: DenseMap.h:714
MaxNumPartialInlining
static cl::opt< int > MaxNumPartialInlining("max-partial-inlining", cl::init(-1), cl::Hidden, cl::desc("Max number of partial inlining. The default is unlimited"))
I
#define I(x, y, z)
Definition: MD5.cpp:58
Cloning.h
llvm::succ_begin
Interval::succ_iterator succ_begin(Interval *I)
succ_begin/succ_end - define methods so that Intervals may be used just like BasicBlocks can with the...
Definition: Interval.h:99
llvm::BlockFrequency::getFrequency
uint64_t getFrequency() const
Returns the frequency as a fixpoint number scaled by the entry frequency.
Definition: BlockFrequency.h:34
MarkOutlinedColdCC
static cl::opt< bool > MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden, cl::desc("Mark outline function calls with ColdCC"))
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:446
llvm::ProfileSummaryInfo::getColdCountThreshold
uint64_t getColdCountThreshold() const
Returns ColdCountThreshold if set.
Definition: ProfileSummaryInfo.h:177
llvm::ProfileSummaryInfoWrapperPass
An analysis pass based on legacy pass manager to deliver ProfileSummaryInfo.
Definition: ProfileSummaryInfo.h:194
llvm::InlineCost::getCostDelta
int getCostDelta() const
Get the cost delta from the threshold for inlining.
Definition: InlineCost.h:175
llvm::is_contained
bool is_contained(R &&Range, const E &Element)
Wrapper function around std::find to detect if an element exists in a container.
Definition: STLExtras.h:1868
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::OptimizationRemarkEmitter::emit
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Definition: OptimizationRemarkEmitter.cpp:77
InlineCost.h
llvm::ProfileSummaryInfo::hasInstrumentationProfile
bool hasInstrumentationProfile() const
Returns true if module M has instrumentation profile.
Definition: ProfileSummaryInfo.h:78
llvm::WinEH::EncodingType::CE
@ CE
Windows NT (Windows on ARM)
llvm::InstructionCost::getValue
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
Definition: InstructionCost.h:88
inliner
partial inliner
Definition: PartialInlining.cpp:1519
llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
OtherSucc
static MachineBasicBlock * OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ)
Definition: ARMISelLowering.cpp:11119
DisablePartialInlining
static cl::opt< bool > DisablePartialInlining("disable-partial-inlining", cl::init(false), cl::Hidden, cl::desc("Disable partial inlining"))
hasProfileData
static bool hasProfileData(const Function &F, const FunctionOutliningInfo &OI)
Definition: PartialInlining.cpp:711
llvm::ProfileSummaryAnalysis
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Definition: ProfileSummaryInfo.h:212
llvm::initializePartialInlinerLegacyPassPass
void initializePartialInlinerLegacyPassPass(PassRegistry &)
MaxNumInlineBlocks
static cl::opt< unsigned > MaxNumInlineBlocks("max-num-inline-blocks", cl::init(5), cl::Hidden, cl::desc("Max number of blocks to be partially inlined"))
llvm::DominatorTreeBase::recalculate
void recalculate(ParentType &Func)
recalculate - compute a dominator tree for the given function
Definition: GenericDomTree.h:778
llvm::AssumptionCacheTracker
An immutable pass that tracks lazily created AssumptionCache objects.
Definition: AssumptionCache.h:202
llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: APInt.h:32
llvm::LoopInfo
Definition: LoopInfo.h:1108
llvm::OptimizationRemarkEmitter
The optimization diagnostic interface.
Definition: OptimizationRemarkEmitter.h:33
llvm::min
Expected< ExpressionValue > min(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:357
llvm::Function::setCallingConv
void setCallingConv(CallingConv::ID CC)
Definition: Function.h:242
llvm::AssumptionCache
A cache of @llvm.assume calls within a function.
Definition: AssumptionCache.h:42
llvm::TargetTransformInfo::TCK_SizeAndLatency
@ TCK_SizeAndLatency
The weighted sum of size and latency.
Definition: TargetTransformInfo.h:222
DP
So we should use XX3Form_Rcr to implement intrinsic Convert DP outs ins xscvdpsp No builtin are required Round &Convert QP DP(dword[1] is set to zero) No builtin are required Round to Quad Precision because you need to assign rounding mode in instruction Provide builtin(set f128:$vT,(int_ppc_vsx_xsrqpi f128:$vB))(set f128 yields< n x< ty > >< result > yields< ty >< result > No builtin are required Load Store load store see def memrix16 in PPCInstrInfo td Load Store Vector load store outs ins lxsdx set load store with conversion from to DP
Definition: README_P9.txt:520
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:143
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
llvm::Value::replaceAllUsesWith
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:532
llvm::BranchProbability
Definition: BranchProbability.h:30
DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition: AArch64SLSHardening.cpp:76
llvm::CodeExtractor
Utility class for extracting code into a new function.
Definition: CodeExtractor.h:85
llvm::RegionInfo
Definition: RegionInfo.h:900
llvm::InstructionCost::isValid
bool isValid() const
Definition: InstructionCost.h:80
llvm::CloneFunction
Function * CloneFunction(Function *F, ValueToValueMapTy &VMap, ClonedCodeInfo *CodeInfo=nullptr)
Return a copy of the specified function and add it to that function's module.
Definition: CloneFunction.cpp:297
BlockFrequencyInfo.h
llvm::AMDGPUISD::BFI
@ BFI
Definition: AMDGPUISelLowering.h:432
llvm::OptimizationRemarkAnalysis
Diagnostic information for optimization analysis remarks.
Definition: DiagnosticInfo.h:780
llvm::ifs::IFSSymbolType::Func
@ Func
llvm::ValueMap< const Value *, WeakTrackingVH >
Callee
amdgpu Simplify well known AMD library false FunctionCallee Callee
Definition: AMDGPULibCalls.cpp:187
ForceLiveExit
static cl::opt< bool > ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden, cl::desc("Force outline regions with live exits"))
Attributes.h
llvm::AssumptionCacheTracker::getAssumptionCache
AssumptionCache & getAssumptionCache(Function &F)
Get the cached assumptions for a function.
Definition: AssumptionCache.cpp:285
llvm::PreservedAnalyses::all
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:158
llvm::ISD::BR
@ BR
Control flow instructions. These all have token chains.
Definition: ISDOpcodes.h:982
llvm::Function::back
const BasicBlock & back() const
Definition: Function.h:716
llvm::AnalysisManager::getCachedResult
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
Definition: PassManager.h:793
Casting.h
DiagnosticInfo.h
Function.h
SkipCostAnalysis
static cl::opt< bool > SkipCostAnalysis("skip-partial-inlining-cost-analysis", cl::ReallyHidden, cl::desc("Skip Cost Analysis"))
llvm::TargetLibraryInfo
Provides information about what library functions are available for the current target.
Definition: TargetLibraryInfo.h:226
llvm::getCallsiteCost
int getCallsiteCost(const CallBase &Call, const DataLayout &DL)
Return the cost associated with a callsite, including parameter passing and the call/return instructi...
Definition: InlineCost.cpp:2788
llvm::InlineFunctionInfo
This class captures the data input to the InlineFunction call, and records the auxiliary results prod...
Definition: Cloning.h:203
Inliner
partial Partial Inliner
Definition: PartialInlining.cpp:1520
llvm::CodeExtractorAnalysisCache
A cache for the CodeExtractor analysis.
Definition: CodeExtractor.h:46
llvm::ProfileSummaryInfo::isColdBlock
bool isColdBlock(const BasicBlock *BB, BlockFrequencyInfo *BFI) const
Returns true if BasicBlock BB is considered cold.
Definition: ProfileSummaryInfo.cpp:333
DisableMultiRegionPartialInline
static cl::opt< bool > DisableMultiRegionPartialInline("disable-mr-partial-inlining", cl::init(false), cl::Hidden, cl::desc("Disable multi-region partial inlining"))
llvm::OptimizationRemark
Diagnostic information for applied optimization remarks.
Definition: DiagnosticInfo.h:689
Instructions.h
SmallVector.h
llvm::Instruction::getDebugLoc
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:359
User.h
llvm::BlockFrequencyInfo::getBlockProfileCount
Optional< uint64_t > getBlockProfileCount(const BasicBlock *BB, bool AllowSynthetic=false) const
Returns the estimated profile count of BB.
Definition: BlockFrequencyInfo.cpp:208
Dominators.h
llvm::Instruction::getParent
const BasicBlock * getParent() const
Definition: Instruction.h:91
Users
iv Induction Variable Users
Definition: IVUsers.cpp:48
TargetTransformInfo.h
llvm::DenseMapBase::reserve
void reserve(size_type NumEntries)
Grow the densemap so that it can contain at least NumEntries items before resizing again.
Definition: DenseMap.h:103
llvm::PHINode
Definition: Instructions.h:2697
llvm::BasicBlock::getTerminator
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:119
llvm::ProfileSummaryInfo::getHotCountThreshold
uint64_t getHotCountThreshold() const
Returns HotCountThreshold if set.
Definition: ProfileSummaryInfo.h:173
llvm::MipsISD::Ins
@ Ins
Definition: MipsISelLowering.h:160
llvm::SmallVectorImpl< BasicBlock * >
llvm::CallBase
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1174
llvm::AnalysisManager
A container for analyses that lazily runs them and caches their results.
Definition: InstructionSimplify.h:42
llvm::createPartialInliningPass
ModulePass * createPartialInliningPass()
createPartialInliningPass - This pass inlines parts of functions.
Definition: PartialInlining.cpp:1522
llvm::InnerAnalysisManagerProxy
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
Definition: PassManager.h:931
llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition: Instructions.h:1474
llvm::extractBranchWeights
bool extractBranchWeights(const MDNode *ProfileData, SmallVectorImpl< uint32_t > &Weights)
Extract branch weights from MD_prof metadata.
Definition: ProfDataUtils.cpp:104
BB
Common register allocation spilling lr str ldr sxth r3 ldr mla r4 can lr mov lr str ldr sxth r3 mla r4 and then merge mul and lr str ldr sxth r3 mla r4 It also increase the likelihood the store may become dead bb27 Successors according to LLVM BB
Definition: README.txt:39
INITIALIZE_PASS_BEGIN
INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner", "Partial Inliner", false, false) INITIALIZE_PASS_END(PartialInlinerLegacyPass
ExtraOutliningPenalty
static cl::opt< unsigned > ExtraOutliningPenalty("partial-inlining-extra-penalty", cl::init(0), cl::Hidden, cl::desc("A debug option to add additional penalty to the computed one."))
llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition: PassAnalysisSupport.h:75
llvm::SwitchInst
Multiway switch.
Definition: Instructions.h:3276
llvm::DebugLoc
A debug info location.
Definition: DebugLoc.h:33
llvm::Function::ProfileCount
Class to represent profile counts.
Definition: Function.h:253
llvm::cl::desc
Definition: CommandLine.h:412
llvm::Region
Definition: RegionInfo.h:889
llvm::BranchInst
Conditional or Unconditional Branch instruction.
Definition: Instructions.h:3132
llvm::SetVector< Value * >
llvm::pdb::PDB_SymType::Block
@ Block
InitializePasses.h
llvm::Value
LLVM Value Representation.
Definition: Value.h:74
llvm::TargetLibraryAnalysis
Analysis pass providing the TargetLibraryInfo.
Definition: TargetLibraryInfo.h:450
llvm::Value::users
iterator_range< user_iterator > users()
Definition: Value.h:421
llvm::CallBase::setCallingConv
void setCallingConv(CallingConv::ID CC)
Definition: InstrTypes.h:1459
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:39