LLVM  10.0.0svn
SampleProfile.cpp
Go to the documentation of this file.
1 //===- SampleProfile.cpp - Incorporate sample profiles into the IR --------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the SampleProfileLoader transformation. This pass
10 // reads a profile file generated by a sampling profiler (e.g. Linux Perf -
11 // http://perf.wiki.kernel.org/) and generates IR metadata to reflect the
12 // profile information in the given profile.
13 //
14 // This pass generates branch weight annotations on the IR:
15 //
16 // - prof: Represents branch weights. This annotation is added to branches
17 // to indicate the weights of each edge coming out of the branch.
18 // The weight of each edge is the weight of the target block for
19 // that edge. The weight of a block B is computed as the maximum
20 // number of samples found in B.
21 //
22 //===----------------------------------------------------------------------===//
23 
25 #include "llvm/ADT/ArrayRef.h"
26 #include "llvm/ADT/DenseMap.h"
27 #include "llvm/ADT/DenseSet.h"
28 #include "llvm/ADT/None.h"
29 #include "llvm/ADT/SmallPtrSet.h"
30 #include "llvm/ADT/SmallSet.h"
31 #include "llvm/ADT/SmallVector.h"
32 #include "llvm/ADT/StringMap.h"
33 #include "llvm/ADT/StringRef.h"
34 #include "llvm/ADT/Twine.h"
37 #include "llvm/Analysis/LoopInfo.h"
42 #include "llvm/IR/BasicBlock.h"
43 #include "llvm/IR/CFG.h"
44 #include "llvm/IR/CallSite.h"
46 #include "llvm/IR/DebugLoc.h"
47 #include "llvm/IR/DiagnosticInfo.h"
48 #include "llvm/IR/Dominators.h"
49 #include "llvm/IR/Function.h"
50 #include "llvm/IR/GlobalValue.h"
51 #include "llvm/IR/InstrTypes.h"
52 #include "llvm/IR/Instruction.h"
53 #include "llvm/IR/Instructions.h"
54 #include "llvm/IR/IntrinsicInst.h"
55 #include "llvm/IR/LLVMContext.h"
56 #include "llvm/IR/MDBuilder.h"
57 #include "llvm/IR/Module.h"
58 #include "llvm/IR/PassManager.h"
60 #include "llvm/Pass.h"
64 #include "llvm/Support/Casting.h"
66 #include "llvm/Support/Debug.h"
68 #include "llvm/Support/ErrorOr.h"
71 #include "llvm/Transforms/IPO.h"
76 #include <algorithm>
77 #include <cassert>
78 #include <cstdint>
79 #include <functional>
80 #include <limits>
81 #include <map>
82 #include <memory>
83 #include <queue>
84 #include <string>
85 #include <system_error>
86 #include <utility>
87 #include <vector>
88 
89 using namespace llvm;
90 using namespace sampleprof;
92 #define DEBUG_TYPE "sample-profile"
93 
94 // Command line option to specify the file to read samples from. This is
95 // mainly used for debugging.
97  "sample-profile-file", cl::init(""), cl::value_desc("filename"),
98  cl::desc("Profile file loaded by -sample-profile"), cl::Hidden);
99 
100 // The named file contains a set of transformations that may have been applied
101 // to the symbol names between the program from which the sample data was
102 // collected and the current program's symbols.
104  "sample-profile-remapping-file", cl::init(""), cl::value_desc("filename"),
105  cl::desc("Profile remapping file loaded by -sample-profile"), cl::Hidden);
106 
108  "sample-profile-max-propagate-iterations", cl::init(100),
109  cl::desc("Maximum number of iterations to go through when propagating "
110  "sample block/edge weights through the CFG."));
111 
113  "sample-profile-check-record-coverage", cl::init(0), cl::value_desc("N"),
114  cl::desc("Emit a warning if less than N% of records in the input profile "
115  "are matched to the IR."));
116 
118  "sample-profile-check-sample-coverage", cl::init(0), cl::value_desc("N"),
119  cl::desc("Emit a warning if less than N% of samples in the input profile "
120  "are matched to the IR."));
121 
123  "no-warn-sample-unused", cl::init(false), cl::Hidden,
124  cl::desc("Use this option to turn off/on warnings about function with "
125  "samples but without debug information to use those samples. "));
126 
128  "profile-sample-accurate", cl::Hidden, cl::init(false),
129  cl::desc("If the sample profile is accurate, we will mark all un-sampled "
130  "callsite and function as having 0 samples. Otherwise, treat "
131  "un-sampled callsites and functions conservatively as unknown. "));
132 
134  "profile-accurate-for-symsinlist", cl::Hidden, cl::ZeroOrMore,
135  cl::init(true),
136  cl::desc("For symbols in profile symbol list, regard their profiles to "
137  "be accurate. It may be overriden by profile-sample-accurate. "));
138 
139 namespace {
140 
141 using BlockWeightMap = DenseMap<const BasicBlock *, uint64_t>;
142 using EquivalenceClassMap = DenseMap<const BasicBlock *, const BasicBlock *>;
143 using Edge = std::pair<const BasicBlock *, const BasicBlock *>;
144 using EdgeWeightMap = DenseMap<Edge, uint64_t>;
145 using BlockEdgeMap =
147 
148 class SampleProfileLoader;
149 
150 class SampleCoverageTracker {
151 public:
152  SampleCoverageTracker(SampleProfileLoader &SPL) : SPLoader(SPL){};
153 
154  bool markSamplesUsed(const FunctionSamples *FS, uint32_t LineOffset,
155  uint32_t Discriminator, uint64_t Samples);
156  unsigned computeCoverage(unsigned Used, unsigned Total) const;
157  unsigned countUsedRecords(const FunctionSamples *FS,
158  ProfileSummaryInfo *PSI) const;
159  unsigned countBodyRecords(const FunctionSamples *FS,
160  ProfileSummaryInfo *PSI) const;
161  uint64_t getTotalUsedSamples() const { return TotalUsedSamples; }
162  uint64_t countBodySamples(const FunctionSamples *FS,
163  ProfileSummaryInfo *PSI) const;
164 
165  void clear() {
166  SampleCoverage.clear();
167  TotalUsedSamples = 0;
168  }
169 
170 private:
171  using BodySampleCoverageMap = std::map<LineLocation, unsigned>;
172  using FunctionSamplesCoverageMap =
174 
175  /// Coverage map for sampling records.
176  ///
177  /// This map keeps a record of sampling records that have been matched to
178  /// an IR instruction. This is used to detect some form of staleness in
179  /// profiles (see flag -sample-profile-check-coverage).
180  ///
181  /// Each entry in the map corresponds to a FunctionSamples instance. This is
182  /// another map that counts how many times the sample record at the
183  /// given location has been used.
184  FunctionSamplesCoverageMap SampleCoverage;
185 
186  /// Number of samples used from the profile.
187  ///
188  /// When a sampling record is used for the first time, the samples from
189  /// that record are added to this accumulator. Coverage is later computed
190  /// based on the total number of samples available in this function and
191  /// its callsites.
192  ///
193  /// Note that this accumulator tracks samples used from a single function
194  /// and all the inlined callsites. Strictly, we should have a map of counters
195  /// keyed by FunctionSamples pointers, but these stats are cleared after
196  /// every function, so we just need to keep a single counter.
197  uint64_t TotalUsedSamples = 0;
198 
199  SampleProfileLoader &SPLoader;
200 };
201 
202 class GUIDToFuncNameMapper {
203 public:
204  GUIDToFuncNameMapper(Module &M, SampleProfileReader &Reader,
205  DenseMap<uint64_t, StringRef> &GUIDToFuncNameMap)
206  : CurrentReader(Reader), CurrentModule(M),
207  CurrentGUIDToFuncNameMap(GUIDToFuncNameMap) {
208  if (CurrentReader.getFormat() != SPF_Compact_Binary)
209  return;
210 
211  for (const auto &F : CurrentModule) {
212  StringRef OrigName = F.getName();
213  CurrentGUIDToFuncNameMap.insert(
214  {Function::getGUID(OrigName), OrigName});
215 
216  // Local to global var promotion used by optimization like thinlto
217  // will rename the var and add suffix like ".llvm.xxx" to the
218  // original local name. In sample profile, the suffixes of function
219  // names are all stripped. Since it is possible that the mapper is
220  // built in post-thin-link phase and var promotion has been done,
221  // we need to add the substring of function name without the suffix
222  // into the GUIDToFuncNameMap.
224  if (CanonName != OrigName)
225  CurrentGUIDToFuncNameMap.insert(
226  {Function::getGUID(CanonName), CanonName});
227  }
228 
229  // Update GUIDToFuncNameMap for each function including inlinees.
230  SetGUIDToFuncNameMapForAll(&CurrentGUIDToFuncNameMap);
231  }
232 
233  ~GUIDToFuncNameMapper() {
234  if (CurrentReader.getFormat() != SPF_Compact_Binary)
235  return;
236 
237  CurrentGUIDToFuncNameMap.clear();
238 
239  // Reset GUIDToFuncNameMap for of each function as they're no
240  // longer valid at this point.
241  SetGUIDToFuncNameMapForAll(nullptr);
242  }
243 
244 private:
245  void SetGUIDToFuncNameMapForAll(DenseMap<uint64_t, StringRef> *Map) {
246  std::queue<FunctionSamples *> FSToUpdate;
247  for (auto &IFS : CurrentReader.getProfiles()) {
248  FSToUpdate.push(&IFS.second);
249  }
250 
251  while (!FSToUpdate.empty()) {
252  FunctionSamples *FS = FSToUpdate.front();
253  FSToUpdate.pop();
254  FS->GUIDToFuncNameMap = Map;
255  for (const auto &ICS : FS->getCallsiteSamples()) {
256  const FunctionSamplesMap &FSMap = ICS.second;
257  for (auto &IFS : FSMap) {
258  FunctionSamples &FS = const_cast<FunctionSamples &>(IFS.second);
259  FSToUpdate.push(&FS);
260  }
261  }
262  }
263  }
264 
265  SampleProfileReader &CurrentReader;
266  Module &CurrentModule;
267  DenseMap<uint64_t, StringRef> &CurrentGUIDToFuncNameMap;
268 };
269 
270 /// Sample profile pass.
271 ///
272 /// This pass reads profile data from the file specified by
273 /// -sample-profile-file and annotates every affected function with the
274 /// profile information found in that file.
275 class SampleProfileLoader {
276 public:
277  SampleProfileLoader(
278  StringRef Name, StringRef RemapName, bool IsThinLTOPreLink,
279  std::function<AssumptionCache &(Function &)> GetAssumptionCache,
280  std::function<TargetTransformInfo &(Function &)> GetTargetTransformInfo)
281  : GetAC(std::move(GetAssumptionCache)),
282  GetTTI(std::move(GetTargetTransformInfo)), CoverageTracker(*this),
283  Filename(Name), RemappingFilename(RemapName),
284  IsThinLTOPreLink(IsThinLTOPreLink) {}
285 
286  bool doInitialization(Module &M);
287  bool runOnModule(Module &M, ModuleAnalysisManager *AM,
288  ProfileSummaryInfo *_PSI);
289 
290  void dump() { Reader->dump(); }
291 
292 protected:
293  friend class SampleCoverageTracker;
294 
296  unsigned getFunctionLoc(Function &F);
297  bool emitAnnotations(Function &F);
298  ErrorOr<uint64_t> getInstWeight(const Instruction &I);
299  ErrorOr<uint64_t> getBlockWeight(const BasicBlock *BB);
300  const FunctionSamples *findCalleeFunctionSamples(const Instruction &I) const;
301  std::vector<const FunctionSamples *>
302  findIndirectCallFunctionSamples(const Instruction &I, uint64_t &Sum) const;
303  mutable DenseMap<const DILocation *, const FunctionSamples *> DILocation2SampleMap;
304  const FunctionSamples *findFunctionSamples(const Instruction &I) const;
305  bool inlineCallInstruction(Instruction *I);
306  bool inlineHotFunctions(Function &F,
307  DenseSet<GlobalValue::GUID> &InlinedGUIDs);
308  void printEdgeWeight(raw_ostream &OS, Edge E);
309  void printBlockWeight(raw_ostream &OS, const BasicBlock *BB) const;
310  void printBlockEquivalence(raw_ostream &OS, const BasicBlock *BB);
311  bool computeBlockWeights(Function &F);
312  void findEquivalenceClasses(Function &F);
313  template <bool IsPostDom>
314  void findEquivalencesFor(BasicBlock *BB1, ArrayRef<BasicBlock *> Descendants,
316 
317  void propagateWeights(Function &F);
318  uint64_t visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge);
319  void buildEdges(Function &F);
320  bool propagateThroughEdges(Function &F, bool UpdateBlockCount);
321  void computeDominanceAndLoopInfo(Function &F);
322  void clearFunctionData();
323  bool callsiteIsHot(const FunctionSamples *CallsiteFS,
324  ProfileSummaryInfo *PSI);
325 
326  /// Map basic blocks to their computed weights.
327  ///
328  /// The weight of a basic block is defined to be the maximum
329  /// of all the instruction weights in that block.
330  BlockWeightMap BlockWeights;
331 
332  /// Map edges to their computed weights.
333  ///
334  /// Edge weights are computed by propagating basic block weights in
335  /// SampleProfile::propagateWeights.
336  EdgeWeightMap EdgeWeights;
337 
338  /// Set of visited blocks during propagation.
340 
341  /// Set of visited edges during propagation.
342  SmallSet<Edge, 32> VisitedEdges;
343 
344  /// Equivalence classes for block weights.
345  ///
346  /// Two blocks BB1 and BB2 are in the same equivalence class if they
347  /// dominate and post-dominate each other, and they are in the same loop
348  /// nest. When this happens, the two blocks are guaranteed to execute
349  /// the same number of times.
350  EquivalenceClassMap EquivalenceClass;
351 
352  /// Map from function name to Function *. Used to find the function from
353  /// the function name. If the function name contains suffix, additional
354  /// entry is added to map from the stripped name to the function if there
355  /// is one-to-one mapping.
357 
358  /// Dominance, post-dominance and loop information.
359  std::unique_ptr<DominatorTree> DT;
360  std::unique_ptr<PostDominatorTree> PDT;
361  std::unique_ptr<LoopInfo> LI;
362 
363  std::function<AssumptionCache &(Function &)> GetAC;
364  std::function<TargetTransformInfo &(Function &)> GetTTI;
365 
366  /// Predecessors for each basic block in the CFG.
367  BlockEdgeMap Predecessors;
368 
369  /// Successors for each basic block in the CFG.
370  BlockEdgeMap Successors;
371 
372  SampleCoverageTracker CoverageTracker;
373 
374  /// Profile reader object.
375  std::unique_ptr<SampleProfileReader> Reader;
376 
377  /// Samples collected for the body of this function.
378  FunctionSamples *Samples = nullptr;
379 
380  /// Name of the profile file to load.
381  std::string Filename;
382 
383  /// Name of the profile remapping file to load.
384  std::string RemappingFilename;
385 
386  /// Flag indicating whether the profile input loaded successfully.
387  bool ProfileIsValid = false;
388 
389  /// Flag indicating if the pass is invoked in ThinLTO compile phase.
390  ///
391  /// In this phase, in annotation, we should not promote indirect calls.
392  /// Instead, we will mark GUIDs that needs to be annotated to the function.
393  bool IsThinLTOPreLink;
394 
395  /// Profile Summary Info computed from sample profile.
396  ProfileSummaryInfo *PSI = nullptr;
397 
398  /// Profle Symbol list tells whether a function name appears in the binary
399  /// used to generate the current profile.
400  std::unique_ptr<ProfileSymbolList> PSL;
401 
402  /// Total number of samples collected in this profile.
403  ///
404  /// This is the sum of all the samples collected in all the functions executed
405  /// at runtime.
406  uint64_t TotalCollectedSamples = 0;
407 
408  /// Optimization Remark Emitter used to emit diagnostic remarks.
409  OptimizationRemarkEmitter *ORE = nullptr;
410 
411  // Information recorded when we declined to inline a call site
412  // because we have determined it is too cold is accumulated for
413  // each callee function. Initially this is just the entry count.
414  struct NotInlinedProfileInfo {
415  uint64_t entryCount;
416  };
418 
419  // GUIDToFuncNameMap saves the mapping from GUID to the symbol name, for
420  // all the function symbols defined or declared in current module.
421  DenseMap<uint64_t, StringRef> GUIDToFuncNameMap;
422 
423  // All the Names used in FunctionSamples including outline function
424  // names, inline instance names and call target names.
425  StringSet<> NamesInProfile;
426 
427  // For symbol in profile symbol list, whether to regard their profiles
428  // to be accurate. It is mainly decided by existance of profile symbol
429  // list and -profile-accurate-for-symsinlist flag, but it can be
430  // overriden by -profile-sample-accurate or profile-sample-accurate
431  // attribute.
432  bool ProfAccForSymsInList;
433 };
434 
435 class SampleProfileLoaderLegacyPass : public ModulePass {
436 public:
437  // Class identification, replacement for typeinfo
438  static char ID;
439 
440  SampleProfileLoaderLegacyPass(StringRef Name = SampleProfileFile,
441  bool IsThinLTOPreLink = false)
442  : ModulePass(ID),
443  SampleLoader(Name, SampleProfileRemappingFile, IsThinLTOPreLink,
444  [&](Function &F) -> AssumptionCache & {
445  return ACT->getAssumptionCache(F);
446  },
447  [&](Function &F) -> TargetTransformInfo & {
448  return TTIWP->getTTI(F);
449  }) {
452  }
453 
454  void dump() { SampleLoader.dump(); }
455 
456  bool doInitialization(Module &M) override {
457  return SampleLoader.doInitialization(M);
458  }
459 
460  StringRef getPassName() const override { return "Sample profile pass"; }
461  bool runOnModule(Module &M) override;
462 
463  void getAnalysisUsage(AnalysisUsage &AU) const override {
467  }
468 
469 private:
470  SampleProfileLoader SampleLoader;
471  AssumptionCacheTracker *ACT = nullptr;
472  TargetTransformInfoWrapperPass *TTIWP = nullptr;
473 };
474 
475 } // end anonymous namespace
476 
477 /// Return true if the given callsite is hot wrt to hot cutoff threshold.
478 ///
479 /// Functions that were inlined in the original binary will be represented
480 /// in the inline stack in the sample profile. If the profile shows that
481 /// the original inline decision was "good" (i.e., the callsite is executed
482 /// frequently), then we will recreate the inline decision and apply the
483 /// profile from the inlined callsite.
484 ///
485 /// To decide whether an inlined callsite is hot, we compare the callsite
486 /// sample count with the hot cutoff computed by ProfileSummaryInfo, it is
487 /// regarded as hot if the count is above the cutoff value.
488 ///
489 /// When ProfileAccurateForSymsInList is enabled and profile symbol list
490 /// is present, functions in the profile symbol list but without profile will
491 /// be regarded as cold and much less inlining will happen in CGSCC inlining
492 /// pass, so we tend to lower the hot criteria here to allow more early
493 /// inlining to happen for warm callsites and it is helpful for performance.
494 bool SampleProfileLoader::callsiteIsHot(const FunctionSamples *CallsiteFS,
495  ProfileSummaryInfo *PSI) {
496  if (!CallsiteFS)
497  return false; // The callsite was not inlined in the original binary.
498 
499  assert(PSI && "PSI is expected to be non null");
500  uint64_t CallsiteTotalSamples = CallsiteFS->getTotalSamples();
501  if (ProfAccForSymsInList)
502  return !PSI->isColdCount(CallsiteTotalSamples);
503  else
504  return PSI->isHotCount(CallsiteTotalSamples);
505 }
506 
507 /// Mark as used the sample record for the given function samples at
508 /// (LineOffset, Discriminator).
509 ///
510 /// \returns true if this is the first time we mark the given record.
511 bool SampleCoverageTracker::markSamplesUsed(const FunctionSamples *FS,
512  uint32_t LineOffset,
513  uint32_t Discriminator,
514  uint64_t Samples) {
515  LineLocation Loc(LineOffset, Discriminator);
516  unsigned &Count = SampleCoverage[FS][Loc];
517  bool FirstTime = (++Count == 1);
518  if (FirstTime)
519  TotalUsedSamples += Samples;
520  return FirstTime;
521 }
522 
523 /// Return the number of sample records that were applied from this profile.
524 ///
525 /// This count does not include records from cold inlined callsites.
526 unsigned
527 SampleCoverageTracker::countUsedRecords(const FunctionSamples *FS,
528  ProfileSummaryInfo *PSI) const {
529  auto I = SampleCoverage.find(FS);
530 
531  // The size of the coverage map for FS represents the number of records
532  // that were marked used at least once.
533  unsigned Count = (I != SampleCoverage.end()) ? I->second.size() : 0;
534 
535  // If there are inlined callsites in this function, count the samples found
536  // in the respective bodies. However, do not bother counting callees with 0
537  // total samples, these are callees that were never invoked at runtime.
538  for (const auto &I : FS->getCallsiteSamples())
539  for (const auto &J : I.second) {
540  const FunctionSamples *CalleeSamples = &J.second;
541  if (SPLoader.callsiteIsHot(CalleeSamples, PSI))
542  Count += countUsedRecords(CalleeSamples, PSI);
543  }
544 
545  return Count;
546 }
547 
548 /// Return the number of sample records in the body of this profile.
549 ///
550 /// This count does not include records from cold inlined callsites.
551 unsigned
552 SampleCoverageTracker::countBodyRecords(const FunctionSamples *FS,
553  ProfileSummaryInfo *PSI) const {
554  unsigned Count = FS->getBodySamples().size();
555 
556  // Only count records in hot callsites.
557  for (const auto &I : FS->getCallsiteSamples())
558  for (const auto &J : I.second) {
559  const FunctionSamples *CalleeSamples = &J.second;
560  if (SPLoader.callsiteIsHot(CalleeSamples, PSI))
561  Count += countBodyRecords(CalleeSamples, PSI);
562  }
563 
564  return Count;
565 }
566 
567 /// Return the number of samples collected in the body of this profile.
568 ///
569 /// This count does not include samples from cold inlined callsites.
570 uint64_t
571 SampleCoverageTracker::countBodySamples(const FunctionSamples *FS,
572  ProfileSummaryInfo *PSI) const {
573  uint64_t Total = 0;
574  for (const auto &I : FS->getBodySamples())
575  Total += I.second.getSamples();
576 
577  // Only count samples in hot callsites.
578  for (const auto &I : FS->getCallsiteSamples())
579  for (const auto &J : I.second) {
580  const FunctionSamples *CalleeSamples = &J.second;
581  if (SPLoader.callsiteIsHot(CalleeSamples, PSI))
582  Total += countBodySamples(CalleeSamples, PSI);
583  }
584 
585  return Total;
586 }
587 
588 /// Return the fraction of sample records used in this profile.
589 ///
590 /// The returned value is an unsigned integer in the range 0-100 indicating
591 /// the percentage of sample records that were used while applying this
592 /// profile to the associated function.
593 unsigned SampleCoverageTracker::computeCoverage(unsigned Used,
594  unsigned Total) const {
595  assert(Used <= Total &&
596  "number of used records cannot exceed the total number of records");
597  return Total > 0 ? Used * 100 / Total : 100;
598 }
599 
600 /// Clear all the per-function data used to load samples and propagate weights.
601 void SampleProfileLoader::clearFunctionData() {
602  BlockWeights.clear();
603  EdgeWeights.clear();
604  VisitedBlocks.clear();
605  VisitedEdges.clear();
606  EquivalenceClass.clear();
607  DT = nullptr;
608  PDT = nullptr;
609  LI = nullptr;
610  Predecessors.clear();
611  Successors.clear();
612  CoverageTracker.clear();
613 }
614 
615 #ifndef NDEBUG
616 /// Print the weight of edge \p E on stream \p OS.
617 ///
618 /// \param OS Stream to emit the output to.
619 /// \param E Edge to print.
620 void SampleProfileLoader::printEdgeWeight(raw_ostream &OS, Edge E) {
621  OS << "weight[" << E.first->getName() << "->" << E.second->getName()
622  << "]: " << EdgeWeights[E] << "\n";
623 }
624 
625 /// Print the equivalence class of block \p BB on stream \p OS.
626 ///
627 /// \param OS Stream to emit the output to.
628 /// \param BB Block to print.
629 void SampleProfileLoader::printBlockEquivalence(raw_ostream &OS,
630  const BasicBlock *BB) {
631  const BasicBlock *Equiv = EquivalenceClass[BB];
632  OS << "equivalence[" << BB->getName()
633  << "]: " << ((Equiv) ? EquivalenceClass[BB]->getName() : "NONE") << "\n";
634 }
635 
636 /// Print the weight of block \p BB on stream \p OS.
637 ///
638 /// \param OS Stream to emit the output to.
639 /// \param BB Block to print.
640 void SampleProfileLoader::printBlockWeight(raw_ostream &OS,
641  const BasicBlock *BB) const {
642  const auto &I = BlockWeights.find(BB);
643  uint64_t W = (I == BlockWeights.end() ? 0 : I->second);
644  OS << "weight[" << BB->getName() << "]: " << W << "\n";
645 }
646 #endif
647 
648 /// Get the weight for an instruction.
649 ///
650 /// The "weight" of an instruction \p Inst is the number of samples
651 /// collected on that instruction at runtime. To retrieve it, we
652 /// need to compute the line number of \p Inst relative to the start of its
653 /// function. We use HeaderLineno to compute the offset. We then
654 /// look up the samples collected for \p Inst using BodySamples.
655 ///
656 /// \param Inst Instruction to query.
657 ///
658 /// \returns the weight of \p Inst.
659 ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) {
660  const DebugLoc &DLoc = Inst.getDebugLoc();
661  if (!DLoc)
662  return std::error_code();
663 
664  const FunctionSamples *FS = findFunctionSamples(Inst);
665  if (!FS)
666  return std::error_code();
667 
668  // Ignore all intrinsics, phinodes and branch instructions.
669  // Branch and phinodes instruction usually contains debug info from sources outside of
670  // the residing basic block, thus we ignore them during annotation.
671  if (isa<BranchInst>(Inst) || isa<IntrinsicInst>(Inst) || isa<PHINode>(Inst))
672  return std::error_code();
673 
674  // If a direct call/invoke instruction is inlined in profile
675  // (findCalleeFunctionSamples returns non-empty result), but not inlined here,
676  // it means that the inlined callsite has no sample, thus the call
677  // instruction should have 0 count.
678  if ((isa<CallInst>(Inst) || isa<InvokeInst>(Inst)) &&
679  !ImmutableCallSite(&Inst).isIndirectCall() &&
680  findCalleeFunctionSamples(Inst))
681  return 0;
682 
683  const DILocation *DIL = DLoc;
684  uint32_t LineOffset = FunctionSamples::getOffset(DIL);
685  uint32_t Discriminator = DIL->getBaseDiscriminator();
686  ErrorOr<uint64_t> R = FS->findSamplesAt(LineOffset, Discriminator);
687  if (R) {
688  bool FirstMark =
689  CoverageTracker.markSamplesUsed(FS, LineOffset, Discriminator, R.get());
690  if (FirstMark) {
691  ORE->emit([&]() {
692  OptimizationRemarkAnalysis Remark(DEBUG_TYPE, "AppliedSamples", &Inst);
693  Remark << "Applied " << ore::NV("NumSamples", *R);
694  Remark << " samples from profile (offset: ";
695  Remark << ore::NV("LineOffset", LineOffset);
696  if (Discriminator) {
697  Remark << ".";
698  Remark << ore::NV("Discriminator", Discriminator);
699  }
700  Remark << ")";
701  return Remark;
702  });
703  }
704  LLVM_DEBUG(dbgs() << " " << DLoc.getLine() << "."
705  << DIL->getBaseDiscriminator() << ":" << Inst
706  << " (line offset: " << LineOffset << "."
707  << DIL->getBaseDiscriminator() << " - weight: " << R.get()
708  << ")\n");
709  }
710  return R;
711 }
712 
713 /// Compute the weight of a basic block.
714 ///
715 /// The weight of basic block \p BB is the maximum weight of all the
716 /// instructions in BB.
717 ///
718 /// \param BB The basic block to query.
719 ///
720 /// \returns the weight for \p BB.
721 ErrorOr<uint64_t> SampleProfileLoader::getBlockWeight(const BasicBlock *BB) {
722  uint64_t Max = 0;
723  bool HasWeight = false;
724  for (auto &I : BB->getInstList()) {
725  const ErrorOr<uint64_t> &R = getInstWeight(I);
726  if (R) {
727  Max = std::max(Max, R.get());
728  HasWeight = true;
729  }
730  }
731  return HasWeight ? ErrorOr<uint64_t>(Max) : std::error_code();
732 }
733 
734 /// Compute and store the weights of every basic block.
735 ///
736 /// This populates the BlockWeights map by computing
737 /// the weights of every basic block in the CFG.
738 ///
739 /// \param F The function to query.
740 bool SampleProfileLoader::computeBlockWeights(Function &F) {
741  bool Changed = false;
742  LLVM_DEBUG(dbgs() << "Block weights\n");
743  for (const auto &BB : F) {
744  ErrorOr<uint64_t> Weight = getBlockWeight(&BB);
745  if (Weight) {
746  BlockWeights[&BB] = Weight.get();
747  VisitedBlocks.insert(&BB);
748  Changed = true;
749  }
750  LLVM_DEBUG(printBlockWeight(dbgs(), &BB));
751  }
752 
753  return Changed;
754 }
755 
756 /// Get the FunctionSamples for a call instruction.
757 ///
758 /// The FunctionSamples of a call/invoke instruction \p Inst is the inlined
759 /// instance in which that call instruction is calling to. It contains
760 /// all samples that resides in the inlined instance. We first find the
761 /// inlined instance in which the call instruction is from, then we
762 /// traverse its children to find the callsite with the matching
763 /// location.
764 ///
765 /// \param Inst Call/Invoke instruction to query.
766 ///
767 /// \returns The FunctionSamples pointer to the inlined instance.
768 const FunctionSamples *
769 SampleProfileLoader::findCalleeFunctionSamples(const Instruction &Inst) const {
770  const DILocation *DIL = Inst.getDebugLoc();
771  if (!DIL) {
772  return nullptr;
773  }
774 
775  StringRef CalleeName;
776  if (const CallInst *CI = dyn_cast<CallInst>(&Inst))
777  if (Function *Callee = CI->getCalledFunction())
778  CalleeName = Callee->getName();
779 
780  const FunctionSamples *FS = findFunctionSamples(Inst);
781  if (FS == nullptr)
782  return nullptr;
783 
785  DIL->getBaseDiscriminator()),
786  CalleeName);
787 }
788 
789 /// Returns a vector of FunctionSamples that are the indirect call targets
790 /// of \p Inst. The vector is sorted by the total number of samples. Stores
791 /// the total call count of the indirect call in \p Sum.
792 std::vector<const FunctionSamples *>
793 SampleProfileLoader::findIndirectCallFunctionSamples(
794  const Instruction &Inst, uint64_t &Sum) const {
795  const DILocation *DIL = Inst.getDebugLoc();
796  std::vector<const FunctionSamples *> R;
797 
798  if (!DIL) {
799  return R;
800  }
801 
802  const FunctionSamples *FS = findFunctionSamples(Inst);
803  if (FS == nullptr)
804  return R;
805 
806  uint32_t LineOffset = FunctionSamples::getOffset(DIL);
807  uint32_t Discriminator = DIL->getBaseDiscriminator();
808 
809  auto T = FS->findCallTargetMapAt(LineOffset, Discriminator);
810  Sum = 0;
811  if (T)
812  for (const auto &T_C : T.get())
813  Sum += T_C.second;
816  if (M->empty())
817  return R;
818  for (const auto &NameFS : *M) {
819  Sum += NameFS.second.getEntrySamples();
820  R.push_back(&NameFS.second);
821  }
822  llvm::sort(R, [](const FunctionSamples *L, const FunctionSamples *R) {
823  if (L->getEntrySamples() != R->getEntrySamples())
824  return L->getEntrySamples() > R->getEntrySamples();
825  return FunctionSamples::getGUID(L->getName()) <
827  });
828  }
829  return R;
830 }
831 
832 /// Get the FunctionSamples for an instruction.
833 ///
834 /// The FunctionSamples of an instruction \p Inst is the inlined instance
835 /// in which that instruction is coming from. We traverse the inline stack
836 /// of that instruction, and match it with the tree nodes in the profile.
837 ///
838 /// \param Inst Instruction to query.
839 ///
840 /// \returns the FunctionSamples pointer to the inlined instance.
841 const FunctionSamples *
842 SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {
843  const DILocation *DIL = Inst.getDebugLoc();
844  if (!DIL)
845  return Samples;
846 
847  auto it = DILocation2SampleMap.try_emplace(DIL,nullptr);
848  if (it.second)
849  it.first->second = Samples->findFunctionSamples(DIL);
850  return it.first->second;
851 }
852 
853 bool SampleProfileLoader::inlineCallInstruction(Instruction *I) {
854  assert(isa<CallInst>(I) || isa<InvokeInst>(I));
855  CallSite CS(I);
856  Function *CalledFunction = CS.getCalledFunction();
857  assert(CalledFunction);
858  DebugLoc DLoc = I->getDebugLoc();
859  BasicBlock *BB = I->getParent();
860  InlineParams Params = getInlineParams();
861  Params.ComputeFullInlineCost = true;
862  // Checks if there is anything in the reachable portion of the callee at
863  // this callsite that makes this inlining potentially illegal. Need to
864  // set ComputeFullInlineCost, otherwise getInlineCost may return early
865  // when cost exceeds threshold without checking all IRs in the callee.
866  // The acutal cost does not matter because we only checks isNever() to
867  // see if it is legal to inline the callsite.
868  InlineCost Cost =
869  getInlineCost(cast<CallBase>(*I), Params, GetTTI(*CalledFunction), GetAC,
870  None, nullptr, nullptr);
871  if (Cost.isNever()) {
872  ORE->emit(OptimizationRemark(DEBUG_TYPE, "Not inline", DLoc, BB)
873  << "incompatible inlining");
874  return false;
875  }
876  InlineFunctionInfo IFI(nullptr, &GetAC);
877  if (InlineFunction(CS, IFI)) {
878  // The call to InlineFunction erases I, so we can't pass it here.
879  ORE->emit(OptimizationRemark(DEBUG_TYPE, "HotInline", DLoc, BB)
880  << "inlined hot callee '" << ore::NV("Callee", CalledFunction)
881  << "' into '" << ore::NV("Caller", BB->getParent()) << "'");
882  return true;
883  }
884  return false;
885 }
886 
887 /// Iteratively inline hot callsites of a function.
888 ///
889 /// Iteratively traverse all callsites of the function \p F, and find if
890 /// the corresponding inlined instance exists and is hot in profile. If
891 /// it is hot enough, inline the callsites and adds new callsites of the
892 /// callee into the caller. If the call is an indirect call, first promote
893 /// it to direct call. Each indirect call is limited with a single target.
894 ///
895 /// \param F function to perform iterative inlining.
896 /// \param InlinedGUIDs a set to be updated to include all GUIDs that are
897 /// inlined in the profiled binary.
898 ///
899 /// \returns True if there is any inline happened.
900 bool SampleProfileLoader::inlineHotFunctions(
901  Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
902  DenseSet<Instruction *> PromotedInsns;
903 
904  // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure
905  // Profile symbol list is ignored when profile-sample-accurate is on.
906  assert((!ProfAccForSymsInList ||
908  !F.hasFnAttribute("profile-sample-accurate"))) &&
909  "ProfAccForSymsInList should be false when profile-sample-accurate "
910  "is enabled");
911 
912  DenseMap<Instruction *, const FunctionSamples *> localNotInlinedCallSites;
913  bool Changed = false;
914  while (true) {
915  bool LocalChanged = false;
917  for (auto &BB : F) {
918  bool Hot = false;
920  for (auto &I : BB.getInstList()) {
921  const FunctionSamples *FS = nullptr;
922  if ((isa<CallInst>(I) || isa<InvokeInst>(I)) &&
923  !isa<IntrinsicInst>(I) && (FS = findCalleeFunctionSamples(I))) {
924  Candidates.push_back(&I);
925  if (FS->getEntrySamples() > 0)
926  localNotInlinedCallSites.try_emplace(&I, FS);
927  if (callsiteIsHot(FS, PSI))
928  Hot = true;
929  }
930  }
931  if (Hot) {
932  CIS.insert(CIS.begin(), Candidates.begin(), Candidates.end());
933  }
934  }
935  for (auto I : CIS) {
936  Function *CalledFunction = CallSite(I).getCalledFunction();
937  // Do not inline recursive calls.
938  if (CalledFunction == &F)
939  continue;
940  if (CallSite(I).isIndirectCall()) {
941  if (PromotedInsns.count(I))
942  continue;
943  uint64_t Sum;
944  for (const auto *FS : findIndirectCallFunctionSamples(*I, Sum)) {
945  if (IsThinLTOPreLink) {
946  FS->findInlinedFunctions(InlinedGUIDs, F.getParent(),
948  continue;
949  }
950  auto CalleeFunctionName = FS->getFuncNameInModule(F.getParent());
951  // If it is a recursive call, we do not inline it as it could bloat
952  // the code exponentially. There is way to better handle this, e.g.
953  // clone the caller first, and inline the cloned caller if it is
954  // recursive. As llvm does not inline recursive calls, we will
955  // simply ignore it instead of handling it explicitly.
956  if (CalleeFunctionName == F.getName())
957  continue;
958 
959  if (!callsiteIsHot(FS, PSI))
960  continue;
961 
962  const char *Reason = "Callee function not available";
963  auto R = SymbolMap.find(CalleeFunctionName);
964  if (R != SymbolMap.end() && R->getValue() &&
965  !R->getValue()->isDeclaration() &&
966  R->getValue()->getSubprogram() &&
967  isLegalToPromote(CallSite(I), R->getValue(), &Reason)) {
968  uint64_t C = FS->getEntrySamples();
969  Instruction *DI =
970  pgo::promoteIndirectCall(I, R->getValue(), C, Sum, false, ORE);
971  Sum -= C;
972  PromotedInsns.insert(I);
973  // If profile mismatches, we should not attempt to inline DI.
974  if ((isa<CallInst>(DI) || isa<InvokeInst>(DI)) &&
975  inlineCallInstruction(DI)) {
976  localNotInlinedCallSites.erase(I);
977  LocalChanged = true;
978  }
979  } else {
980  LLVM_DEBUG(dbgs()
981  << "\nFailed to promote indirect call to "
982  << CalleeFunctionName << " because " << Reason << "\n");
983  }
984  }
985  } else if (CalledFunction && CalledFunction->getSubprogram() &&
986  !CalledFunction->isDeclaration()) {
987  if (inlineCallInstruction(I)) {
988  localNotInlinedCallSites.erase(I);
989  LocalChanged = true;
990  }
991  } else if (IsThinLTOPreLink) {
992  findCalleeFunctionSamples(*I)->findInlinedFunctions(
993  InlinedGUIDs, F.getParent(), PSI->getOrCompHotCountThreshold());
994  }
995  }
996  if (LocalChanged) {
997  Changed = true;
998  } else {
999  break;
1000  }
1001  }
1002 
1003  // Accumulate not inlined callsite information into notInlinedSamples
1004  for (const auto &Pair : localNotInlinedCallSites) {
1005  Instruction *I = Pair.getFirst();
1007  if (!Callee || Callee->isDeclaration())
1008  continue;
1009  const FunctionSamples *FS = Pair.getSecond();
1010  auto pair =
1011  notInlinedCallInfo.try_emplace(Callee, NotInlinedProfileInfo{0});
1012  pair.first->second.entryCount += FS->getEntrySamples();
1013  }
1014  return Changed;
1015 }
1016 
1017 /// Find equivalence classes for the given block.
1018 ///
1019 /// This finds all the blocks that are guaranteed to execute the same
1020 /// number of times as \p BB1. To do this, it traverses all the
1021 /// descendants of \p BB1 in the dominator or post-dominator tree.
1022 ///
1023 /// A block BB2 will be in the same equivalence class as \p BB1 if
1024 /// the following holds:
1025 ///
1026 /// 1- \p BB1 is a descendant of BB2 in the opposite tree. So, if BB2
1027 /// is a descendant of \p BB1 in the dominator tree, then BB2 should
1028 /// dominate BB1 in the post-dominator tree.
1029 ///
1030 /// 2- Both BB2 and \p BB1 must be in the same loop.
1031 ///
1032 /// For every block BB2 that meets those two requirements, we set BB2's
1033 /// equivalence class to \p BB1.
1034 ///
1035 /// \param BB1 Block to check.
1036 /// \param Descendants Descendants of \p BB1 in either the dom or pdom tree.
1037 /// \param DomTree Opposite dominator tree. If \p Descendants is filled
1038 /// with blocks from \p BB1's dominator tree, then
1039 /// this is the post-dominator tree, and vice versa.
1040 template <bool IsPostDom>
1041 void SampleProfileLoader::findEquivalencesFor(
1042  BasicBlock *BB1, ArrayRef<BasicBlock *> Descendants,
1044  const BasicBlock *EC = EquivalenceClass[BB1];
1045  uint64_t Weight = BlockWeights[EC];
1046  for (const auto *BB2 : Descendants) {
1047  bool IsDomParent = DomTree->dominates(BB2, BB1);
1048  bool IsInSameLoop = LI->getLoopFor(BB1) == LI->getLoopFor(BB2);
1049  if (BB1 != BB2 && IsDomParent && IsInSameLoop) {
1050  EquivalenceClass[BB2] = EC;
1051  // If BB2 is visited, then the entire EC should be marked as visited.
1052  if (VisitedBlocks.count(BB2)) {
1053  VisitedBlocks.insert(EC);
1054  }
1055 
1056  // If BB2 is heavier than BB1, make BB2 have the same weight
1057  // as BB1.
1058  //
1059  // Note that we don't worry about the opposite situation here
1060  // (when BB2 is lighter than BB1). We will deal with this
1061  // during the propagation phase. Right now, we just want to
1062  // make sure that BB1 has the largest weight of all the
1063  // members of its equivalence set.
1064  Weight = std::max(Weight, BlockWeights[BB2]);
1065  }
1066  }
1067  if (EC == &EC->getParent()->getEntryBlock()) {
1068  BlockWeights[EC] = Samples->getHeadSamples() + 1;
1069  } else {
1070  BlockWeights[EC] = Weight;
1071  }
1072 }
1073 
1074 /// Find equivalence classes.
1075 ///
1076 /// Since samples may be missing from blocks, we can fill in the gaps by setting
1077 /// the weights of all the blocks in the same equivalence class to the same
1078 /// weight. To compute the concept of equivalence, we use dominance and loop
1079 /// information. Two blocks B1 and B2 are in the same equivalence class if B1
1080 /// dominates B2, B2 post-dominates B1 and both are in the same loop.
1081 ///
1082 /// \param F The function to query.
1083 void SampleProfileLoader::findEquivalenceClasses(Function &F) {
1084  SmallVector<BasicBlock *, 8> DominatedBBs;
1085  LLVM_DEBUG(dbgs() << "\nBlock equivalence classes\n");
1086  // Find equivalence sets based on dominance and post-dominance information.
1087  for (auto &BB : F) {
1088  BasicBlock *BB1 = &BB;
1089 
1090  // Compute BB1's equivalence class once.
1091  if (EquivalenceClass.count(BB1)) {
1092  LLVM_DEBUG(printBlockEquivalence(dbgs(), BB1));
1093  continue;
1094  }
1095 
1096  // By default, blocks are in their own equivalence class.
1097  EquivalenceClass[BB1] = BB1;
1098 
1099  // Traverse all the blocks dominated by BB1. We are looking for
1100  // every basic block BB2 such that:
1101  //
1102  // 1- BB1 dominates BB2.
1103  // 2- BB2 post-dominates BB1.
1104  // 3- BB1 and BB2 are in the same loop nest.
1105  //
1106  // If all those conditions hold, it means that BB2 is executed
1107  // as many times as BB1, so they are placed in the same equivalence
1108  // class by making BB2's equivalence class be BB1.
1109  DominatedBBs.clear();
1110  DT->getDescendants(BB1, DominatedBBs);
1111  findEquivalencesFor(BB1, DominatedBBs, PDT.get());
1112 
1113  LLVM_DEBUG(printBlockEquivalence(dbgs(), BB1));
1114  }
1115 
1116  // Assign weights to equivalence classes.
1117  //
1118  // All the basic blocks in the same equivalence class will execute
1119  // the same number of times. Since we know that the head block in
1120  // each equivalence class has the largest weight, assign that weight
1121  // to all the blocks in that equivalence class.
1122  LLVM_DEBUG(
1123  dbgs() << "\nAssign the same weight to all blocks in the same class\n");
1124  for (auto &BI : F) {
1125  const BasicBlock *BB = &BI;
1126  const BasicBlock *EquivBB = EquivalenceClass[BB];
1127  if (BB != EquivBB)
1128  BlockWeights[BB] = BlockWeights[EquivBB];
1129  LLVM_DEBUG(printBlockWeight(dbgs(), BB));
1130  }
1131 }
1132 
1133 /// Visit the given edge to decide if it has a valid weight.
1134 ///
1135 /// If \p E has not been visited before, we copy to \p UnknownEdge
1136 /// and increment the count of unknown edges.
1137 ///
1138 /// \param E Edge to visit.
1139 /// \param NumUnknownEdges Current number of unknown edges.
1140 /// \param UnknownEdge Set if E has not been visited before.
1141 ///
1142 /// \returns E's weight, if known. Otherwise, return 0.
1143 uint64_t SampleProfileLoader::visitEdge(Edge E, unsigned *NumUnknownEdges,
1144  Edge *UnknownEdge) {
1145  if (!VisitedEdges.count(E)) {
1146  (*NumUnknownEdges)++;
1147  *UnknownEdge = E;
1148  return 0;
1149  }
1150 
1151  return EdgeWeights[E];
1152 }
1153 
1154 /// Propagate weights through incoming/outgoing edges.
1155 ///
1156 /// If the weight of a basic block is known, and there is only one edge
1157 /// with an unknown weight, we can calculate the weight of that edge.
1158 ///
1159 /// Similarly, if all the edges have a known count, we can calculate the
1160 /// count of the basic block, if needed.
1161 ///
1162 /// \param F Function to process.
1163 /// \param UpdateBlockCount Whether we should update basic block counts that
1164 /// has already been annotated.
1165 ///
1166 /// \returns True if new weights were assigned to edges or blocks.
1167 bool SampleProfileLoader::propagateThroughEdges(Function &F,
1168  bool UpdateBlockCount) {
1169  bool Changed = false;
1170  LLVM_DEBUG(dbgs() << "\nPropagation through edges\n");
1171  for (const auto &BI : F) {
1172  const BasicBlock *BB = &BI;
1173  const BasicBlock *EC = EquivalenceClass[BB];
1174 
1175  // Visit all the predecessor and successor edges to determine
1176  // which ones have a weight assigned already. Note that it doesn't
1177  // matter that we only keep track of a single unknown edge. The
1178  // only case we are interested in handling is when only a single
1179  // edge is unknown (see setEdgeOrBlockWeight).
1180  for (unsigned i = 0; i < 2; i++) {
1181  uint64_t TotalWeight = 0;
1182  unsigned NumUnknownEdges = 0, NumTotalEdges = 0;
1183  Edge UnknownEdge, SelfReferentialEdge, SingleEdge;
1184 
1185  if (i == 0) {
1186  // First, visit all predecessor edges.
1187  NumTotalEdges = Predecessors[BB].size();
1188  for (auto *Pred : Predecessors[BB]) {
1189  Edge E = std::make_pair(Pred, BB);
1190  TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge);
1191  if (E.first == E.second)
1192  SelfReferentialEdge = E;
1193  }
1194  if (NumTotalEdges == 1) {
1195  SingleEdge = std::make_pair(Predecessors[BB][0], BB);
1196  }
1197  } else {
1198  // On the second round, visit all successor edges.
1199  NumTotalEdges = Successors[BB].size();
1200  for (auto *Succ : Successors[BB]) {
1201  Edge E = std::make_pair(BB, Succ);
1202  TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge);
1203  }
1204  if (NumTotalEdges == 1) {
1205  SingleEdge = std::make_pair(BB, Successors[BB][0]);
1206  }
1207  }
1208 
1209  // After visiting all the edges, there are three cases that we
1210  // can handle immediately:
1211  //
1212  // - All the edge weights are known (i.e., NumUnknownEdges == 0).
1213  // In this case, we simply check that the sum of all the edges
1214  // is the same as BB's weight. If not, we change BB's weight
1215  // to match. Additionally, if BB had not been visited before,
1216  // we mark it visited.
1217  //
1218  // - Only one edge is unknown and BB has already been visited.
1219  // In this case, we can compute the weight of the edge by
1220  // subtracting the total block weight from all the known
1221  // edge weights. If the edges weight more than BB, then the
1222  // edge of the last remaining edge is set to zero.
1223  //
1224  // - There exists a self-referential edge and the weight of BB is
1225  // known. In this case, this edge can be based on BB's weight.
1226  // We add up all the other known edges and set the weight on
1227  // the self-referential edge as we did in the previous case.
1228  //
1229  // In any other case, we must continue iterating. Eventually,
1230  // all edges will get a weight, or iteration will stop when
1231  // it reaches SampleProfileMaxPropagateIterations.
1232  if (NumUnknownEdges <= 1) {
1233  uint64_t &BBWeight = BlockWeights[EC];
1234  if (NumUnknownEdges == 0) {
1235  if (!VisitedBlocks.count(EC)) {
1236  // If we already know the weight of all edges, the weight of the
1237  // basic block can be computed. It should be no larger than the sum
1238  // of all edge weights.
1239  if (TotalWeight > BBWeight) {
1240  BBWeight = TotalWeight;
1241  Changed = true;
1242  LLVM_DEBUG(dbgs() << "All edge weights for " << BB->getName()
1243  << " known. Set weight for block: ";
1244  printBlockWeight(dbgs(), BB););
1245  }
1246  } else if (NumTotalEdges == 1 &&
1247  EdgeWeights[SingleEdge] < BlockWeights[EC]) {
1248  // If there is only one edge for the visited basic block, use the
1249  // block weight to adjust edge weight if edge weight is smaller.
1250  EdgeWeights[SingleEdge] = BlockWeights[EC];
1251  Changed = true;
1252  }
1253  } else if (NumUnknownEdges == 1 && VisitedBlocks.count(EC)) {
1254  // If there is a single unknown edge and the block has been
1255  // visited, then we can compute E's weight.
1256  if (BBWeight >= TotalWeight)
1257  EdgeWeights[UnknownEdge] = BBWeight - TotalWeight;
1258  else
1259  EdgeWeights[UnknownEdge] = 0;
1260  const BasicBlock *OtherEC;
1261  if (i == 0)
1262  OtherEC = EquivalenceClass[UnknownEdge.first];
1263  else
1264  OtherEC = EquivalenceClass[UnknownEdge.second];
1265  // Edge weights should never exceed the BB weights it connects.
1266  if (VisitedBlocks.count(OtherEC) &&
1267  EdgeWeights[UnknownEdge] > BlockWeights[OtherEC])
1268  EdgeWeights[UnknownEdge] = BlockWeights[OtherEC];
1269  VisitedEdges.insert(UnknownEdge);
1270  Changed = true;
1271  LLVM_DEBUG(dbgs() << "Set weight for edge: ";
1272  printEdgeWeight(dbgs(), UnknownEdge));
1273  }
1274  } else if (VisitedBlocks.count(EC) && BlockWeights[EC] == 0) {
1275  // If a block Weights 0, all its in/out edges should weight 0.
1276  if (i == 0) {
1277  for (auto *Pred : Predecessors[BB]) {
1278  Edge E = std::make_pair(Pred, BB);
1279  EdgeWeights[E] = 0;
1280  VisitedEdges.insert(E);
1281  }
1282  } else {
1283  for (auto *Succ : Successors[BB]) {
1284  Edge E = std::make_pair(BB, Succ);
1285  EdgeWeights[E] = 0;
1286  VisitedEdges.insert(E);
1287  }
1288  }
1289  } else if (SelfReferentialEdge.first && VisitedBlocks.count(EC)) {
1290  uint64_t &BBWeight = BlockWeights[BB];
1291  // We have a self-referential edge and the weight of BB is known.
1292  if (BBWeight >= TotalWeight)
1293  EdgeWeights[SelfReferentialEdge] = BBWeight - TotalWeight;
1294  else
1295  EdgeWeights[SelfReferentialEdge] = 0;
1296  VisitedEdges.insert(SelfReferentialEdge);
1297  Changed = true;
1298  LLVM_DEBUG(dbgs() << "Set self-referential edge weight to: ";
1299  printEdgeWeight(dbgs(), SelfReferentialEdge));
1300  }
1301  if (UpdateBlockCount && !VisitedBlocks.count(EC) && TotalWeight > 0) {
1302  BlockWeights[EC] = TotalWeight;
1303  VisitedBlocks.insert(EC);
1304  Changed = true;
1305  }
1306  }
1307  }
1308 
1309  return Changed;
1310 }
1311 
1312 /// Build in/out edge lists for each basic block in the CFG.
1313 ///
1314 /// We are interested in unique edges. If a block B1 has multiple
1315 /// edges to another block B2, we only add a single B1->B2 edge.
1316 void SampleProfileLoader::buildEdges(Function &F) {
1317  for (auto &BI : F) {
1318  BasicBlock *B1 = &BI;
1319 
1320  // Add predecessors for B1.
1322  if (!Predecessors[B1].empty())
1323  llvm_unreachable("Found a stale predecessors list in a basic block.");
1324  for (pred_iterator PI = pred_begin(B1), PE = pred_end(B1); PI != PE; ++PI) {
1325  BasicBlock *B2 = *PI;
1326  if (Visited.insert(B2).second)
1327  Predecessors[B1].push_back(B2);
1328  }
1329 
1330  // Add successors for B1.
1331  Visited.clear();
1332  if (!Successors[B1].empty())
1333  llvm_unreachable("Found a stale successors list in a basic block.");
1334  for (succ_iterator SI = succ_begin(B1), SE = succ_end(B1); SI != SE; ++SI) {
1335  BasicBlock *B2 = *SI;
1336  if (Visited.insert(B2).second)
1337  Successors[B1].push_back(B2);
1338  }
1339  }
1340 }
1341 
1342 /// Returns the sorted CallTargetMap \p M by count in descending order.
1344  const SampleRecord::CallTargetMap & M) {
1346  for (const auto &I : SampleRecord::SortCallTargets(M)) {
1347  R.emplace_back(InstrProfValueData{FunctionSamples::getGUID(I.first), I.second});
1348  }
1349  return R;
1350 }
1351 
1352 /// Propagate weights into edges
1353 ///
1354 /// The following rules are applied to every block BB in the CFG:
1355 ///
1356 /// - If BB has a single predecessor/successor, then the weight
1357 /// of that edge is the weight of the block.
1358 ///
1359 /// - If all incoming or outgoing edges are known except one, and the
1360 /// weight of the block is already known, the weight of the unknown
1361 /// edge will be the weight of the block minus the sum of all the known
1362 /// edges. If the sum of all the known edges is larger than BB's weight,
1363 /// we set the unknown edge weight to zero.
1364 ///
1365 /// - If there is a self-referential edge, and the weight of the block is
1366 /// known, the weight for that edge is set to the weight of the block
1367 /// minus the weight of the other incoming edges to that block (if
1368 /// known).
1369 void SampleProfileLoader::propagateWeights(Function &F) {
1370  bool Changed = true;
1371  unsigned I = 0;
1372 
1373  // If BB weight is larger than its corresponding loop's header BB weight,
1374  // use the BB weight to replace the loop header BB weight.
1375  for (auto &BI : F) {
1376  BasicBlock *BB = &BI;
1377  Loop *L = LI->getLoopFor(BB);
1378  if (!L) {
1379  continue;
1380  }
1381  BasicBlock *Header = L->getHeader();
1382  if (Header && BlockWeights[BB] > BlockWeights[Header]) {
1383  BlockWeights[Header] = BlockWeights[BB];
1384  }
1385  }
1386 
1387  // Before propagation starts, build, for each block, a list of
1388  // unique predecessors and successors. This is necessary to handle
1389  // identical edges in multiway branches. Since we visit all blocks and all
1390  // edges of the CFG, it is cleaner to build these lists once at the start
1391  // of the pass.
1392  buildEdges(F);
1393 
1394  // Propagate until we converge or we go past the iteration limit.
1395  while (Changed && I++ < SampleProfileMaxPropagateIterations) {
1396  Changed = propagateThroughEdges(F, false);
1397  }
1398 
1399  // The first propagation propagates BB counts from annotated BBs to unknown
1400  // BBs. The 2nd propagation pass resets edges weights, and use all BB weights
1401  // to propagate edge weights.
1402  VisitedEdges.clear();
1403  Changed = true;
1404  while (Changed && I++ < SampleProfileMaxPropagateIterations) {
1405  Changed = propagateThroughEdges(F, false);
1406  }
1407 
1408  // The 3rd propagation pass allows adjust annotated BB weights that are
1409  // obviously wrong.
1410  Changed = true;
1411  while (Changed && I++ < SampleProfileMaxPropagateIterations) {
1412  Changed = propagateThroughEdges(F, true);
1413  }
1414 
1415  // Generate MD_prof metadata for every branch instruction using the
1416  // edge weights computed during propagation.
1417  LLVM_DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n");
1418  LLVMContext &Ctx = F.getContext();
1419  MDBuilder MDB(Ctx);
1420  for (auto &BI : F) {
1421  BasicBlock *BB = &BI;
1422 
1423  if (BlockWeights[BB]) {
1424  for (auto &I : BB->getInstList()) {
1425  if (!isa<CallInst>(I) && !isa<InvokeInst>(I))
1426  continue;
1427  CallSite CS(&I);
1428  if (!CS.getCalledFunction()) {
1429  const DebugLoc &DLoc = I.getDebugLoc();
1430  if (!DLoc)
1431  continue;
1432  const DILocation *DIL = DLoc;
1433  uint32_t LineOffset = FunctionSamples::getOffset(DIL);
1434  uint32_t Discriminator = DIL->getBaseDiscriminator();
1435 
1436  const FunctionSamples *FS = findFunctionSamples(I);
1437  if (!FS)
1438  continue;
1439  auto T = FS->findCallTargetMapAt(LineOffset, Discriminator);
1440  if (!T || T.get().empty())
1441  continue;
1442  SmallVector<InstrProfValueData, 2> SortedCallTargets =
1444  uint64_t Sum;
1445  findIndirectCallFunctionSamples(I, Sum);
1446  annotateValueSite(*I.getParent()->getParent()->getParent(), I,
1447  SortedCallTargets, Sum, IPVK_IndirectCallTarget,
1448  SortedCallTargets.size());
1449  } else if (!isa<IntrinsicInst>(&I)) {
1450  I.setMetadata(LLVMContext::MD_prof,
1451  MDB.createBranchWeights(
1452  {static_cast<uint32_t>(BlockWeights[BB])}));
1453  }
1454  }
1455  }
1456  Instruction *TI = BB->getTerminator();
1457  if (TI->getNumSuccessors() == 1)
1458  continue;
1459  if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI))
1460  continue;
1461 
1462  DebugLoc BranchLoc = TI->getDebugLoc();
1463  LLVM_DEBUG(dbgs() << "\nGetting weights for branch at line "
1464  << ((BranchLoc) ? Twine(BranchLoc.getLine())
1465  : Twine("<UNKNOWN LOCATION>"))
1466  << ".\n");
1467  SmallVector<uint32_t, 4> Weights;
1468  uint32_t MaxWeight = 0;
1469  Instruction *MaxDestInst;
1470  for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {
1471  BasicBlock *Succ = TI->getSuccessor(I);
1472  Edge E = std::make_pair(BB, Succ);
1473  uint64_t Weight = EdgeWeights[E];
1474  LLVM_DEBUG(dbgs() << "\t"; printEdgeWeight(dbgs(), E));
1475  // Use uint32_t saturated arithmetic to adjust the incoming weights,
1476  // if needed. Sample counts in profiles are 64-bit unsigned values,
1477  // but internally branch weights are expressed as 32-bit values.
1478  if (Weight > std::numeric_limits<uint32_t>::max()) {
1479  LLVM_DEBUG(dbgs() << " (saturated due to uint32_t overflow)");
1481  }
1482  // Weight is added by one to avoid propagation errors introduced by
1483  // 0 weights.
1484  Weights.push_back(static_cast<uint32_t>(Weight + 1));
1485  if (Weight != 0) {
1486  if (Weight > MaxWeight) {
1487  MaxWeight = Weight;
1488  MaxDestInst = Succ->getFirstNonPHIOrDbgOrLifetime();
1489  }
1490  }
1491  }
1492 
1493  misexpect::verifyMisExpect(TI, Weights, TI->getContext());
1494 
1495  uint64_t TempWeight;
1496  // Only set weights if there is at least one non-zero weight.
1497  // In any other case, let the analyzer set weights.
1498  // Do not set weights if the weights are present. In ThinLTO, the profile
1499  // annotation is done twice. If the first annotation already set the
1500  // weights, the second pass does not need to set it.
1501  if (MaxWeight > 0 && !TI->extractProfTotalWeight(TempWeight)) {
1502  LLVM_DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n");
1503  TI->setMetadata(LLVMContext::MD_prof,
1504  MDB.createBranchWeights(Weights));
1505  ORE->emit([&]() {
1506  return OptimizationRemark(DEBUG_TYPE, "PopularDest", MaxDestInst)
1507  << "most popular destination for conditional branches at "
1508  << ore::NV("CondBranchesLoc", BranchLoc);
1509  });
1510  } else {
1511  LLVM_DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n");
1512  }
1513  }
1514 }
1515 
1516 /// Get the line number for the function header.
1517 ///
1518 /// This looks up function \p F in the current compilation unit and
1519 /// retrieves the line number where the function is defined. This is
1520 /// line 0 for all the samples read from the profile file. Every line
1521 /// number is relative to this line.
1522 ///
1523 /// \param F Function object to query.
1524 ///
1525 /// \returns the line number where \p F is defined. If it returns 0,
1526 /// it means that there is no debug information available for \p F.
1527 unsigned SampleProfileLoader::getFunctionLoc(Function &F) {
1528  if (DISubprogram *S = F.getSubprogram())
1529  return S->getLine();
1530 
1531  if (NoWarnSampleUnused)
1532  return 0;
1533 
1534  // If the start of \p F is missing, emit a diagnostic to inform the user
1535  // about the missed opportunity.
1537  "No debug information found in function " + F.getName() +
1538  ": Function profile not used",
1539  DS_Warning));
1540  return 0;
1541 }
1542 
1543 void SampleProfileLoader::computeDominanceAndLoopInfo(Function &F) {
1544  DT.reset(new DominatorTree);
1545  DT->recalculate(F);
1546 
1547  PDT.reset(new PostDominatorTree(F));
1548 
1549  LI.reset(new LoopInfo);
1550  LI->analyze(*DT);
1551 }
1552 
1553 /// Generate branch weight metadata for all branches in \p F.
1554 ///
1555 /// Branch weights are computed out of instruction samples using a
1556 /// propagation heuristic. Propagation proceeds in 3 phases:
1557 ///
1558 /// 1- Assignment of block weights. All the basic blocks in the function
1559 /// are initial assigned the same weight as their most frequently
1560 /// executed instruction.
1561 ///
1562 /// 2- Creation of equivalence classes. Since samples may be missing from
1563 /// blocks, we can fill in the gaps by setting the weights of all the
1564 /// blocks in the same equivalence class to the same weight. To compute
1565 /// the concept of equivalence, we use dominance and loop information.
1566 /// Two blocks B1 and B2 are in the same equivalence class if B1
1567 /// dominates B2, B2 post-dominates B1 and both are in the same loop.
1568 ///
1569 /// 3- Propagation of block weights into edges. This uses a simple
1570 /// propagation heuristic. The following rules are applied to every
1571 /// block BB in the CFG:
1572 ///
1573 /// - If BB has a single predecessor/successor, then the weight
1574 /// of that edge is the weight of the block.
1575 ///
1576 /// - If all the edges are known except one, and the weight of the
1577 /// block is already known, the weight of the unknown edge will
1578 /// be the weight of the block minus the sum of all the known
1579 /// edges. If the sum of all the known edges is larger than BB's weight,
1580 /// we set the unknown edge weight to zero.
1581 ///
1582 /// - If there is a self-referential edge, and the weight of the block is
1583 /// known, the weight for that edge is set to the weight of the block
1584 /// minus the weight of the other incoming edges to that block (if
1585 /// known).
1586 ///
1587 /// Since this propagation is not guaranteed to finalize for every CFG, we
1588 /// only allow it to proceed for a limited number of iterations (controlled
1589 /// by -sample-profile-max-propagate-iterations).
1590 ///
1591 /// FIXME: Try to replace this propagation heuristic with a scheme
1592 /// that is guaranteed to finalize. A work-list approach similar to
1593 /// the standard value propagation algorithm used by SSA-CCP might
1594 /// work here.
1595 ///
1596 /// Once all the branch weights are computed, we emit the MD_prof
1597 /// metadata on BB using the computed values for each of its branches.
1598 ///
1599 /// \param F The function to query.
1600 ///
1601 /// \returns true if \p F was modified. Returns false, otherwise.
1602 bool SampleProfileLoader::emitAnnotations(Function &F) {
1603  bool Changed = false;
1604 
1605  if (getFunctionLoc(F) == 0)
1606  return false;
1607 
1608  LLVM_DEBUG(dbgs() << "Line number for the first instruction in "
1609  << F.getName() << ": " << getFunctionLoc(F) << "\n");
1610 
1611  DenseSet<GlobalValue::GUID> InlinedGUIDs;
1612  Changed |= inlineHotFunctions(F, InlinedGUIDs);
1613 
1614  // Compute basic block weights.
1615  Changed |= computeBlockWeights(F);
1616 
1617  if (Changed) {
1618  // Add an entry count to the function using the samples gathered at the
1619  // function entry.
1620  // Sets the GUIDs that are inlined in the profiled binary. This is used
1621  // for ThinLink to make correct liveness analysis, and also make the IR
1622  // match the profiled binary before annotation.
1623  F.setEntryCount(
1624  ProfileCount(Samples->getHeadSamples() + 1, Function::PCT_Real),
1625  &InlinedGUIDs);
1626 
1627  // Compute dominance and loop info needed for propagation.
1628  computeDominanceAndLoopInfo(F);
1629 
1630  // Find equivalence classes.
1631  findEquivalenceClasses(F);
1632 
1633  // Propagate weights to all edges.
1634  propagateWeights(F);
1635  }
1636 
1637  // If coverage checking was requested, compute it now.
1639  unsigned Used = CoverageTracker.countUsedRecords(Samples, PSI);
1640  unsigned Total = CoverageTracker.countBodyRecords(Samples, PSI);
1641  unsigned Coverage = CoverageTracker.computeCoverage(Used, Total);
1642  if (Coverage < SampleProfileRecordCoverage) {
1644  F.getSubprogram()->getFilename(), getFunctionLoc(F),
1645  Twine(Used) + " of " + Twine(Total) + " available profile records (" +
1646  Twine(Coverage) + "%) were applied",
1647  DS_Warning));
1648  }
1649  }
1650 
1652  uint64_t Used = CoverageTracker.getTotalUsedSamples();
1653  uint64_t Total = CoverageTracker.countBodySamples(Samples, PSI);
1654  unsigned Coverage = CoverageTracker.computeCoverage(Used, Total);
1655  if (Coverage < SampleProfileSampleCoverage) {
1657  F.getSubprogram()->getFilename(), getFunctionLoc(F),
1658  Twine(Used) + " of " + Twine(Total) + " available profile samples (" +
1659  Twine(Coverage) + "%) were applied",
1660  DS_Warning));
1661  }
1662  }
1663  return Changed;
1664 }
1665 
1667 
1668 INITIALIZE_PASS_BEGIN(SampleProfileLoaderLegacyPass, "sample-profile",
1669  "Sample Profile loader", false, false)
1673 INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass, "sample-profile",
1674  "Sample Profile loader", false, false)
1675 
1676 bool SampleProfileLoader::doInitialization(Module &M) {
1677  auto &Ctx = M.getContext();
1678 
1679  std::unique_ptr<SampleProfileReaderItaniumRemapper> RemapReader;
1680  auto ReaderOrErr =
1681  SampleProfileReader::create(Filename, Ctx, RemappingFilename);
1682  if (std::error_code EC = ReaderOrErr.getError()) {
1683  std::string Msg = "Could not open profile: " + EC.message();
1684  Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
1685  return false;
1686  }
1687  Reader = std::move(ReaderOrErr.get());
1688  Reader->collectFuncsFrom(M);
1689  ProfileIsValid = (Reader->read() == sampleprof_error::success);
1690  PSL = Reader->getProfileSymbolList();
1691 
1692  // While profile-sample-accurate is on, ignore symbol list.
1693  ProfAccForSymsInList =
1695  if (ProfAccForSymsInList) {
1696  NamesInProfile.clear();
1697  if (auto NameTable = Reader->getNameTable())
1698  NamesInProfile.insert(NameTable->begin(), NameTable->end());
1699  }
1700 
1701  return true;
1702 }
1703 
1705  return new SampleProfileLoaderLegacyPass();
1706 }
1707 
1709  return new SampleProfileLoaderLegacyPass(Name);
1710 }
1711 
1712 bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM,
1713  ProfileSummaryInfo *_PSI) {
1714  GUIDToFuncNameMapper Mapper(M, *Reader, GUIDToFuncNameMap);
1715  if (!ProfileIsValid)
1716  return false;
1717 
1718  PSI = _PSI;
1719  if (M.getProfileSummary(/* IsCS */ false) == nullptr)
1720  M.setProfileSummary(Reader->getSummary().getMD(M.getContext()),
1722 
1723  // Compute the total number of samples collected in this profile.
1724  for (const auto &I : Reader->getProfiles())
1725  TotalCollectedSamples += I.second.getTotalSamples();
1726 
1727  // Populate the symbol map.
1728  for (const auto &N_F : M.getValueSymbolTable()) {
1729  StringRef OrigName = N_F.getKey();
1730  Function *F = dyn_cast<Function>(N_F.getValue());
1731  if (F == nullptr)
1732  continue;
1733  SymbolMap[OrigName] = F;
1734  auto pos = OrigName.find('.');
1735  if (pos != StringRef::npos) {
1736  StringRef NewName = OrigName.substr(0, pos);
1737  auto r = SymbolMap.insert(std::make_pair(NewName, F));
1738  // Failiing to insert means there is already an entry in SymbolMap,
1739  // thus there are multiple functions that are mapped to the same
1740  // stripped name. In this case of name conflicting, set the value
1741  // to nullptr to avoid confusion.
1742  if (!r.second)
1743  r.first->second = nullptr;
1744  }
1745  }
1746 
1747  bool retval = false;
1748  for (auto &F : M)
1749  if (!F.isDeclaration()) {
1750  clearFunctionData();
1751  retval |= runOnFunction(F, AM);
1752  }
1753 
1754  // Account for cold calls not inlined....
1755  for (const std::pair<Function *, NotInlinedProfileInfo> &pair :
1756  notInlinedCallInfo)
1757  updateProfileCallee(pair.first, pair.second.entryCount);
1758 
1759  return retval;
1760 }
1761 
1762 bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) {
1763  ACT = &getAnalysis<AssumptionCacheTracker>();
1764  TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
1765  ProfileSummaryInfo *PSI =
1766  &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1767  return SampleLoader.runOnModule(M, nullptr, PSI);
1768 }
1769 
1771 
1772  DILocation2SampleMap.clear();
1773  // By default the entry count is initialized to -1, which will be treated
1774  // conservatively by getEntryCount as the same as unknown (None). This is
1775  // to avoid newly added code to be treated as cold. If we have samples
1776  // this will be overwritten in emitAnnotations.
1777  uint64_t initialEntryCount = -1;
1778 
1779  ProfAccForSymsInList = ProfileAccurateForSymsInList && PSL;
1780  if (ProfileSampleAccurate || F.hasFnAttribute("profile-sample-accurate")) {
1781  // initialize all the function entry counts to 0. It means all the
1782  // functions without profile will be regarded as cold.
1783  initialEntryCount = 0;
1784  // profile-sample-accurate is a user assertion which has a higher precedence
1785  // than symbol list. When profile-sample-accurate is on, ignore symbol list.
1786  ProfAccForSymsInList = false;
1787  }
1788 
1789  // PSL -- profile symbol list include all the symbols in sampled binary.
1790  // If ProfileAccurateForSymsInList is enabled, PSL is used to treat
1791  // old functions without samples being cold, without having to worry
1792  // about new and hot functions being mistakenly treated as cold.
1793  if (ProfAccForSymsInList) {
1794  // Initialize the entry count to 0 for functions in the list.
1795  if (PSL->contains(F.getName()))
1796  initialEntryCount = 0;
1797 
1798  // Function in the symbol list but without sample will be regarded as
1799  // cold. To minimize the potential negative performance impact it could
1800  // have, we want to be a little conservative here saying if a function
1801  // shows up in the profile, no matter as outline function, inline instance
1802  // or call targets, treat the function as not being cold. This will handle
1803  // the cases such as most callsites of a function are inlined in sampled
1804  // binary but not inlined in current build (because of source code drift,
1805  // imprecise debug information, or the callsites are all cold individually
1806  // but not cold accumulatively...), so the outline function showing up as
1807  // cold in sampled binary will actually not be cold after current build.
1809  if (NamesInProfile.count(CanonName))
1810  initialEntryCount = -1;
1811  }
1812 
1813  F.setEntryCount(ProfileCount(initialEntryCount, Function::PCT_Real));
1814  std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;
1815  if (AM) {
1816  auto &FAM =
1818  .getManager();
1819  ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
1820  } else {
1821  OwnedORE = std::make_unique<OptimizationRemarkEmitter>(&F);
1822  ORE = OwnedORE.get();
1823  }
1824  Samples = Reader->getSamplesFor(F);
1825  if (Samples && !Samples->empty())
1826  return emitAnnotations(F);
1827  return false;
1828 }
1829 
1831  ModuleAnalysisManager &AM) {
1833  AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
1834 
1835  auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
1836  return FAM.getResult<AssumptionAnalysis>(F);
1837  };
1838  auto GetTTI = [&](Function &F) -> TargetTransformInfo & {
1839  return FAM.getResult<TargetIRAnalysis>(F);
1840  };
1841 
1842  SampleProfileLoader SampleLoader(
1843  ProfileFileName.empty() ? SampleProfileFile : ProfileFileName,
1844  ProfileRemappingFileName.empty() ? SampleProfileRemappingFile
1845  : ProfileRemappingFileName,
1846  IsThinLTOPreLink, GetAssumptionCache, GetTTI);
1847 
1848  SampleLoader.doInitialization(M);
1849 
1851  if (!SampleLoader.runOnModule(M, &AM, PSI))
1852  return PreservedAnalyses::all();
1853 
1854  return PreservedAnalyses::none();
1855 }
uint64_t CallInst * C
const FunctionSamplesMap * findFunctionSamplesMapAt(const LineLocation &Loc) const
Returns the FunctionSamplesMap at the given Loc.
Definition: SampleProf.h:367
static uint64_t getGUID(StringRef Name)
Definition: SampleProf.h:566
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:641
Thresholds to tune inline cost analysis.
Definition: InlineCost.h:153
Represents either an error or a value T.
Definition: ErrorOr.h:56
InlineCost getInlineCost(CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI, std::function< AssumptionCache &(Function &)> &GetAssumptionCache, Optional< function_ref< BlockFrequencyInfo &(Function &)>> GetBFI, ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE=nullptr)
Get an InlineCost object representing the cost of inlining this callsite.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
DiagnosticInfoOptimizationBase::Argument NV
bool isNever() const
Definition: InlineCost.h:104
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:777
This class represents lattice values for constants.
Definition: AllocatorList.h:23
sample profile
A Module instance is used to store all the information related to an LLVM module. ...
Definition: Module.h:66
static ErrorOr< std::unique_ptr< SampleProfileReader > > create(const std::string Filename, LLVMContext &C, const std::string RemapFilename="")
Create a sample profile reader appropriate to the file format.
BasicBlock * getSuccessor(unsigned Idx) const
Return the specified successor. This instruction must be a terminator.
amdgpu Simplify well known AMD library false FunctionCallee Value const Twine & Name
virtual void collectFuncsFrom(const Module &M)
bool isColdCount(uint64_t C)
Returns true if count C is considered cold.
const ValueSymbolTable & getValueSymbolTable() const
Get the symbol table of global variable and function identifiers.
Definition: Module.h:569
Analysis providing profile information.
This class represents a function call, abstracting a target machine&#39;s calling convention.
An immutable pass that tracks lazily created AssumptionCache objects.
Metadata * getProfileSummary(bool IsCS)
Returns profile summary metadata.
Definition: Module.cpp:541
A cache of @llvm.assume calls within a function.
Analysis pass providing the TargetTransformInfo.
static cl::opt< bool > ProfileAccurateForSymsInList("profile-accurate-for-symsinlist", cl::Hidden, cl::ZeroOrMore, cl::init(true), cl::desc("For symbols in profile symbol list, regard their profiles to " "be accurate. It may be overriden by profile-sample-accurate. "))
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:743
bool isLegalToPromote(CallSite CS, Function *Callee, const char **FailureReason=nullptr)
Return true if the given indirect call site can be made to call Callee.
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.h:323
unsigned getLine() const
Definition: DebugLoc.cpp:25
DenseMap< uint64_t, StringRef > * GUIDToFuncNameMap
GUIDToFuncNameMap saves the mapping from GUID to the symbol name, for all the function symbols define...
Definition: SampleProf.h:561
ErrorOr< uint64_t > findSamplesAt(uint32_t LineOffset, uint32_t Discriminator) const
Return the number of samples collected at the given location.
Definition: SampleProf.h:340
A debug info location.
Definition: DebugLoc.h:33
F(f)
uint64_t getOrCompHotCountThreshold()
Returns HotCountThreshold if set.
FunTy * getCalledFunction() const
Return the function being called if this is a direct call, otherwise return null (if it&#39;s an indirect...
Definition: CallSite.h:111
void findInlinedFunctions(DenseSet< GlobalValue::GUID > &S, const Module *M, uint64_t Threshold) const
Recursively traverses all children, if the total sample count of the corresponding function is no les...
Definition: SampleProf.h:466
This class captures the data input to the InlineFunction call, and records the auxiliary results prod...
Definition: Cloning.h:173
static SmallVector< InstrProfValueData, 2 > GetSortedValueDataFromCallTargets(const SampleRecord::CallTargetMap &M)
Returns the sorted CallTargetMap M by count in descending order.
bool isHotCount(uint64_t C)
Returns true if count C is considered hot.
Represents the cost of inlining a function.
Definition: InlineCost.h:63
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
Metadata * getMD(LLVMContext &Context)
Return summary information as metadata.
Representation of the samples collected for a function.
Definition: SampleProf.h:300
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:195
static const SortedCallTargetSet SortCallTargets(const CallTargetMap &Targets)
Sort call targets in descending order of call frequency.
Definition: SampleProf.h:259
AnalysisUsage & addRequired()
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:50
Definition: BitVector.h:937
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:80
void setEntryCount(ProfileCount Count, const DenseSet< GlobalValue::GUID > *Imports=nullptr)
Set the entry count for this function.
Definition: Function.cpp:1529
An analysis pass based on legacy pass manager to deliver ProfileSummaryInfo.
static cl::opt< std::string > SampleProfileRemappingFile("sample-profile-remapping-file", cl::init(""), cl::value_desc("filename"), cl::desc("Profile remapping file loaded by -sample-profile"), cl::Hidden)
LLVMContext & getContext() const
Get the global data context.
Definition: Module.h:245
Diagnostic information for optimization analysis remarks.
static StringRef getName(Value *V)
LLVM_NODISCARD StringRef substr(size_t Start, size_t N=npos) const
Return a reference to the substring from [Start, Start + N).
Definition: StringRef.h:592
StringRef getFilename() const
BlockT * getHeader() const
Definition: LoopInfo.h:105
Interval::succ_iterator succ_begin(Interval *I)
succ_begin/succ_end - define methods so that Intervals may be used just like BasicBlocks can with the...
Definition: Interval.h:102
Subprogram description.
const Instruction * getFirstNonPHIOrDbgOrLifetime() const
Returns a pointer to the first instruction in this block that is not a PHINode, a debug intrinsic...
Definition: BasicBlock.cpp:210
bool empty() const
Definition: Module.h:608
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory)...
Definition: APInt.h:32
void verifyMisExpect(llvm::Instruction *I, const llvm::SmallVector< uint32_t, 4 > &Weights, llvm::LLVMContext &Ctx)
verifyMisExpect - compares PGO counters to the thresholds used for llvm.expect and warns if the PGO c...
Definition: MisExpect.cpp:95
bool extractProfTotalWeight(uint64_t &TotalVal) const
Retrieve total raw weight values of a branch.
Definition: Metadata.cpp:1336
static cl::opt< std::string > SampleProfileFile("sample-profile-file", cl::init(""), cl::value_desc("filename"), cl::desc("Profile file loaded by -sample-profile"), cl::Hidden)
Debug location.
ModulePass * createSampleProfileLoaderPass()
Instruction * promoteIndirectCall(Instruction *Inst, Function *F, uint64_t Count, uint64_t TotalCount, bool AttachProfToDirectCall, OptimizationRemarkEmitter *ORE)
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree...
Definition: Dominators.h:144
unsigned getNumSuccessors() const
Return the number of successors that this instruction has.
static cl::opt< unsigned > SampleProfileRecordCoverage("sample-profile-check-record-coverage", cl::init(0), cl::value_desc("N"), cl::desc("Emit a warning if less than N% of records in the input profile " "are matched to the IR."))
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition: PassManager.h:157
Interval::succ_iterator succ_end(Interval *I)
Definition: Interval.h:105
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:150
const BasicBlock & getEntryBlock() const
Definition: Function.h:669
static bool runOnFunction(Function &F, bool PostInlining)
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:432
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Wrapper pass for TargetTransformInfo.
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:154
* if(!EatIfPresent(lltok::kw_thread_local)) return false
ParseOptionalThreadLocal := /*empty.
LLVM Basic Block Representation.
Definition: BasicBlock.h:57
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:64
static cl::opt< unsigned > SampleProfileMaxPropagateIterations("sample-profile-max-propagate-iterations", cl::init(100), cl::desc("Maximum number of iterations to go through when propagating " "sample block/edge weights through the CFG."))
DISubprogram * getSubprogram() const
Get the attached subprogram.
Definition: Metadata.cpp:1522
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
const BodySampleMap & getBodySamples() const
Return all the samples collected in the body of the function.
Definition: SampleProf.h:434
FunctionSamples * getSamplesFor(const Function &F)
Return the samples collected for function F.
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:134
std::error_code read()
The interface to read sample profiles from the associated file.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:370
Diagnostic information for applied optimization remarks.
Interval::pred_iterator pred_begin(Interval *I)
pred_begin/pred_end - define methods so that Intervals may be used just like BasicBlocks can with the...
Definition: Interval.h:112
static StringRef getCanonicalFnName(const Function &F)
Return the canonical name for a function, taking into account suffix elision policy attributes...
Definition: SampleProf.h:499
LLVM_NODISCARD size_t find(char C, size_t From=0) const
Search for the first character C in the string.
Definition: StringRef.h:299
Function::ProfileCount ProfileCount
Represent the analysis usage information of a pass.
Interval::pred_iterator pred_end(Interval *I)
Definition: Interval.h:115
DenseMap< SymbolStringPtr, JITEvaluatedSymbol > SymbolMap
A map from symbol names (as SymbolStringPtrs) to JITSymbols (address/flags pairs).
Definition: Core.h:50
void initializeSampleProfileLoaderLegacyPassPass(PassRegistry &)
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function. ...
Definition: Function.cpp:205
InlineResult InlineFunction(CallBase *CB, InlineFunctionInfo &IFI, AAResults *CalleeAAR=nullptr, bool InsertLifetime=true)
This function inlines the called function into the basic block of the caller.
Used in the streaming interface as the general argument type.
void annotateValueSite(Module &M, Instruction &Inst, const InstrProfRecord &InstrProfR, InstrProfValueKind ValueKind, uint32_t SiteIndx, uint32_t MaxMDCount=3)
Get the value profile data for value site SiteIdx from InstrProfR and annotate the instruction Inst w...
Definition: InstrProf.cpp:931
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:160
std::map< std::string, FunctionSamples, std::less<> > FunctionSamplesMap
Definition: SampleProf.h:292
Class to represent profile counts.
Definition: Function.h:260
size_t size() const
Definition: SmallVector.h:52
MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight)
Return metadata containing two branch weights.
Definition: MDBuilder.cpp:37
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1222
StringRef getFuncNameInModule(const Module *M) const
Return the original function name if it exists in Module M.
Definition: SampleProf.h:493
Optional< bool > ComputeFullInlineCost
Compute inline cost even when the cost has exceeded the threshold.
Definition: InlineCost.h:180
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1095
constexpr bool empty(const T &RangeOrContainer)
Test whether RangeOrContainer is empty. Similar to C++17 std::empty.
Definition: STLExtras.h:197
const FunctionSamples * findFunctionSamplesAt(const LineLocation &Loc, StringRef CalleeName) const
Returns a pointer to FunctionSamples at the given callsite location Loc with callee CalleeName...
Definition: SampleProf.h:378
A function analysis which provides an AssumptionCache.
virtual std::unique_ptr< ProfileSymbolList > getProfileSymbolList()
const InstListType & getInstList() const
Return the underlying instruction list container.
Definition: BasicBlock.h:338
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements...
Definition: SmallPtrSet.h:417
Align max(MaybeAlign Lhs, Align Rhs)
Definition: Alignment.h:390
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
InlineParams getInlineParams()
Generate the parameters to tune the inline cost analysis based only on the commandline options...
This is a &#39;vector&#39; (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:837
Module.h This file contains the declarations for the Module class.
static unsigned getOffset(const DILocation *DIL)
Returns the line offset to the start line of the subprogram.
Definition: SampleProf.cpp:171
virtual std::vector< StringRef > * getNameTable()
It includes all the names that have samples either in outline instance or inline instance.
static cl::opt< bool > ProfileSampleAccurate("profile-sample-accurate", cl::Hidden, cl::init(false), cl::desc("If the sample profile is accurate, we will mark all un-sampled " "callsite and function as having 0 samples. Otherwise, treat " "un-sampled callsites and functions conservatively as unknown. "))
ErrorOr< SampleRecord::CallTargetMap > findCallTargetMapAt(uint32_t LineOffset, uint32_t Discriminator) const
Returns the call target map collected at a given location.
Definition: SampleProf.h:353
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
StringMap - This is an unconventional map that is specialized for handling keys that are "strings"...
Definition: StringMap.h:242
StringMap< FunctionSamples > & getProfiles()
Return all the profiles.
amdgpu Simplify well known AMD library false FunctionCallee Callee
GUID getGUID() const
Return a 64-bit global unique ID constructed from global value name (i.e.
Definition: GlobalValue.h:510
PostDominatorTree Class - Concrete subclass of DominatorTree that is used to compute the post-dominat...
static void clear(coro::Shape &Shape)
Definition: Coroutines.cpp:225
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:467
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
unsigned getBaseDiscriminator() const
Returns the base discriminator stored in the discriminator.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:331
#define DEBUG_TYPE
static const size_t npos
Definition: StringRef.h:50
Represents the relative location of an instruction.
Definition: SampleProf.h:179
static cl::opt< unsigned > SampleProfileSampleCoverage("sample-profile-check-sample-coverage", cl::init(0), cl::value_desc("N"), cl::desc("Emit a warning if less than N% of samples in the input profile " "are matched to the IR."))
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:509
StringRef getName() const
Return a constant reference to the value&#39;s name.
Definition: Value.cpp:214
Establish a view to a call site for examination.
Definition: CallSite.h:906
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:106
#define I(x, y, z)
Definition: MD5.cpp:58
ModulePass class - This class is used to implement unstructured interprocedural optimizations and ana...
Definition: Pass.h:224
iterator end()
Definition: DenseMap.h:82
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:332
StringRef getName() const
Return the function name.
Definition: SampleProf.h:490
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:91
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
void updateProfileCallee(Function *Callee, int64_t entryDelta, const ValueMap< const Value *, WeakTrackingVH > *VMap=nullptr)
Updates profile information by adjusting the entry count by adding entryDelta then scaling callsite i...
INITIALIZE_PASS_BEGIN(SampleProfileLoaderLegacyPass, "sample-profile", "Sample Profile loader", false, false) INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass
bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition: Globals.cpp:231
Provides ErrorOr<T> smart pointer.
ProfileSummary & getSummary() const
Return the profile summary.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:575
uint64_t getTotalSamples() const
Return the total number of samples collected inside the function.
Definition: SampleProf.h:404
sample Sample Profile loader
const CallsiteSampleMap & getCallsiteSamples() const
Return all the callsite samples collected in the body of the function.
Definition: SampleProf.h:437
StringSet - A wrapper for StringMap that provides set-like functionality.
Definition: StringSet.h:27
Sample-based profile reader.
This class implements an extremely fast bulk output stream that can only output to a stream...
Definition: raw_ostream.h:45
print Print MemDeps of function
This file defines a set of templates that efficiently compute a dominator tree over a generic graph...
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:48
A container for analyses that lazily runs them and caches their results.
This pass exposes codegen information to IR-level passes.
uint64_t getEntrySamples() const
Return the sample count of the first instruction of the function.
Definition: SampleProf.h:415
static cl::opt< bool > NoWarnSampleUnused("no-warn-sample-unused", cl::init(false), cl::Hidden, cl::desc("Use this option to turn off/on warnings about function with " "samples but without debug information to use those samples. "))
This header defines various interfaces for pass management in LLVM.
Diagnostic information for the sample profiler.
void setProfileSummary(Metadata *M, ProfileSummary::Kind Kind)
Attach profile summary metadata to this module.
Definition: Module.cpp:534
#define LLVM_DEBUG(X)
Definition: Debug.h:122
void dump(raw_ostream &OS=dbgs())
Print all the profiles on stream OS.
The optimization diagnostic interface.
This file provides the interface for the sampled PGO loader pass.
reference get()
Definition: ErrorOr.h:156
const BasicBlock * getParent() const
Definition: Instruction.h:66
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
Definition: PassManager.h:1045