LLVM 18.0.0git
SampleProfile.cpp
Go to the documentation of this file.
1//===- SampleProfile.cpp - Incorporate sample profiles into the IR --------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the SampleProfileLoader transformation. This pass
10// reads a profile file generated by a sampling profiler (e.g. Linux Perf -
11// http://perf.wiki.kernel.org/) and generates IR metadata to reflect the
12// profile information in the given profile.
13//
14// This pass generates branch weight annotations on the IR:
15//
16// - prof: Represents branch weights. This annotation is added to branches
17// to indicate the weights of each edge coming out of the branch.
18// The weight of each edge is the weight of the target block for
19// that edge. The weight of a block B is computed as the maximum
20// number of samples found in B.
21//
22//===----------------------------------------------------------------------===//
23
25#include "llvm/ADT/ArrayRef.h"
26#include "llvm/ADT/DenseMap.h"
27#include "llvm/ADT/DenseSet.h"
28#include "llvm/ADT/MapVector.h"
32#include "llvm/ADT/Statistic.h"
33#include "llvm/ADT/StringMap.h"
34#include "llvm/ADT/StringRef.h"
35#include "llvm/ADT/Twine.h"
46#include "llvm/IR/BasicBlock.h"
47#include "llvm/IR/DebugLoc.h"
49#include "llvm/IR/Function.h"
50#include "llvm/IR/GlobalValue.h"
51#include "llvm/IR/InstrTypes.h"
52#include "llvm/IR/Instruction.h"
55#include "llvm/IR/LLVMContext.h"
56#include "llvm/IR/MDBuilder.h"
57#include "llvm/IR/Module.h"
58#include "llvm/IR/PassManager.h"
59#include "llvm/IR/PseudoProbe.h"
66#include "llvm/Support/Debug.h"
70#include "llvm/Transforms/IPO.h"
80#include <algorithm>
81#include <cassert>
82#include <cstdint>
83#include <functional>
84#include <limits>
85#include <map>
86#include <memory>
87#include <queue>
88#include <string>
89#include <system_error>
90#include <utility>
91#include <vector>
92
93using namespace llvm;
94using namespace sampleprof;
95using namespace llvm::sampleprofutil;
97#define DEBUG_TYPE "sample-profile"
98#define CSINLINE_DEBUG DEBUG_TYPE "-inline"
99
100STATISTIC(NumCSInlined,
101 "Number of functions inlined with context sensitive profile");
102STATISTIC(NumCSNotInlined,
103 "Number of functions not inlined with context sensitive profile");
104STATISTIC(NumMismatchedProfile,
105 "Number of functions with CFG mismatched profile");
106STATISTIC(NumMatchedProfile, "Number of functions with CFG matched profile");
107STATISTIC(NumDuplicatedInlinesite,
108 "Number of inlined callsites with a partial distribution factor");
109
110STATISTIC(NumCSInlinedHitMinLimit,
111 "Number of functions with FDO inline stopped due to min size limit");
112STATISTIC(NumCSInlinedHitMaxLimit,
113 "Number of functions with FDO inline stopped due to max size limit");
115 NumCSInlinedHitGrowthLimit,
116 "Number of functions with FDO inline stopped due to growth size limit");
117
118// Command line option to specify the file to read samples from. This is
119// mainly used for debugging.
121 "sample-profile-file", cl::init(""), cl::value_desc("filename"),
122 cl::desc("Profile file loaded by -sample-profile"), cl::Hidden);
123
124// The named file contains a set of transformations that may have been applied
125// to the symbol names between the program from which the sample data was
126// collected and the current program's symbols.
128 "sample-profile-remapping-file", cl::init(""), cl::value_desc("filename"),
129 cl::desc("Profile remapping file loaded by -sample-profile"), cl::Hidden);
130
132 "salvage-stale-profile", cl::Hidden, cl::init(false),
133 cl::desc("Salvage stale profile by fuzzy matching and use the remapped "
134 "location for sample profile query."));
135
137 "report-profile-staleness", cl::Hidden, cl::init(false),
138 cl::desc("Compute and report stale profile statistical metrics."));
139
141 "persist-profile-staleness", cl::Hidden, cl::init(false),
142 cl::desc("Compute stale profile statistical metrics and write it into the "
143 "native object file(.llvm_stats section)."));
144
146 "profile-sample-accurate", cl::Hidden, cl::init(false),
147 cl::desc("If the sample profile is accurate, we will mark all un-sampled "
148 "callsite and function as having 0 samples. Otherwise, treat "
149 "un-sampled callsites and functions conservatively as unknown. "));
150
152 "profile-sample-block-accurate", cl::Hidden, cl::init(false),
153 cl::desc("If the sample profile is accurate, we will mark all un-sampled "
154 "branches and calls as having 0 samples. Otherwise, treat "
155 "them conservatively as unknown. "));
156
158 "profile-accurate-for-symsinlist", cl::Hidden, cl::init(true),
159 cl::desc("For symbols in profile symbol list, regard their profiles to "
160 "be accurate. It may be overriden by profile-sample-accurate. "));
161
163 "sample-profile-merge-inlinee", cl::Hidden, cl::init(true),
164 cl::desc("Merge past inlinee's profile to outline version if sample "
165 "profile loader decided not to inline a call site. It will "
166 "only be enabled when top-down order of profile loading is "
167 "enabled. "));
168
170 "sample-profile-top-down-load", cl::Hidden, cl::init(true),
171 cl::desc("Do profile annotation and inlining for functions in top-down "
172 "order of call graph during sample profile loading. It only "
173 "works for new pass manager. "));
174
175static cl::opt<bool>
176 UseProfiledCallGraph("use-profiled-call-graph", cl::init(true), cl::Hidden,
177 cl::desc("Process functions in a top-down order "
178 "defined by the profiled call graph when "
179 "-sample-profile-top-down-load is on."));
180
182 "sample-profile-inline-size", cl::Hidden, cl::init(false),
183 cl::desc("Inline cold call sites in profile loader if it's beneficial "
184 "for code size."));
185
186// Since profiles are consumed by many passes, turning on this option has
187// side effects. For instance, pre-link SCC inliner would see merged profiles
188// and inline the hot functions (that are skipped in this pass).
190 "disable-sample-loader-inlining", cl::Hidden, cl::init(false),
191 cl::desc("If true, artifically skip inline transformation in sample-loader "
192 "pass, and merge (or scale) profiles (as configured by "
193 "--sample-profile-merge-inlinee)."));
194
195namespace llvm {
197 SortProfiledSCC("sort-profiled-scc-member", cl::init(true), cl::Hidden,
198 cl::desc("Sort profiled recursion by edge weights."));
199
201 "sample-profile-inline-growth-limit", cl::Hidden, cl::init(12),
202 cl::desc("The size growth ratio limit for proirity-based sample profile "
203 "loader inlining."));
204
206 "sample-profile-inline-limit-min", cl::Hidden, cl::init(100),
207 cl::desc("The lower bound of size growth limit for "
208 "proirity-based sample profile loader inlining."));
209
211 "sample-profile-inline-limit-max", cl::Hidden, cl::init(10000),
212 cl::desc("The upper bound of size growth limit for "
213 "proirity-based sample profile loader inlining."));
214
216 "sample-profile-hot-inline-threshold", cl::Hidden, cl::init(3000),
217 cl::desc("Hot callsite threshold for proirity-based sample profile loader "
218 "inlining."));
219
221 "sample-profile-cold-inline-threshold", cl::Hidden, cl::init(45),
222 cl::desc("Threshold for inlining cold callsites"));
223} // namespace llvm
224
226 "sample-profile-icp-relative-hotness", cl::Hidden, cl::init(25),
227 cl::desc(
228 "Relative hotness percentage threshold for indirect "
229 "call promotion in proirity-based sample profile loader inlining."));
230
232 "sample-profile-icp-relative-hotness-skip", cl::Hidden, cl::init(1),
233 cl::desc(
234 "Skip relative hotness check for ICP up to given number of targets."));
235
237 "sample-profile-prioritized-inline", cl::Hidden,
238
239 cl::desc("Use call site prioritized inlining for sample profile loader."
240 "Currently only CSSPGO is supported."));
241
243 "sample-profile-use-preinliner", cl::Hidden,
244
245 cl::desc("Use the preinliner decisions stored in profile context."));
246
248 "sample-profile-recursive-inline", cl::Hidden,
249
250 cl::desc("Allow sample loader inliner to inline recursive calls."));
251
253 "sample-profile-inline-replay", cl::init(""), cl::value_desc("filename"),
254 cl::desc(
255 "Optimization remarks file containing inline remarks to be replayed "
256 "by inlining from sample profile loader."),
257 cl::Hidden);
258
260 "sample-profile-inline-replay-scope",
261 cl::init(ReplayInlinerSettings::Scope::Function),
262 cl::values(clEnumValN(ReplayInlinerSettings::Scope::Function, "Function",
263 "Replay on functions that have remarks associated "
264 "with them (default)"),
265 clEnumValN(ReplayInlinerSettings::Scope::Module, "Module",
266 "Replay on the entire module")),
267 cl::desc("Whether inline replay should be applied to the entire "
268 "Module or just the Functions (default) that are present as "
269 "callers in remarks during sample profile inlining."),
270 cl::Hidden);
271
273 "sample-profile-inline-replay-fallback",
274 cl::init(ReplayInlinerSettings::Fallback::Original),
277 ReplayInlinerSettings::Fallback::Original, "Original",
278 "All decisions not in replay send to original advisor (default)"),
279 clEnumValN(ReplayInlinerSettings::Fallback::AlwaysInline,
280 "AlwaysInline", "All decisions not in replay are inlined"),
281 clEnumValN(ReplayInlinerSettings::Fallback::NeverInline, "NeverInline",
282 "All decisions not in replay are not inlined")),
283 cl::desc("How sample profile inline replay treats sites that don't come "
284 "from the replay. Original: defers to original advisor, "
285 "AlwaysInline: inline all sites not in replay, NeverInline: "
286 "inline no sites not in replay"),
287 cl::Hidden);
288
290 "sample-profile-inline-replay-format",
291 cl::init(CallSiteFormat::Format::LineColumnDiscriminator),
293 clEnumValN(CallSiteFormat::Format::Line, "Line", "<Line Number>"),
294 clEnumValN(CallSiteFormat::Format::LineColumn, "LineColumn",
295 "<Line Number>:<Column Number>"),
296 clEnumValN(CallSiteFormat::Format::LineDiscriminator,
297 "LineDiscriminator", "<Line Number>.<Discriminator>"),
298 clEnumValN(CallSiteFormat::Format::LineColumnDiscriminator,
299 "LineColumnDiscriminator",
300 "<Line Number>:<Column Number>.<Discriminator> (default)")),
301 cl::desc("How sample profile inline replay file is formatted"), cl::Hidden);
302
304 MaxNumPromotions("sample-profile-icp-max-prom", cl::init(3), cl::Hidden,
305 cl::desc("Max number of promotions for a single indirect "
306 "call callsite in sample profile loader"));
307
309 "overwrite-existing-weights", cl::Hidden, cl::init(false),
310 cl::desc("Ignore existing branch weights on IR and always overwrite."));
311
313 "annotate-sample-profile-inline-phase", cl::Hidden, cl::init(false),
314 cl::desc("Annotate LTO phase (prelink / postlink), or main (no LTO) for "
315 "sample-profile inline pass name."));
316
317namespace llvm {
319}
320
321namespace {
322
323using BlockWeightMap = DenseMap<const BasicBlock *, uint64_t>;
324using EquivalenceClassMap = DenseMap<const BasicBlock *, const BasicBlock *>;
325using Edge = std::pair<const BasicBlock *, const BasicBlock *>;
326using EdgeWeightMap = DenseMap<Edge, uint64_t>;
327using BlockEdgeMap =
329
330class GUIDToFuncNameMapper {
331public:
332 GUIDToFuncNameMapper(Module &M, SampleProfileReader &Reader,
333 DenseMap<uint64_t, StringRef> &GUIDToFuncNameMap)
334 : CurrentReader(Reader), CurrentModule(M),
335 CurrentGUIDToFuncNameMap(GUIDToFuncNameMap) {
336 if (!CurrentReader.useMD5())
337 return;
338
339 for (const auto &F : CurrentModule) {
340 StringRef OrigName = F.getName();
341 CurrentGUIDToFuncNameMap.insert(
342 {Function::getGUID(OrigName), OrigName});
343
344 // Local to global var promotion used by optimization like thinlto
345 // will rename the var and add suffix like ".llvm.xxx" to the
346 // original local name. In sample profile, the suffixes of function
347 // names are all stripped. Since it is possible that the mapper is
348 // built in post-thin-link phase and var promotion has been done,
349 // we need to add the substring of function name without the suffix
350 // into the GUIDToFuncNameMap.
352 if (CanonName != OrigName)
353 CurrentGUIDToFuncNameMap.insert(
354 {Function::getGUID(CanonName), CanonName});
355 }
356
357 // Update GUIDToFuncNameMap for each function including inlinees.
358 SetGUIDToFuncNameMapForAll(&CurrentGUIDToFuncNameMap);
359 }
360
361 ~GUIDToFuncNameMapper() {
362 if (!CurrentReader.useMD5())
363 return;
364
365 CurrentGUIDToFuncNameMap.clear();
366
367 // Reset GUIDToFuncNameMap for of each function as they're no
368 // longer valid at this point.
369 SetGUIDToFuncNameMapForAll(nullptr);
370 }
371
372private:
373 void SetGUIDToFuncNameMapForAll(DenseMap<uint64_t, StringRef> *Map) {
374 std::queue<FunctionSamples *> FSToUpdate;
375 for (auto &IFS : CurrentReader.getProfiles()) {
376 FSToUpdate.push(&IFS.second);
377 }
378
379 while (!FSToUpdate.empty()) {
380 FunctionSamples *FS = FSToUpdate.front();
381 FSToUpdate.pop();
382 FS->GUIDToFuncNameMap = Map;
383 for (const auto &ICS : FS->getCallsiteSamples()) {
384 const FunctionSamplesMap &FSMap = ICS.second;
385 for (const auto &IFS : FSMap) {
386 FunctionSamples &FS = const_cast<FunctionSamples &>(IFS.second);
387 FSToUpdate.push(&FS);
388 }
389 }
390 }
391 }
392
394 Module &CurrentModule;
395 DenseMap<uint64_t, StringRef> &CurrentGUIDToFuncNameMap;
396};
397
398// Inline candidate used by iterative callsite prioritized inliner
399struct InlineCandidate {
400 CallBase *CallInstr;
401 const FunctionSamples *CalleeSamples;
402 // Prorated callsite count, which will be used to guide inlining. For example,
403 // if a callsite is duplicated in LTO prelink, then in LTO postlink the two
404 // copies will get their own distribution factors and their prorated counts
405 // will be used to decide if they should be inlined independently.
406 uint64_t CallsiteCount;
407 // Call site distribution factor to prorate the profile samples for a
408 // duplicated callsite. Default value is 1.0.
409 float CallsiteDistribution;
410};
411
412// Inline candidate comparer using call site weight
413struct CandidateComparer {
414 bool operator()(const InlineCandidate &LHS, const InlineCandidate &RHS) {
415 if (LHS.CallsiteCount != RHS.CallsiteCount)
416 return LHS.CallsiteCount < RHS.CallsiteCount;
417
418 const FunctionSamples *LCS = LHS.CalleeSamples;
419 const FunctionSamples *RCS = RHS.CalleeSamples;
420 assert(LCS && RCS && "Expect non-null FunctionSamples");
421
422 // Tie breaker using number of samples try to favor smaller functions first
423 if (LCS->getBodySamples().size() != RCS->getBodySamples().size())
424 return LCS->getBodySamples().size() > RCS->getBodySamples().size();
425
426 // Tie breaker using GUID so we have stable/deterministic inlining order
427 return LCS->getGUID(LCS->getName()) < RCS->getGUID(RCS->getName());
428 }
429};
430
431using CandidateQueue =
433 CandidateComparer>;
434
435// Sample profile matching - fuzzy match.
436class SampleProfileMatcher {
437 Module &M;
438 SampleProfileReader &Reader;
439 const PseudoProbeManager *ProbeManager;
440 SampleProfileMap FlattenedProfiles;
441 // For each function, the matcher generates a map, of which each entry is a
442 // mapping from the source location of current build to the source location in
443 // the profile.
444 StringMap<LocToLocMap> FuncMappings;
445
446 // Profile mismatching statstics.
447 uint64_t TotalProfiledCallsites = 0;
448 uint64_t NumMismatchedCallsites = 0;
449 uint64_t MismatchedCallsiteSamples = 0;
450 uint64_t TotalCallsiteSamples = 0;
451 uint64_t TotalProfiledFunc = 0;
452 uint64_t NumMismatchedFuncHash = 0;
453 uint64_t MismatchedFuncHashSamples = 0;
454 uint64_t TotalFuncHashSamples = 0;
455
456 // A dummy name for unknown indirect callee, used to differentiate from a
457 // non-call instruction that also has an empty callee name.
458 static constexpr const char *UnknownIndirectCallee =
459 "unknown.indirect.callee";
460
461public:
462 SampleProfileMatcher(Module &M, SampleProfileReader &Reader,
463 const PseudoProbeManager *ProbeManager)
464 : M(M), Reader(Reader), ProbeManager(ProbeManager){};
465 void runOnModule();
466
467private:
468 FunctionSamples *getFlattenedSamplesFor(const Function &F) {
470 auto It = FlattenedProfiles.find(CanonFName);
471 if (It != FlattenedProfiles.end())
472 return &It->second;
473 return nullptr;
474 }
475 void runOnFunction(const Function &F);
476 void findIRAnchors(const Function &F,
477 std::map<LineLocation, StringRef> &IRAnchors);
478 void findProfileAnchors(const FunctionSamples &FS,
479 std::map<LineLocation, StringSet<>> &ProfileAnchors);
480 void countMismatchedSamples(const FunctionSamples &FS);
481 void countProfileMismatches(
482 const Function &F, const FunctionSamples &FS,
483 const std::map<LineLocation, StringRef> &IRAnchors,
484 const std::map<LineLocation, StringSet<>> &ProfileAnchors);
485 void countProfileCallsiteMismatches(
486 const FunctionSamples &FS,
487 const std::map<LineLocation, StringRef> &IRAnchors,
488 const std::map<LineLocation, StringSet<>> &ProfileAnchors,
489
490 uint64_t &FuncMismatchedCallsites, uint64_t &FuncProfiledCallsites);
491 LocToLocMap &getIRToProfileLocationMap(const Function &F) {
492 auto Ret = FuncMappings.try_emplace(
494 return Ret.first->second;
495 }
496 void distributeIRToProfileLocationMap();
497 void distributeIRToProfileLocationMap(FunctionSamples &FS);
498 void runStaleProfileMatching(
499 const Function &F, const std::map<LineLocation, StringRef> &IRAnchors,
500 const std::map<LineLocation, StringSet<>> &ProfileAnchors,
501 LocToLocMap &IRToProfileLocationMap);
502};
503
504/// Sample profile pass.
505///
506/// This pass reads profile data from the file specified by
507/// -sample-profile-file and annotates every affected function with the
508/// profile information found in that file.
509class SampleProfileLoader final : public SampleProfileLoaderBaseImpl<Function> {
510public:
511 SampleProfileLoader(
512 StringRef Name, StringRef RemapName, ThinOrFullLTOPhase LTOPhase,
514 std::function<AssumptionCache &(Function &)> GetAssumptionCache,
515 std::function<TargetTransformInfo &(Function &)> GetTargetTransformInfo,
516 std::function<const TargetLibraryInfo &(Function &)> GetTLI)
518 std::move(FS)),
519 GetAC(std::move(GetAssumptionCache)),
520 GetTTI(std::move(GetTargetTransformInfo)), GetTLI(std::move(GetTLI)),
521 LTOPhase(LTOPhase),
522 AnnotatedPassName(AnnotateSampleProfileInlinePhase
525 : CSINLINE_DEBUG) {}
526
527 bool doInitialization(Module &M, FunctionAnalysisManager *FAM = nullptr);
528 bool runOnModule(Module &M, ModuleAnalysisManager *AM,
530
531protected:
533 bool emitAnnotations(Function &F);
535 const FunctionSamples *findCalleeFunctionSamples(const CallBase &I) const;
536 const FunctionSamples *
537 findFunctionSamples(const Instruction &I) const override;
538 std::vector<const FunctionSamples *>
539 findIndirectCallFunctionSamples(const Instruction &I, uint64_t &Sum) const;
540 void findExternalInlineCandidate(CallBase *CB, const FunctionSamples *Samples,
541 DenseSet<GlobalValue::GUID> &InlinedGUIDs,
542 const StringMap<Function *> &SymbolMap,
543 uint64_t Threshold);
544 // Attempt to promote indirect call and also inline the promoted call
545 bool tryPromoteAndInlineCandidate(
546 Function &F, InlineCandidate &Candidate, uint64_t SumOrigin,
547 uint64_t &Sum, SmallVector<CallBase *, 8> *InlinedCallSites = nullptr);
548
549 bool inlineHotFunctions(Function &F,
550 DenseSet<GlobalValue::GUID> &InlinedGUIDs);
551 std::optional<InlineCost> getExternalInlineAdvisorCost(CallBase &CB);
552 bool getExternalInlineAdvisorShouldInline(CallBase &CB);
553 InlineCost shouldInlineCandidate(InlineCandidate &Candidate);
554 bool getInlineCandidate(InlineCandidate *NewCandidate, CallBase *CB);
555 bool
556 tryInlineCandidate(InlineCandidate &Candidate,
557 SmallVector<CallBase *, 8> *InlinedCallSites = nullptr);
558 bool
559 inlineHotFunctionsWithPriority(Function &F,
560 DenseSet<GlobalValue::GUID> &InlinedGUIDs);
561 // Inline cold/small functions in addition to hot ones
562 bool shouldInlineColdCallee(CallBase &CallInst);
563 void emitOptimizationRemarksForInlineCandidates(
564 const SmallVectorImpl<CallBase *> &Candidates, const Function &F,
565 bool Hot);
566 void promoteMergeNotInlinedContextSamples(
568 const Function &F);
569 std::vector<Function *> buildFunctionOrder(Module &M, LazyCallGraph &CG);
570 std::unique_ptr<ProfiledCallGraph> buildProfiledCallGraph(Module &M);
571 void generateMDProfMetadata(Function &F);
572
573 /// Map from function name to Function *. Used to find the function from
574 /// the function name. If the function name contains suffix, additional
575 /// entry is added to map from the stripped name to the function if there
576 /// is one-to-one mapping.
578
579 std::function<AssumptionCache &(Function &)> GetAC;
580 std::function<TargetTransformInfo &(Function &)> GetTTI;
581 std::function<const TargetLibraryInfo &(Function &)> GetTLI;
582
583 /// Profile tracker for different context.
584 std::unique_ptr<SampleContextTracker> ContextTracker;
585
586 /// Flag indicating which LTO/ThinLTO phase the pass is invoked in.
587 ///
588 /// We need to know the LTO phase because for example in ThinLTOPrelink
589 /// phase, in annotation, we should not promote indirect calls. Instead,
590 /// we will mark GUIDs that needs to be annotated to the function.
591 const ThinOrFullLTOPhase LTOPhase;
592 const std::string AnnotatedPassName;
593
594 /// Profle Symbol list tells whether a function name appears in the binary
595 /// used to generate the current profile.
596 std::unique_ptr<ProfileSymbolList> PSL;
597
598 /// Total number of samples collected in this profile.
599 ///
600 /// This is the sum of all the samples collected in all the functions executed
601 /// at runtime.
602 uint64_t TotalCollectedSamples = 0;
603
604 // Information recorded when we declined to inline a call site
605 // because we have determined it is too cold is accumulated for
606 // each callee function. Initially this is just the entry count.
607 struct NotInlinedProfileInfo {
608 uint64_t entryCount;
609 };
611
612 // GUIDToFuncNameMap saves the mapping from GUID to the symbol name, for
613 // all the function symbols defined or declared in current module.
614 DenseMap<uint64_t, StringRef> GUIDToFuncNameMap;
615
616 // All the Names used in FunctionSamples including outline function
617 // names, inline instance names and call target names.
618 StringSet<> NamesInProfile;
619
620 // For symbol in profile symbol list, whether to regard their profiles
621 // to be accurate. It is mainly decided by existance of profile symbol
622 // list and -profile-accurate-for-symsinlist flag, but it can be
623 // overriden by -profile-sample-accurate or profile-sample-accurate
624 // attribute.
625 bool ProfAccForSymsInList;
626
627 // External inline advisor used to replay inline decision from remarks.
628 std::unique_ptr<InlineAdvisor> ExternalInlineAdvisor;
629
630 // A helper to implement the sample profile matching algorithm.
631 std::unique_ptr<SampleProfileMatcher> MatchingManager;
632
633private:
634 const char *getAnnotatedRemarkPassName() const {
635 return AnnotatedPassName.c_str();
636 }
637};
638} // end anonymous namespace
639
640namespace llvm {
641template <>
643 return succ_empty(BB);
644}
645
646template <>
648 const std::vector<const BasicBlockT *> &BasicBlocks,
649 BlockEdgeMap &Successors, FlowFunction &Func) {
650 for (auto &Jump : Func.Jumps) {
651 const auto *BB = BasicBlocks[Jump.Source];
652 const auto *Succ = BasicBlocks[Jump.Target];
653 const Instruction *TI = BB->getTerminator();
654 // Check if a block ends with InvokeInst and mark non-taken branch unlikely.
655 // In that case block Succ should be a landing pad
656 if (Successors[BB].size() == 2 && Successors[BB].back() == Succ) {
657 if (isa<InvokeInst>(TI)) {
658 Jump.IsUnlikely = true;
659 }
660 }
661 const Instruction *SuccTI = Succ->getTerminator();
662 // Check if the target block contains UnreachableInst and mark it unlikely
663 if (SuccTI->getNumSuccessors() == 0) {
664 if (isa<UnreachableInst>(SuccTI)) {
665 Jump.IsUnlikely = true;
666 }
667 }
668 }
669}
670
671template <>
673 Function &F) {
674 DT.reset(new DominatorTree);
675 DT->recalculate(F);
676
677 PDT.reset(new PostDominatorTree(F));
678
679 LI.reset(new LoopInfo);
680 LI->analyze(*DT);
681}
682} // namespace llvm
683
684ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) {
686 return getProbeWeight(Inst);
687
688 const DebugLoc &DLoc = Inst.getDebugLoc();
689 if (!DLoc)
690 return std::error_code();
691
692 // Ignore all intrinsics, phinodes and branch instructions.
693 // Branch and phinodes instruction usually contains debug info from sources
694 // outside of the residing basic block, thus we ignore them during annotation.
695 if (isa<BranchInst>(Inst) || isa<IntrinsicInst>(Inst) || isa<PHINode>(Inst))
696 return std::error_code();
697
698 // For non-CS profile, if a direct call/invoke instruction is inlined in
699 // profile (findCalleeFunctionSamples returns non-empty result), but not
700 // inlined here, it means that the inlined callsite has no sample, thus the
701 // call instruction should have 0 count.
702 // For CS profile, the callsite count of previously inlined callees is
703 // populated with the entry count of the callees.
705 if (const auto *CB = dyn_cast<CallBase>(&Inst))
706 if (!CB->isIndirectCall() && findCalleeFunctionSamples(*CB))
707 return 0;
708
709 return getInstWeightImpl(Inst);
710}
711
712/// Get the FunctionSamples for a call instruction.
713///
714/// The FunctionSamples of a call/invoke instruction \p Inst is the inlined
715/// instance in which that call instruction is calling to. It contains
716/// all samples that resides in the inlined instance. We first find the
717/// inlined instance in which the call instruction is from, then we
718/// traverse its children to find the callsite with the matching
719/// location.
720///
721/// \param Inst Call/Invoke instruction to query.
722///
723/// \returns The FunctionSamples pointer to the inlined instance.
724const FunctionSamples *
725SampleProfileLoader::findCalleeFunctionSamples(const CallBase &Inst) const {
726 const DILocation *DIL = Inst.getDebugLoc();
727 if (!DIL) {
728 return nullptr;
729 }
730
731 StringRef CalleeName;
732 if (Function *Callee = Inst.getCalledFunction())
733 CalleeName = Callee->getName();
734
736 return ContextTracker->getCalleeContextSamplesFor(Inst, CalleeName);
737
738 const FunctionSamples *FS = findFunctionSamples(Inst);
739 if (FS == nullptr)
740 return nullptr;
741
742 return FS->findFunctionSamplesAt(FunctionSamples::getCallSiteIdentifier(DIL),
743 CalleeName, Reader->getRemapper());
744}
745
746/// Returns a vector of FunctionSamples that are the indirect call targets
747/// of \p Inst. The vector is sorted by the total number of samples. Stores
748/// the total call count of the indirect call in \p Sum.
749std::vector<const FunctionSamples *>
750SampleProfileLoader::findIndirectCallFunctionSamples(
751 const Instruction &Inst, uint64_t &Sum) const {
752 const DILocation *DIL = Inst.getDebugLoc();
753 std::vector<const FunctionSamples *> R;
754
755 if (!DIL) {
756 return R;
757 }
758
759 auto FSCompare = [](const FunctionSamples *L, const FunctionSamples *R) {
760 assert(L && R && "Expect non-null FunctionSamples");
761 if (L->getHeadSamplesEstimate() != R->getHeadSamplesEstimate())
762 return L->getHeadSamplesEstimate() > R->getHeadSamplesEstimate();
763 return FunctionSamples::getGUID(L->getName()) <
764 FunctionSamples::getGUID(R->getName());
765 };
766
768 auto CalleeSamples =
769 ContextTracker->getIndirectCalleeContextSamplesFor(DIL);
770 if (CalleeSamples.empty())
771 return R;
772
773 // For CSSPGO, we only use target context profile's entry count
774 // as that already includes both inlined callee and non-inlined ones..
775 Sum = 0;
776 for (const auto *const FS : CalleeSamples) {
777 Sum += FS->getHeadSamplesEstimate();
778 R.push_back(FS);
779 }
780 llvm::sort(R, FSCompare);
781 return R;
782 }
783
784 const FunctionSamples *FS = findFunctionSamples(Inst);
785 if (FS == nullptr)
786 return R;
787
789 auto T = FS->findCallTargetMapAt(CallSite);
790 Sum = 0;
791 if (T)
792 for (const auto &T_C : T.get())
793 Sum += T_C.second;
794 if (const FunctionSamplesMap *M = FS->findFunctionSamplesMapAt(CallSite)) {
795 if (M->empty())
796 return R;
797 for (const auto &NameFS : *M) {
798 Sum += NameFS.second.getHeadSamplesEstimate();
799 R.push_back(&NameFS.second);
800 }
801 llvm::sort(R, FSCompare);
802 }
803 return R;
804}
805
806const FunctionSamples *
807SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {
809 std::optional<PseudoProbe> Probe = extractProbe(Inst);
810 if (!Probe)
811 return nullptr;
812 }
813
814 const DILocation *DIL = Inst.getDebugLoc();
815 if (!DIL)
816 return Samples;
817
818 auto it = DILocation2SampleMap.try_emplace(DIL,nullptr);
819 if (it.second) {
821 it.first->second = ContextTracker->getContextSamplesFor(DIL);
822 else
823 it.first->second =
824 Samples->findFunctionSamples(DIL, Reader->getRemapper());
825 }
826 return it.first->second;
827}
828
829/// Check whether the indirect call promotion history of \p Inst allows
830/// the promotion for \p Candidate.
831/// If the profile count for the promotion candidate \p Candidate is
832/// NOMORE_ICP_MAGICNUM, it means \p Candidate has already been promoted
833/// for \p Inst. If we already have at least MaxNumPromotions
834/// NOMORE_ICP_MAGICNUM count values in the value profile of \p Inst, we
835/// cannot promote for \p Inst anymore.
836static bool doesHistoryAllowICP(const Instruction &Inst, StringRef Candidate) {
837 uint32_t NumVals = 0;
838 uint64_t TotalCount = 0;
839 std::unique_ptr<InstrProfValueData[]> ValueData =
840 std::make_unique<InstrProfValueData[]>(MaxNumPromotions);
841 bool Valid =
842 getValueProfDataFromInst(Inst, IPVK_IndirectCallTarget, MaxNumPromotions,
843 ValueData.get(), NumVals, TotalCount, true);
844 // No valid value profile so no promoted targets have been recorded
845 // before. Ok to do ICP.
846 if (!Valid)
847 return true;
848
849 unsigned NumPromoted = 0;
850 for (uint32_t I = 0; I < NumVals; I++) {
851 if (ValueData[I].Count != NOMORE_ICP_MAGICNUM)
852 continue;
853
854 // If the promotion candidate has NOMORE_ICP_MAGICNUM count in the
855 // metadata, it means the candidate has been promoted for this
856 // indirect call.
857 if (ValueData[I].Value == Function::getGUID(Candidate))
858 return false;
859 NumPromoted++;
860 // If already have MaxNumPromotions promotion, don't do it anymore.
861 if (NumPromoted == MaxNumPromotions)
862 return false;
863 }
864 return true;
865}
866
867/// Update indirect call target profile metadata for \p Inst.
868/// Usually \p Sum is the sum of counts of all the targets for \p Inst.
869/// If it is 0, it means updateIDTMetaData is used to mark a
870/// certain target to be promoted already. If it is not zero,
871/// we expect to use it to update the total count in the value profile.
872static void
874 const SmallVectorImpl<InstrProfValueData> &CallTargets,
875 uint64_t Sum) {
876 // Bail out early if MaxNumPromotions is zero.
877 // This prevents allocating an array of zero length below.
878 //
879 // Note `updateIDTMetaData` is called in two places so check
880 // `MaxNumPromotions` inside it.
881 if (MaxNumPromotions == 0)
882 return;
883 uint32_t NumVals = 0;
884 // OldSum is the existing total count in the value profile data.
885 uint64_t OldSum = 0;
886 std::unique_ptr<InstrProfValueData[]> ValueData =
887 std::make_unique<InstrProfValueData[]>(MaxNumPromotions);
888 bool Valid =
889 getValueProfDataFromInst(Inst, IPVK_IndirectCallTarget, MaxNumPromotions,
890 ValueData.get(), NumVals, OldSum, true);
891
892 DenseMap<uint64_t, uint64_t> ValueCountMap;
893 if (Sum == 0) {
894 assert((CallTargets.size() == 1 &&
895 CallTargets[0].Count == NOMORE_ICP_MAGICNUM) &&
896 "If sum is 0, assume only one element in CallTargets "
897 "with count being NOMORE_ICP_MAGICNUM");
898 // Initialize ValueCountMap with existing value profile data.
899 if (Valid) {
900 for (uint32_t I = 0; I < NumVals; I++)
901 ValueCountMap[ValueData[I].Value] = ValueData[I].Count;
902 }
903 auto Pair =
904 ValueCountMap.try_emplace(CallTargets[0].Value, CallTargets[0].Count);
905 // If the target already exists in value profile, decrease the total
906 // count OldSum and reset the target's count to NOMORE_ICP_MAGICNUM.
907 if (!Pair.second) {
908 OldSum -= Pair.first->second;
909 Pair.first->second = NOMORE_ICP_MAGICNUM;
910 }
911 Sum = OldSum;
912 } else {
913 // Initialize ValueCountMap with existing NOMORE_ICP_MAGICNUM
914 // counts in the value profile.
915 if (Valid) {
916 for (uint32_t I = 0; I < NumVals; I++) {
917 if (ValueData[I].Count == NOMORE_ICP_MAGICNUM)
918 ValueCountMap[ValueData[I].Value] = ValueData[I].Count;
919 }
920 }
921
922 for (const auto &Data : CallTargets) {
923 auto Pair = ValueCountMap.try_emplace(Data.Value, Data.Count);
924 if (Pair.second)
925 continue;
926 // The target represented by Data.Value has already been promoted.
927 // Keep the count as NOMORE_ICP_MAGICNUM in the profile and decrease
928 // Sum by Data.Count.
929 assert(Sum >= Data.Count && "Sum should never be less than Data.Count");
930 Sum -= Data.Count;
931 }
932 }
933
935 for (const auto &ValueCount : ValueCountMap) {
936 NewCallTargets.emplace_back(
937 InstrProfValueData{ValueCount.first, ValueCount.second});
938 }
939
940 llvm::sort(NewCallTargets,
941 [](const InstrProfValueData &L, const InstrProfValueData &R) {
942 if (L.Count != R.Count)
943 return L.Count > R.Count;
944 return L.Value > R.Value;
945 });
946
947 uint32_t MaxMDCount =
948 std::min(NewCallTargets.size(), static_cast<size_t>(MaxNumPromotions));
950 NewCallTargets, Sum, IPVK_IndirectCallTarget, MaxMDCount);
951}
952
953/// Attempt to promote indirect call and also inline the promoted call.
954///
955/// \param F Caller function.
956/// \param Candidate ICP and inline candidate.
957/// \param SumOrigin Original sum of target counts for indirect call before
958/// promoting given candidate.
959/// \param Sum Prorated sum of remaining target counts for indirect call
960/// after promoting given candidate.
961/// \param InlinedCallSite Output vector for new call sites exposed after
962/// inlining.
963bool SampleProfileLoader::tryPromoteAndInlineCandidate(
964 Function &F, InlineCandidate &Candidate, uint64_t SumOrigin, uint64_t &Sum,
965 SmallVector<CallBase *, 8> *InlinedCallSite) {
966 // Bail out early if sample-loader inliner is disabled.
968 return false;
969
970 // Bail out early if MaxNumPromotions is zero.
971 // This prevents allocating an array of zero length in callees below.
972 if (MaxNumPromotions == 0)
973 return false;
974 auto CalleeFunctionName = Candidate.CalleeSamples->getFuncName();
975 auto R = SymbolMap.find(CalleeFunctionName);
976 if (R == SymbolMap.end() || !R->getValue())
977 return false;
978
979 auto &CI = *Candidate.CallInstr;
980 if (!doesHistoryAllowICP(CI, R->getValue()->getName()))
981 return false;
982
983 const char *Reason = "Callee function not available";
984 // R->getValue() != &F is to prevent promoting a recursive call.
985 // If it is a recursive call, we do not inline it as it could bloat
986 // the code exponentially. There is way to better handle this, e.g.
987 // clone the caller first, and inline the cloned caller if it is
988 // recursive. As llvm does not inline recursive calls, we will
989 // simply ignore it instead of handling it explicitly.
990 if (!R->getValue()->isDeclaration() && R->getValue()->getSubprogram() &&
991 R->getValue()->hasFnAttribute("use-sample-profile") &&
992 R->getValue() != &F && isLegalToPromote(CI, R->getValue(), &Reason)) {
993 // For promoted target, set its value with NOMORE_ICP_MAGICNUM count
994 // in the value profile metadata so the target won't be promoted again.
995 SmallVector<InstrProfValueData, 1> SortedCallTargets = {InstrProfValueData{
996 Function::getGUID(R->getValue()->getName()), NOMORE_ICP_MAGICNUM}};
997 updateIDTMetaData(CI, SortedCallTargets, 0);
998
999 auto *DI = &pgo::promoteIndirectCall(
1000 CI, R->getValue(), Candidate.CallsiteCount, Sum, false, ORE);
1001 if (DI) {
1002 Sum -= Candidate.CallsiteCount;
1003 // Do not prorate the indirect callsite distribution since the original
1004 // distribution will be used to scale down non-promoted profile target
1005 // counts later. By doing this we lose track of the real callsite count
1006 // for the leftover indirect callsite as a trade off for accurate call
1007 // target counts.
1008 // TODO: Ideally we would have two separate factors, one for call site
1009 // counts and one is used to prorate call target counts.
1010 // Do not update the promoted direct callsite distribution at this
1011 // point since the original distribution combined with the callee profile
1012 // will be used to prorate callsites from the callee if inlined. Once not
1013 // inlined, the direct callsite distribution should be prorated so that
1014 // the it will reflect the real callsite counts.
1015 Candidate.CallInstr = DI;
1016 if (isa<CallInst>(DI) || isa<InvokeInst>(DI)) {
1017 bool Inlined = tryInlineCandidate(Candidate, InlinedCallSite);
1018 if (!Inlined) {
1019 // Prorate the direct callsite distribution so that it reflects real
1020 // callsite counts.
1022 *DI, static_cast<float>(Candidate.CallsiteCount) / SumOrigin);
1023 }
1024 return Inlined;
1025 }
1026 }
1027 } else {
1028 LLVM_DEBUG(dbgs() << "\nFailed to promote indirect call to "
1029 << Candidate.CalleeSamples->getFuncName() << " because "
1030 << Reason << "\n");
1031 }
1032 return false;
1033}
1034
1035bool SampleProfileLoader::shouldInlineColdCallee(CallBase &CallInst) {
1036 if (!ProfileSizeInline)
1037 return false;
1038
1040 if (Callee == nullptr)
1041 return false;
1042
1044 GetAC, GetTLI);
1045
1046 if (Cost.isNever())
1047 return false;
1048
1049 if (Cost.isAlways())
1050 return true;
1051
1052 return Cost.getCost() <= SampleColdCallSiteThreshold;
1053}
1054
1055void SampleProfileLoader::emitOptimizationRemarksForInlineCandidates(
1056 const SmallVectorImpl<CallBase *> &Candidates, const Function &F,
1057 bool Hot) {
1058 for (auto *I : Candidates) {
1059 Function *CalledFunction = I->getCalledFunction();
1060 if (CalledFunction) {
1061 ORE->emit(OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(),
1062 "InlineAttempt", I->getDebugLoc(),
1063 I->getParent())
1064 << "previous inlining reattempted for "
1065 << (Hot ? "hotness: '" : "size: '")
1066 << ore::NV("Callee", CalledFunction) << "' into '"
1067 << ore::NV("Caller", &F) << "'");
1068 }
1069 }
1070}
1071
1072void SampleProfileLoader::findExternalInlineCandidate(
1073 CallBase *CB, const FunctionSamples *Samples,
1074 DenseSet<GlobalValue::GUID> &InlinedGUIDs,
1075 const StringMap<Function *> &SymbolMap, uint64_t Threshold) {
1076
1077 // If ExternalInlineAdvisor(ReplayInlineAdvisor) wants to inline an external
1078 // function make sure it's imported
1079 if (CB && getExternalInlineAdvisorShouldInline(*CB)) {
1080 // Samples may not exist for replayed function, if so
1081 // just add the direct GUID and move on
1082 if (!Samples) {
1083 InlinedGUIDs.insert(
1085 return;
1086 }
1087 // Otherwise, drop the threshold to import everything that we can
1088 Threshold = 0;
1089 }
1090
1091 // In some rare cases, call instruction could be changed after being pushed
1092 // into inline candidate queue, this is because earlier inlining may expose
1093 // constant propagation which can change indirect call to direct call. When
1094 // this happens, we may fail to find matching function samples for the
1095 // candidate later, even if a match was found when the candidate was enqueued.
1096 if (!Samples)
1097 return;
1098
1099 // For AutoFDO profile, retrieve candidate profiles by walking over
1100 // the nested inlinee profiles.
1102 Samples->findInlinedFunctions(InlinedGUIDs, SymbolMap, Threshold);
1103 return;
1104 }
1105
1106 ContextTrieNode *Caller = ContextTracker->getContextNodeForProfile(Samples);
1107 std::queue<ContextTrieNode *> CalleeList;
1108 CalleeList.push(Caller);
1109 while (!CalleeList.empty()) {
1110 ContextTrieNode *Node = CalleeList.front();
1111 CalleeList.pop();
1112 FunctionSamples *CalleeSample = Node->getFunctionSamples();
1113 // For CSSPGO profile, retrieve candidate profile by walking over the
1114 // trie built for context profile. Note that also take call targets
1115 // even if callee doesn't have a corresponding context profile.
1116 if (!CalleeSample)
1117 continue;
1118
1119 // If pre-inliner decision is used, honor that for importing as well.
1120 bool PreInline =
1123 if (!PreInline && CalleeSample->getHeadSamplesEstimate() < Threshold)
1124 continue;
1125
1126 StringRef Name = CalleeSample->getFuncName();
1128 // Add to the import list only when it's defined out of module.
1129 if (!Func || Func->isDeclaration())
1130 InlinedGUIDs.insert(FunctionSamples::getGUID(CalleeSample->getName()));
1131
1132 // Import hot CallTargets, which may not be available in IR because full
1133 // profile annotation cannot be done until backend compilation in ThinLTO.
1134 for (const auto &BS : CalleeSample->getBodySamples())
1135 for (const auto &TS : BS.second.getCallTargets())
1136 if (TS.getValue() > Threshold) {
1137 StringRef CalleeName = CalleeSample->getFuncName(TS.getKey());
1138 const Function *Callee = SymbolMap.lookup(CalleeName);
1139 if (!Callee || Callee->isDeclaration())
1140 InlinedGUIDs.insert(FunctionSamples::getGUID(TS.getKey()));
1141 }
1142
1143 // Import hot child context profile associted with callees. Note that this
1144 // may have some overlap with the call target loop above, but doing this
1145 // based child context profile again effectively allow us to use the max of
1146 // entry count and call target count to determine importing.
1147 for (auto &Child : Node->getAllChildContext()) {
1148 ContextTrieNode *CalleeNode = &Child.second;
1149 CalleeList.push(CalleeNode);
1150 }
1151 }
1152}
1153
1154/// Iteratively inline hot callsites of a function.
1155///
1156/// Iteratively traverse all callsites of the function \p F, so as to
1157/// find out callsites with corresponding inline instances.
1158///
1159/// For such callsites,
1160/// - If it is hot enough, inline the callsites and adds callsites of the callee
1161/// into the caller. If the call is an indirect call, first promote
1162/// it to direct call. Each indirect call is limited with a single target.
1163///
1164/// - If a callsite is not inlined, merge the its profile to the outline
1165/// version (if --sample-profile-merge-inlinee is true), or scale the
1166/// counters of standalone function based on the profile of inlined
1167/// instances (if --sample-profile-merge-inlinee is false).
1168///
1169/// Later passes may consume the updated profiles.
1170///
1171/// \param F function to perform iterative inlining.
1172/// \param InlinedGUIDs a set to be updated to include all GUIDs that are
1173/// inlined in the profiled binary.
1174///
1175/// \returns True if there is any inline happened.
1176bool SampleProfileLoader::inlineHotFunctions(
1177 Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
1178 // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure
1179 // Profile symbol list is ignored when profile-sample-accurate is on.
1180 assert((!ProfAccForSymsInList ||
1182 !F.hasFnAttribute("profile-sample-accurate"))) &&
1183 "ProfAccForSymsInList should be false when profile-sample-accurate "
1184 "is enabled");
1185
1186 MapVector<CallBase *, const FunctionSamples *> LocalNotInlinedCallSites;
1187 bool Changed = false;
1188 bool LocalChanged = true;
1189 while (LocalChanged) {
1190 LocalChanged = false;
1192 for (auto &BB : F) {
1193 bool Hot = false;
1194 SmallVector<CallBase *, 10> AllCandidates;
1195 SmallVector<CallBase *, 10> ColdCandidates;
1196 for (auto &I : BB) {
1197 const FunctionSamples *FS = nullptr;
1198 if (auto *CB = dyn_cast<CallBase>(&I)) {
1199 if (!isa<IntrinsicInst>(I)) {
1200 if ((FS = findCalleeFunctionSamples(*CB))) {
1201 assert((!FunctionSamples::UseMD5 || FS->GUIDToFuncNameMap) &&
1202 "GUIDToFuncNameMap has to be populated");
1203 AllCandidates.push_back(CB);
1204 if (FS->getHeadSamplesEstimate() > 0 ||
1206 LocalNotInlinedCallSites.insert({CB, FS});
1207 if (callsiteIsHot(FS, PSI, ProfAccForSymsInList))
1208 Hot = true;
1209 else if (shouldInlineColdCallee(*CB))
1210 ColdCandidates.push_back(CB);
1211 } else if (getExternalInlineAdvisorShouldInline(*CB)) {
1212 AllCandidates.push_back(CB);
1213 }
1214 }
1215 }
1216 }
1217 if (Hot || ExternalInlineAdvisor) {
1218 CIS.insert(CIS.begin(), AllCandidates.begin(), AllCandidates.end());
1219 emitOptimizationRemarksForInlineCandidates(AllCandidates, F, true);
1220 } else {
1221 CIS.insert(CIS.begin(), ColdCandidates.begin(), ColdCandidates.end());
1222 emitOptimizationRemarksForInlineCandidates(ColdCandidates, F, false);
1223 }
1224 }
1225 for (CallBase *I : CIS) {
1226 Function *CalledFunction = I->getCalledFunction();
1227 InlineCandidate Candidate = {I, LocalNotInlinedCallSites.lookup(I),
1228 0 /* dummy count */,
1229 1.0 /* dummy distribution factor */};
1230 // Do not inline recursive calls.
1231 if (CalledFunction == &F)
1232 continue;
1233 if (I->isIndirectCall()) {
1234 uint64_t Sum;
1235 for (const auto *FS : findIndirectCallFunctionSamples(*I, Sum)) {
1236 uint64_t SumOrigin = Sum;
1237 if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
1238 findExternalInlineCandidate(I, FS, InlinedGUIDs, SymbolMap,
1239 PSI->getOrCompHotCountThreshold());
1240 continue;
1241 }
1242 if (!callsiteIsHot(FS, PSI, ProfAccForSymsInList))
1243 continue;
1244
1245 Candidate = {I, FS, FS->getHeadSamplesEstimate(), 1.0};
1246 if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum)) {
1247 LocalNotInlinedCallSites.erase(I);
1248 LocalChanged = true;
1249 }
1250 }
1251 } else if (CalledFunction && CalledFunction->getSubprogram() &&
1252 !CalledFunction->isDeclaration()) {
1253 if (tryInlineCandidate(Candidate)) {
1254 LocalNotInlinedCallSites.erase(I);
1255 LocalChanged = true;
1256 }
1257 } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
1258 findExternalInlineCandidate(I, findCalleeFunctionSamples(*I),
1259 InlinedGUIDs, SymbolMap,
1260 PSI->getOrCompHotCountThreshold());
1261 }
1262 }
1263 Changed |= LocalChanged;
1264 }
1265
1266 // For CS profile, profile for not inlined context will be merged when
1267 // base profile is being retrieved.
1269 promoteMergeNotInlinedContextSamples(LocalNotInlinedCallSites, F);
1270 return Changed;
1271}
1272
1273bool SampleProfileLoader::tryInlineCandidate(
1274 InlineCandidate &Candidate, SmallVector<CallBase *, 8> *InlinedCallSites) {
1275 // Do not attempt to inline a candidate if
1276 // --disable-sample-loader-inlining is true.
1278 return false;
1279
1280 CallBase &CB = *Candidate.CallInstr;
1281 Function *CalledFunction = CB.getCalledFunction();
1282 assert(CalledFunction && "Expect a callee with definition");
1283 DebugLoc DLoc = CB.getDebugLoc();
1284 BasicBlock *BB = CB.getParent();
1285
1286 InlineCost Cost = shouldInlineCandidate(Candidate);
1287 if (Cost.isNever()) {
1288 ORE->emit(OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(),
1289 "InlineFail", DLoc, BB)
1290 << "incompatible inlining");
1291 return false;
1292 }
1293
1294 if (!Cost)
1295 return false;
1296
1297 InlineFunctionInfo IFI(GetAC);
1298 IFI.UpdateProfile = false;
1299 InlineResult IR = InlineFunction(CB, IFI,
1300 /*MergeAttributes=*/true);
1301 if (!IR.isSuccess())
1302 return false;
1303
1304 // The call to InlineFunction erases I, so we can't pass it here.
1305 emitInlinedIntoBasedOnCost(*ORE, DLoc, BB, *CalledFunction, *BB->getParent(),
1306 Cost, true, getAnnotatedRemarkPassName());
1307
1308 // Now populate the list of newly exposed call sites.
1309 if (InlinedCallSites) {
1310 InlinedCallSites->clear();
1311 for (auto &I : IFI.InlinedCallSites)
1312 InlinedCallSites->push_back(I);
1313 }
1314
1316 ContextTracker->markContextSamplesInlined(Candidate.CalleeSamples);
1317 ++NumCSInlined;
1318
1319 // Prorate inlined probes for a duplicated inlining callsite which probably
1320 // has a distribution less than 100%. Samples for an inlinee should be
1321 // distributed among the copies of the original callsite based on each
1322 // callsite's distribution factor for counts accuracy. Note that an inlined
1323 // probe may come with its own distribution factor if it has been duplicated
1324 // in the inlinee body. The two factor are multiplied to reflect the
1325 // aggregation of duplication.
1326 if (Candidate.CallsiteDistribution < 1) {
1327 for (auto &I : IFI.InlinedCallSites) {
1328 if (std::optional<PseudoProbe> Probe = extractProbe(*I))
1329 setProbeDistributionFactor(*I, Probe->Factor *
1330 Candidate.CallsiteDistribution);
1331 }
1332 NumDuplicatedInlinesite++;
1333 }
1334
1335 return true;
1336}
1337
1338bool SampleProfileLoader::getInlineCandidate(InlineCandidate *NewCandidate,
1339 CallBase *CB) {
1340 assert(CB && "Expect non-null call instruction");
1341
1342 if (isa<IntrinsicInst>(CB))
1343 return false;
1344
1345 // Find the callee's profile. For indirect call, find hottest target profile.
1346 const FunctionSamples *CalleeSamples = findCalleeFunctionSamples(*CB);
1347 // If ExternalInlineAdvisor wants to inline this site, do so even
1348 // if Samples are not present.
1349 if (!CalleeSamples && !getExternalInlineAdvisorShouldInline(*CB))
1350 return false;
1351
1352 float Factor = 1.0;
1353 if (std::optional<PseudoProbe> Probe = extractProbe(*CB))
1354 Factor = Probe->Factor;
1355
1356 uint64_t CallsiteCount =
1357 CalleeSamples ? CalleeSamples->getHeadSamplesEstimate() * Factor : 0;
1358 *NewCandidate = {CB, CalleeSamples, CallsiteCount, Factor};
1359 return true;
1360}
1361
1362std::optional<InlineCost>
1363SampleProfileLoader::getExternalInlineAdvisorCost(CallBase &CB) {
1364 std::unique_ptr<InlineAdvice> Advice = nullptr;
1365 if (ExternalInlineAdvisor) {
1366 Advice = ExternalInlineAdvisor->getAdvice(CB);
1367 if (Advice) {
1368 if (!Advice->isInliningRecommended()) {
1369 Advice->recordUnattemptedInlining();
1370 return InlineCost::getNever("not previously inlined");
1371 }
1372 Advice->recordInlining();
1373 return InlineCost::getAlways("previously inlined");
1374 }
1375 }
1376
1377 return {};
1378}
1379
1380bool SampleProfileLoader::getExternalInlineAdvisorShouldInline(CallBase &CB) {
1381 std::optional<InlineCost> Cost = getExternalInlineAdvisorCost(CB);
1382 return Cost ? !!*Cost : false;
1383}
1384
1386SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) {
1387 if (std::optional<InlineCost> ReplayCost =
1388 getExternalInlineAdvisorCost(*Candidate.CallInstr))
1389 return *ReplayCost;
1390 // Adjust threshold based on call site hotness, only do this for callsite
1391 // prioritized inliner because otherwise cost-benefit check is done earlier.
1392 int SampleThreshold = SampleColdCallSiteThreshold;
1394 if (Candidate.CallsiteCount > PSI->getHotCountThreshold())
1395 SampleThreshold = SampleHotCallSiteThreshold;
1396 else if (!ProfileSizeInline)
1397 return InlineCost::getNever("cold callsite");
1398 }
1399
1400 Function *Callee = Candidate.CallInstr->getCalledFunction();
1401 assert(Callee && "Expect a definition for inline candidate of direct call");
1402
1403 InlineParams Params = getInlineParams();
1404 // We will ignore the threshold from inline cost, so always get full cost.
1405 Params.ComputeFullInlineCost = true;
1407 // Checks if there is anything in the reachable portion of the callee at
1408 // this callsite that makes this inlining potentially illegal. Need to
1409 // set ComputeFullInlineCost, otherwise getInlineCost may return early
1410 // when cost exceeds threshold without checking all IRs in the callee.
1411 // The acutal cost does not matter because we only checks isNever() to
1412 // see if it is legal to inline the callsite.
1413 InlineCost Cost = getInlineCost(*Candidate.CallInstr, Callee, Params,
1414 GetTTI(*Callee), GetAC, GetTLI);
1415
1416 // Honor always inline and never inline from call analyzer
1417 if (Cost.isNever() || Cost.isAlways())
1418 return Cost;
1419
1420 // With CSSPGO, the preinliner in llvm-profgen can estimate global inline
1421 // decisions based on hotness as well as accurate function byte sizes for
1422 // given context using function/inlinee sizes from previous build. It
1423 // stores the decision in profile, and also adjust/merge context profile
1424 // aiming at better context-sensitive post-inline profile quality, assuming
1425 // all inline decision estimates are going to be honored by compiler. Here
1426 // we replay that inline decision under `sample-profile-use-preinliner`.
1427 // Note that we don't need to handle negative decision from preinliner as
1428 // context profile for not inlined calls are merged by preinliner already.
1429 if (UsePreInlinerDecision && Candidate.CalleeSamples) {
1430 // Once two node are merged due to promotion, we're losing some context
1431 // so the original context-sensitive preinliner decision should be ignored
1432 // for SyntheticContext.
1433 SampleContext &Context = Candidate.CalleeSamples->getContext();
1434 if (!Context.hasState(SyntheticContext) &&
1435 Context.hasAttribute(ContextShouldBeInlined))
1436 return InlineCost::getAlways("preinliner");
1437 }
1438
1439 // For old FDO inliner, we inline the call site as long as cost is not
1440 // "Never". The cost-benefit check is done earlier.
1442 return InlineCost::get(Cost.getCost(), INT_MAX);
1443 }
1444
1445 // Otherwise only use the cost from call analyzer, but overwite threshold with
1446 // Sample PGO threshold.
1447 return InlineCost::get(Cost.getCost(), SampleThreshold);
1448}
1449
1450bool SampleProfileLoader::inlineHotFunctionsWithPriority(
1451 Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
1452 // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure
1453 // Profile symbol list is ignored when profile-sample-accurate is on.
1454 assert((!ProfAccForSymsInList ||
1456 !F.hasFnAttribute("profile-sample-accurate"))) &&
1457 "ProfAccForSymsInList should be false when profile-sample-accurate "
1458 "is enabled");
1459
1460 // Populating worklist with initial call sites from root inliner, along
1461 // with call site weights.
1462 CandidateQueue CQueue;
1463 InlineCandidate NewCandidate;
1464 for (auto &BB : F) {
1465 for (auto &I : BB) {
1466 auto *CB = dyn_cast<CallBase>(&I);
1467 if (!CB)
1468 continue;
1469 if (getInlineCandidate(&NewCandidate, CB))
1470 CQueue.push(NewCandidate);
1471 }
1472 }
1473
1474 // Cap the size growth from profile guided inlining. This is needed even
1475 // though cost of each inline candidate already accounts for callee size,
1476 // because with top-down inlining, we can grow inliner size significantly
1477 // with large number of smaller inlinees each pass the cost check.
1479 "Max inline size limit should not be smaller than min inline size "
1480 "limit.");
1481 unsigned SizeLimit = F.getInstructionCount() * ProfileInlineGrowthLimit;
1482 SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax);
1483 SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin);
1484 if (ExternalInlineAdvisor)
1485 SizeLimit = std::numeric_limits<unsigned>::max();
1486
1487 MapVector<CallBase *, const FunctionSamples *> LocalNotInlinedCallSites;
1488
1489 // Perform iterative BFS call site prioritized inlining
1490 bool Changed = false;
1491 while (!CQueue.empty() && F.getInstructionCount() < SizeLimit) {
1492 InlineCandidate Candidate = CQueue.top();
1493 CQueue.pop();
1494 CallBase *I = Candidate.CallInstr;
1495 Function *CalledFunction = I->getCalledFunction();
1496
1497 if (CalledFunction == &F)
1498 continue;
1499 if (I->isIndirectCall()) {
1500 uint64_t Sum = 0;
1501 auto CalleeSamples = findIndirectCallFunctionSamples(*I, Sum);
1502 uint64_t SumOrigin = Sum;
1503 Sum *= Candidate.CallsiteDistribution;
1504 unsigned ICPCount = 0;
1505 for (const auto *FS : CalleeSamples) {
1506 // TODO: Consider disable pre-lTO ICP for MonoLTO as well
1507 if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
1508 findExternalInlineCandidate(I, FS, InlinedGUIDs, SymbolMap,
1509 PSI->getOrCompHotCountThreshold());
1510 continue;
1511 }
1512 uint64_t EntryCountDistributed =
1513 FS->getHeadSamplesEstimate() * Candidate.CallsiteDistribution;
1514 // In addition to regular inline cost check, we also need to make sure
1515 // ICP isn't introducing excessive speculative checks even if individual
1516 // target looks beneficial to promote and inline. That means we should
1517 // only do ICP when there's a small number dominant targets.
1518 if (ICPCount >= ProfileICPRelativeHotnessSkip &&
1519 EntryCountDistributed * 100 < SumOrigin * ProfileICPRelativeHotness)
1520 break;
1521 // TODO: Fix CallAnalyzer to handle all indirect calls.
1522 // For indirect call, we don't run CallAnalyzer to get InlineCost
1523 // before actual inlining. This is because we could see two different
1524 // types from the same definition, which makes CallAnalyzer choke as
1525 // it's expecting matching parameter type on both caller and callee
1526 // side. See example from PR18962 for the triggering cases (the bug was
1527 // fixed, but we generate different types).
1528 if (!PSI->isHotCount(EntryCountDistributed))
1529 break;
1530 SmallVector<CallBase *, 8> InlinedCallSites;
1531 // Attach function profile for promoted indirect callee, and update
1532 // call site count for the promoted inline candidate too.
1533 Candidate = {I, FS, EntryCountDistributed,
1534 Candidate.CallsiteDistribution};
1535 if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum,
1536 &InlinedCallSites)) {
1537 for (auto *CB : InlinedCallSites) {
1538 if (getInlineCandidate(&NewCandidate, CB))
1539 CQueue.emplace(NewCandidate);
1540 }
1541 ICPCount++;
1542 Changed = true;
1543 } else if (!ContextTracker) {
1544 LocalNotInlinedCallSites.insert({I, FS});
1545 }
1546 }
1547 } else if (CalledFunction && CalledFunction->getSubprogram() &&
1548 !CalledFunction->isDeclaration()) {
1549 SmallVector<CallBase *, 8> InlinedCallSites;
1550 if (tryInlineCandidate(Candidate, &InlinedCallSites)) {
1551 for (auto *CB : InlinedCallSites) {
1552 if (getInlineCandidate(&NewCandidate, CB))
1553 CQueue.emplace(NewCandidate);
1554 }
1555 Changed = true;
1556 } else if (!ContextTracker) {
1557 LocalNotInlinedCallSites.insert({I, Candidate.CalleeSamples});
1558 }
1559 } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
1560 findExternalInlineCandidate(I, findCalleeFunctionSamples(*I),
1561 InlinedGUIDs, SymbolMap,
1562 PSI->getOrCompHotCountThreshold());
1563 }
1564 }
1565
1566 if (!CQueue.empty()) {
1567 if (SizeLimit == (unsigned)ProfileInlineLimitMax)
1568 ++NumCSInlinedHitMaxLimit;
1569 else if (SizeLimit == (unsigned)ProfileInlineLimitMin)
1570 ++NumCSInlinedHitMinLimit;
1571 else
1572 ++NumCSInlinedHitGrowthLimit;
1573 }
1574
1575 // For CS profile, profile for not inlined context will be merged when
1576 // base profile is being retrieved.
1578 promoteMergeNotInlinedContextSamples(LocalNotInlinedCallSites, F);
1579 return Changed;
1580}
1581
1582void SampleProfileLoader::promoteMergeNotInlinedContextSamples(
1584 const Function &F) {
1585 // Accumulate not inlined callsite information into notInlinedSamples
1586 for (const auto &Pair : NonInlinedCallSites) {
1587 CallBase *I = Pair.first;
1588 Function *Callee = I->getCalledFunction();
1589 if (!Callee || Callee->isDeclaration())
1590 continue;
1591
1592 ORE->emit(
1593 OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(), "NotInline",
1594 I->getDebugLoc(), I->getParent())
1595 << "previous inlining not repeated: '" << ore::NV("Callee", Callee)
1596 << "' into '" << ore::NV("Caller", &F) << "'");
1597
1598 ++NumCSNotInlined;
1599 const FunctionSamples *FS = Pair.second;
1600 if (FS->getTotalSamples() == 0 && FS->getHeadSamplesEstimate() == 0) {
1601 continue;
1602 }
1603
1604 // Do not merge a context that is already duplicated into the base profile.
1605 if (FS->getContext().hasAttribute(sampleprof::ContextDuplicatedIntoBase))
1606 continue;
1607
1608 if (ProfileMergeInlinee) {
1609 // A function call can be replicated by optimizations like callsite
1610 // splitting or jump threading and the replicates end up sharing the
1611 // sample nested callee profile instead of slicing the original
1612 // inlinee's profile. We want to do merge exactly once by filtering out
1613 // callee profiles with a non-zero head sample count.
1614 if (FS->getHeadSamples() == 0) {
1615 // Use entry samples as head samples during the merge, as inlinees
1616 // don't have head samples.
1617 const_cast<FunctionSamples *>(FS)->addHeadSamples(
1618 FS->getHeadSamplesEstimate());
1619
1620 // Note that we have to do the merge right after processing function.
1621 // This allows OutlineFS's profile to be used for annotation during
1622 // top-down processing of functions' annotation.
1623 FunctionSamples *OutlineFS = Reader->getSamplesFor(*Callee);
1624 // If outlined function does not exist in the profile, add it to a
1625 // separate map so that it does not rehash the original profile.
1626 if (!OutlineFS)
1627 OutlineFS = &OutlineFunctionSamples[
1629 OutlineFS->merge(*FS, 1);
1630 // Set outlined profile to be synthetic to not bias the inliner.
1631 OutlineFS->SetContextSynthetic();
1632 }
1633 } else {
1634 auto pair =
1635 notInlinedCallInfo.try_emplace(Callee, NotInlinedProfileInfo{0});
1636 pair.first->second.entryCount += FS->getHeadSamplesEstimate();
1637 }
1638 }
1639}
1640
1641/// Returns the sorted CallTargetMap \p M by count in descending order.
1645 for (const auto &I : SampleRecord::SortCallTargets(M)) {
1646 R.emplace_back(
1647 InstrProfValueData{FunctionSamples::getGUID(I.first), I.second});
1648 }
1649 return R;
1650}
1651
1652// Generate MD_prof metadata for every branch instruction using the
1653// edge weights computed during propagation.
1654void SampleProfileLoader::generateMDProfMetadata(Function &F) {
1655 // Generate MD_prof metadata for every branch instruction using the
1656 // edge weights computed during propagation.
1657 LLVM_DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n");
1658 LLVMContext &Ctx = F.getContext();
1659 MDBuilder MDB(Ctx);
1660 for (auto &BI : F) {
1661 BasicBlock *BB = &BI;
1662
1663 if (BlockWeights[BB]) {
1664 for (auto &I : *BB) {
1665 if (!isa<CallInst>(I) && !isa<InvokeInst>(I))
1666 continue;
1667 if (!cast<CallBase>(I).getCalledFunction()) {
1668 const DebugLoc &DLoc = I.getDebugLoc();
1669 if (!DLoc)
1670 continue;
1671 const DILocation *DIL = DLoc;
1672 const FunctionSamples *FS = findFunctionSamples(I);
1673 if (!FS)
1674 continue;
1676 auto T = FS->findCallTargetMapAt(CallSite);
1677 if (!T || T.get().empty())
1678 continue;
1680 // Prorate the callsite counts based on the pre-ICP distribution
1681 // factor to reflect what is already done to the callsite before
1682 // ICP, such as calliste cloning.
1683 if (std::optional<PseudoProbe> Probe = extractProbe(I)) {
1684 if (Probe->Factor < 1)
1685 T = SampleRecord::adjustCallTargets(T.get(), Probe->Factor);
1686 }
1687 }
1688 SmallVector<InstrProfValueData, 2> SortedCallTargets =
1690 uint64_t Sum = 0;
1691 for (const auto &C : T.get())
1692 Sum += C.second;
1693 // With CSSPGO all indirect call targets are counted torwards the
1694 // original indirect call site in the profile, including both
1695 // inlined and non-inlined targets.
1697 if (const FunctionSamplesMap *M =
1698 FS->findFunctionSamplesMapAt(CallSite)) {
1699 for (const auto &NameFS : *M)
1700 Sum += NameFS.second.getHeadSamplesEstimate();
1701 }
1702 }
1703 if (Sum)
1704 updateIDTMetaData(I, SortedCallTargets, Sum);
1705 else if (OverwriteExistingWeights)
1706 I.setMetadata(LLVMContext::MD_prof, nullptr);
1707 } else if (!isa<IntrinsicInst>(&I)) {
1708 I.setMetadata(LLVMContext::MD_prof,
1709 MDB.createBranchWeights(
1710 {static_cast<uint32_t>(BlockWeights[BB])}));
1711 }
1712 }
1714 // Set profile metadata (possibly annotated by LTO prelink) to zero or
1715 // clear it for cold code.
1716 for (auto &I : *BB) {
1717 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
1718 if (cast<CallBase>(I).isIndirectCall())
1719 I.setMetadata(LLVMContext::MD_prof, nullptr);
1720 else
1721 I.setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(0));
1722 }
1723 }
1724 }
1725
1726 Instruction *TI = BB->getTerminator();
1727 if (TI->getNumSuccessors() == 1)
1728 continue;
1729 if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI) &&
1730 !isa<IndirectBrInst>(TI))
1731 continue;
1732
1733 DebugLoc BranchLoc = TI->getDebugLoc();
1734 LLVM_DEBUG(dbgs() << "\nGetting weights for branch at line "
1735 << ((BranchLoc) ? Twine(BranchLoc.getLine())
1736 : Twine("<UNKNOWN LOCATION>"))
1737 << ".\n");
1739 uint32_t MaxWeight = 0;
1740 Instruction *MaxDestInst;
1741 // Since profi treats multiple edges (multiway branches) as a single edge,
1742 // we need to distribute the computed weight among the branches. We do
1743 // this by evenly splitting the edge weight among destinations.
1745 std::vector<uint64_t> EdgeIndex;
1747 EdgeIndex.resize(TI->getNumSuccessors());
1748 for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {
1749 const BasicBlock *Succ = TI->getSuccessor(I);
1750 EdgeIndex[I] = EdgeMultiplicity[Succ];
1751 EdgeMultiplicity[Succ]++;
1752 }
1753 }
1754 for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {
1755 BasicBlock *Succ = TI->getSuccessor(I);
1756 Edge E = std::make_pair(BB, Succ);
1757 uint64_t Weight = EdgeWeights[E];
1758 LLVM_DEBUG(dbgs() << "\t"; printEdgeWeight(dbgs(), E));
1759 // Use uint32_t saturated arithmetic to adjust the incoming weights,
1760 // if needed. Sample counts in profiles are 64-bit unsigned values,
1761 // but internally branch weights are expressed as 32-bit values.
1762 if (Weight > std::numeric_limits<uint32_t>::max()) {
1763 LLVM_DEBUG(dbgs() << " (saturated due to uint32_t overflow)");
1764 Weight = std::numeric_limits<uint32_t>::max();
1765 }
1766 if (!SampleProfileUseProfi) {
1767 // Weight is added by one to avoid propagation errors introduced by
1768 // 0 weights.
1769 Weights.push_back(static_cast<uint32_t>(Weight + 1));
1770 } else {
1771 // Profi creates proper weights that do not require "+1" adjustments but
1772 // we evenly split the weight among branches with the same destination.
1773 uint64_t W = Weight / EdgeMultiplicity[Succ];
1774 // Rounding up, if needed, so that first branches are hotter.
1775 if (EdgeIndex[I] < Weight % EdgeMultiplicity[Succ])
1776 W++;
1777 Weights.push_back(static_cast<uint32_t>(W));
1778 }
1779 if (Weight != 0) {
1780 if (Weight > MaxWeight) {
1781 MaxWeight = Weight;
1782 MaxDestInst = Succ->getFirstNonPHIOrDbgOrLifetime();
1783 }
1784 }
1785 }
1786
1787 misexpect::checkExpectAnnotations(*TI, Weights, /*IsFrontend=*/false);
1788
1789 uint64_t TempWeight;
1790 // Only set weights if there is at least one non-zero weight.
1791 // In any other case, let the analyzer set weights.
1792 // Do not set weights if the weights are present unless under
1793 // OverwriteExistingWeights. In ThinLTO, the profile annotation is done
1794 // twice. If the first annotation already set the weights, the second pass
1795 // does not need to set it. With OverwriteExistingWeights, Blocks with zero
1796 // weight should have their existing metadata (possibly annotated by LTO
1797 // prelink) cleared.
1798 if (MaxWeight > 0 &&
1799 (!TI->extractProfTotalWeight(TempWeight) || OverwriteExistingWeights)) {
1800 LLVM_DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n");
1801 TI->setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
1802 ORE->emit([&]() {
1803 return OptimizationRemark(DEBUG_TYPE, "PopularDest", MaxDestInst)
1804 << "most popular destination for conditional branches at "
1805 << ore::NV("CondBranchesLoc", BranchLoc);
1806 });
1807 } else {
1809 TI->setMetadata(LLVMContext::MD_prof, nullptr);
1810 LLVM_DEBUG(dbgs() << "CLEARED. All branch weights are zero.\n");
1811 } else {
1812 LLVM_DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n");
1813 }
1814 }
1815 }
1816}
1817
1818/// Once all the branch weights are computed, we emit the MD_prof
1819/// metadata on BB using the computed values for each of its branches.
1820///
1821/// \param F The function to query.
1822///
1823/// \returns true if \p F was modified. Returns false, otherwise.
1824bool SampleProfileLoader::emitAnnotations(Function &F) {
1825 bool Changed = false;
1826
1828 if (!ProbeManager->profileIsValid(F, *Samples)) {
1829 LLVM_DEBUG(
1830 dbgs() << "Profile is invalid due to CFG mismatch for Function "
1831 << F.getName() << "\n");
1832 ++NumMismatchedProfile;
1834 return false;
1835 }
1836 ++NumMatchedProfile;
1837 } else {
1838 if (getFunctionLoc(F) == 0)
1839 return false;
1840
1841 LLVM_DEBUG(dbgs() << "Line number for the first instruction in "
1842 << F.getName() << ": " << getFunctionLoc(F) << "\n");
1843 }
1844
1845 DenseSet<GlobalValue::GUID> InlinedGUIDs;
1847 Changed |= inlineHotFunctionsWithPriority(F, InlinedGUIDs);
1848 else
1849 Changed |= inlineHotFunctions(F, InlinedGUIDs);
1850
1851 Changed |= computeAndPropagateWeights(F, InlinedGUIDs);
1852
1853 if (Changed)
1854 generateMDProfMetadata(F);
1855
1856 emitCoverageRemarks(F);
1857 return Changed;
1858}
1859
1860std::unique_ptr<ProfiledCallGraph>
1861SampleProfileLoader::buildProfiledCallGraph(Module &M) {
1862 std::unique_ptr<ProfiledCallGraph> ProfiledCG;
1864 ProfiledCG = std::make_unique<ProfiledCallGraph>(*ContextTracker);
1865 else
1866 ProfiledCG = std::make_unique<ProfiledCallGraph>(Reader->getProfiles());
1867
1868 // Add all functions into the profiled call graph even if they are not in
1869 // the profile. This makes sure functions missing from the profile still
1870 // gets a chance to be processed.
1871 for (Function &F : M) {
1872 if (F.isDeclaration() || !F.hasFnAttribute("use-sample-profile"))
1873 continue;
1874 ProfiledCG->addProfiledFunction(FunctionSamples::getCanonicalFnName(F));
1875 }
1876
1877 return ProfiledCG;
1878}
1879
1880std::vector<Function *>
1881SampleProfileLoader::buildFunctionOrder(Module &M, LazyCallGraph &CG) {
1882 std::vector<Function *> FunctionOrderList;
1883 FunctionOrderList.reserve(M.size());
1884
1886 errs() << "WARNING: -use-profiled-call-graph ignored, should be used "
1887 "together with -sample-profile-top-down-load.\n";
1888
1889 if (!ProfileTopDownLoad) {
1890 if (ProfileMergeInlinee) {
1891 // Disable ProfileMergeInlinee if profile is not loaded in top down order,
1892 // because the profile for a function may be used for the profile
1893 // annotation of its outline copy before the profile merging of its
1894 // non-inlined inline instances, and that is not the way how
1895 // ProfileMergeInlinee is supposed to work.
1896 ProfileMergeInlinee = false;
1897 }
1898
1899 for (Function &F : M)
1900 if (!F.isDeclaration() && F.hasFnAttribute("use-sample-profile"))
1901 FunctionOrderList.push_back(&F);
1902 return FunctionOrderList;
1903 }
1904
1906 !UseProfiledCallGraph.getNumOccurrences())) {
1907 // Use profiled call edges to augment the top-down order. There are cases
1908 // that the top-down order computed based on the static call graph doesn't
1909 // reflect real execution order. For example
1910 //
1911 // 1. Incomplete static call graph due to unknown indirect call targets.
1912 // Adjusting the order by considering indirect call edges from the
1913 // profile can enable the inlining of indirect call targets by allowing
1914 // the caller processed before them.
1915 // 2. Mutual call edges in an SCC. The static processing order computed for
1916 // an SCC may not reflect the call contexts in the context-sensitive
1917 // profile, thus may cause potential inlining to be overlooked. The
1918 // function order in one SCC is being adjusted to a top-down order based
1919 // on the profile to favor more inlining. This is only a problem with CS
1920 // profile.
1921 // 3. Transitive indirect call edges due to inlining. When a callee function
1922 // (say B) is inlined into a caller function (say A) in LTO prelink,
1923 // every call edge originated from the callee B will be transferred to
1924 // the caller A. If any transferred edge (say A->C) is indirect, the
1925 // original profiled indirect edge B->C, even if considered, would not
1926 // enforce a top-down order from the caller A to the potential indirect
1927 // call target C in LTO postlink since the inlined callee B is gone from
1928 // the static call graph.
1929 // 4. #3 can happen even for direct call targets, due to functions defined
1930 // in header files. A header function (say A), when included into source
1931 // files, is defined multiple times but only one definition survives due
1932 // to ODR. Therefore, the LTO prelink inlining done on those dropped
1933 // definitions can be useless based on a local file scope. More
1934 // importantly, the inlinee (say B), once fully inlined to a
1935 // to-be-dropped A, will have no profile to consume when its outlined
1936 // version is compiled. This can lead to a profile-less prelink
1937 // compilation for the outlined version of B which may be called from
1938 // external modules. while this isn't easy to fix, we rely on the
1939 // postlink AutoFDO pipeline to optimize B. Since the survived copy of
1940 // the A can be inlined in its local scope in prelink, it may not exist
1941 // in the merged IR in postlink, and we'll need the profiled call edges
1942 // to enforce a top-down order for the rest of the functions.
1943 //
1944 // Considering those cases, a profiled call graph completely independent of
1945 // the static call graph is constructed based on profile data, where
1946 // function objects are not even needed to handle case #3 and case 4.
1947 //
1948 // Note that static callgraph edges are completely ignored since they
1949 // can be conflicting with profiled edges for cyclic SCCs and may result in
1950 // an SCC order incompatible with profile-defined one. Using strictly
1951 // profile order ensures a maximum inlining experience. On the other hand,
1952 // static call edges are not so important when they don't correspond to a
1953 // context in the profile.
1954
1955 std::unique_ptr<ProfiledCallGraph> ProfiledCG = buildProfiledCallGraph(M);
1956 scc_iterator<ProfiledCallGraph *> CGI = scc_begin(ProfiledCG.get());
1957 while (!CGI.isAtEnd()) {
1958 auto Range = *CGI;
1959 if (SortProfiledSCC) {
1960 // Sort nodes in one SCC based on callsite hotness.
1962 Range = *SI;
1963 }
1964 for (auto *Node : Range) {
1965 Function *F = SymbolMap.lookup(Node->Name);
1966 if (F && !F->isDeclaration() && F->hasFnAttribute("use-sample-profile"))
1967 FunctionOrderList.push_back(F);
1968 }
1969 ++CGI;
1970 }
1971 } else {
1972 CG.buildRefSCCs();
1973 for (LazyCallGraph::RefSCC &RC : CG.postorder_ref_sccs()) {
1974 for (LazyCallGraph::SCC &C : RC) {
1975 for (LazyCallGraph::Node &N : C) {
1976 Function &F = N.getFunction();
1977 if (!F.isDeclaration() && F.hasFnAttribute("use-sample-profile"))
1978 FunctionOrderList.push_back(&F);
1979 }
1980 }
1981 }
1982 }
1983
1984 std::reverse(FunctionOrderList.begin(), FunctionOrderList.end());
1985
1986 LLVM_DEBUG({
1987 dbgs() << "Function processing order:\n";
1988 for (auto F : FunctionOrderList) {
1989 dbgs() << F->getName() << "\n";
1990 }
1991 });
1992
1993 return FunctionOrderList;
1994}
1995
1996bool SampleProfileLoader::doInitialization(Module &M,
1998 auto &Ctx = M.getContext();
1999
2000 auto ReaderOrErr = SampleProfileReader::create(
2001 Filename, Ctx, *FS, FSDiscriminatorPass::Base, RemappingFilename);
2002 if (std::error_code EC = ReaderOrErr.getError()) {
2003 std::string Msg = "Could not open profile: " + EC.message();
2004 Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
2005 return false;
2006 }
2007 Reader = std::move(ReaderOrErr.get());
2008 Reader->setSkipFlatProf(LTOPhase == ThinOrFullLTOPhase::ThinLTOPostLink);
2009 // set module before reading the profile so reader may be able to only
2010 // read the function profiles which are used by the current module.
2011 Reader->setModule(&M);
2012 if (std::error_code EC = Reader->read()) {
2013 std::string Msg = "profile reading failed: " + EC.message();
2014 Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
2015 return false;
2016 }
2017
2018 PSL = Reader->getProfileSymbolList();
2019
2020 // While profile-sample-accurate is on, ignore symbol list.
2021 ProfAccForSymsInList =
2023 if (ProfAccForSymsInList) {
2024 NamesInProfile.clear();
2025 if (auto NameTable = Reader->getNameTable())
2026 NamesInProfile.insert(NameTable->begin(), NameTable->end());
2027 CoverageTracker.setProfAccForSymsInList(true);
2028 }
2029
2030 if (FAM && !ProfileInlineReplayFile.empty()) {
2031 ExternalInlineAdvisor = getReplayInlineAdvisor(
2032 M, *FAM, Ctx, /*OriginalAdvisor=*/nullptr,
2037 /*EmitRemarks=*/false, InlineContext{LTOPhase, InlinePass::ReplaySampleProfileInliner});
2038 }
2039
2040 // Apply tweaks if context-sensitive or probe-based profile is available.
2041 if (Reader->profileIsCS() || Reader->profileIsPreInlined() ||
2042 Reader->profileIsProbeBased()) {
2043 if (!UseIterativeBFIInference.getNumOccurrences())
2045 if (!SampleProfileUseProfi.getNumOccurrences())
2046 SampleProfileUseProfi = true;
2047 if (!EnableExtTspBlockPlacement.getNumOccurrences())
2049 // Enable priority-base inliner and size inline by default for CSSPGO.
2050 if (!ProfileSizeInline.getNumOccurrences())
2051 ProfileSizeInline = true;
2052 if (!CallsitePrioritizedInline.getNumOccurrences())
2054 // For CSSPGO, we also allow recursive inline to best use context profile.
2055 if (!AllowRecursiveInline.getNumOccurrences())
2056 AllowRecursiveInline = true;
2057
2058 if (Reader->profileIsPreInlined()) {
2059 if (!UsePreInlinerDecision.getNumOccurrences())
2060 UsePreInlinerDecision = true;
2061 }
2062
2063 // Enable stale profile matching by default for probe-based profile.
2064 // Currently the matching relies on if the checksum mismatch is detected,
2065 // which is currently only available for pseudo-probe mode. Removing the
2066 // checksum check could cause regressions for some cases, so further tuning
2067 // might be needed if we want to enable it for all cases.
2068 if (Reader->profileIsProbeBased() &&
2069 !SalvageStaleProfile.getNumOccurrences()) {
2070 SalvageStaleProfile = true;
2071 }
2072
2073 if (!Reader->profileIsCS()) {
2074 // Non-CS profile should be fine without a function size budget for the
2075 // inliner since the contexts in the profile are either all from inlining
2076 // in the prevoius build or pre-computed by the preinliner with a size
2077 // cap, thus they are bounded.
2078 if (!ProfileInlineLimitMin.getNumOccurrences())
2079 ProfileInlineLimitMin = std::numeric_limits<unsigned>::max();
2080 if (!ProfileInlineLimitMax.getNumOccurrences())
2081 ProfileInlineLimitMax = std::numeric_limits<unsigned>::max();
2082 }
2083 }
2084
2085 if (Reader->profileIsCS()) {
2086 // Tracker for profiles under different context
2087 ContextTracker = std::make_unique<SampleContextTracker>(
2088 Reader->getProfiles(), &GUIDToFuncNameMap);
2089 }
2090
2091 // Load pseudo probe descriptors for probe-based function samples.
2092 if (Reader->profileIsProbeBased()) {
2093 ProbeManager = std::make_unique<PseudoProbeManager>(M);
2094 if (!ProbeManager->moduleIsProbed(M)) {
2095 const char *Msg =
2096 "Pseudo-probe-based profile requires SampleProfileProbePass";
2097 Ctx.diagnose(DiagnosticInfoSampleProfile(M.getModuleIdentifier(), Msg,
2098 DS_Warning));
2099 return false;
2100 }
2101 }
2102
2105 MatchingManager =
2106 std::make_unique<SampleProfileMatcher>(M, *Reader, ProbeManager.get());
2107 }
2108
2109 return true;
2110}
2111
2112void SampleProfileMatcher::findIRAnchors(
2113 const Function &F, std::map<LineLocation, StringRef> &IRAnchors) {
2114 // For inlined code, recover the original callsite and callee by finding the
2115 // top-level inline frame. e.g. For frame stack "main:1 @ foo:2 @ bar:3", the
2116 // top-level frame is "main:1", the callsite is "1" and the callee is "foo".
2117 auto FindTopLevelInlinedCallsite = [](const DILocation *DIL) {
2118 assert((DIL && DIL->getInlinedAt()) && "No inlined callsite");
2119 const DILocation *PrevDIL = nullptr;
2120 do {
2121 PrevDIL = DIL;
2122 DIL = DIL->getInlinedAt();
2123 } while (DIL->getInlinedAt());
2124
2126 StringRef CalleeName = PrevDIL->getSubprogramLinkageName();
2127 return std::make_pair(Callsite, CalleeName);
2128 };
2129
2130 auto GetCanonicalCalleeName = [](const CallBase *CB) {
2131 StringRef CalleeName = UnknownIndirectCallee;
2132 if (Function *Callee = CB->getCalledFunction())
2133 CalleeName = FunctionSamples::getCanonicalFnName(Callee->getName());
2134 return CalleeName;
2135 };
2136
2137 // Extract profile matching anchors in the IR.
2138 for (auto &BB : F) {
2139 for (auto &I : BB) {
2140 DILocation *DIL = I.getDebugLoc();
2141 if (!DIL)
2142 continue;
2143
2145 if (auto Probe = extractProbe(I)) {
2146 // Flatten inlined IR for the matching.
2147 if (DIL->getInlinedAt()) {
2148 IRAnchors.emplace(FindTopLevelInlinedCallsite(DIL));
2149 } else {
2150 // Use empty StringRef for basic block probe.
2151 StringRef CalleeName;
2152 if (const auto *CB = dyn_cast<CallBase>(&I)) {
2153 // Skip the probe inst whose callee name is "llvm.pseudoprobe".
2154 if (!isa<IntrinsicInst>(&I))
2155 CalleeName = GetCanonicalCalleeName(CB);
2156 }
2157 IRAnchors.emplace(LineLocation(Probe->Id, 0), CalleeName);
2158 }
2159 }
2160 } else {
2161 // TODO: For line-number based profile(AutoFDO), currently only support
2162 // find callsite anchors. In future, we need to parse all the non-call
2163 // instructions to extract the line locations for profile matching.
2164 if (!isa<CallBase>(&I) || isa<IntrinsicInst>(&I))
2165 continue;
2166
2167 if (DIL->getInlinedAt()) {
2168 IRAnchors.emplace(FindTopLevelInlinedCallsite(DIL));
2169 } else {
2171 StringRef CalleeName = GetCanonicalCalleeName(dyn_cast<CallBase>(&I));
2172 IRAnchors.emplace(Callsite, CalleeName);
2173 }
2174 }
2175 }
2176 }
2177}
2178
2179void SampleProfileMatcher::countMismatchedSamples(const FunctionSamples &FS) {
2180 const auto *FuncDesc = ProbeManager->getDesc(FS.getName());
2181 // Skip the function that is external or renamed.
2182 if (!FuncDesc)
2183 return;
2184
2185 if (ProbeManager->profileIsHashMismatched(*FuncDesc, FS)) {
2186 MismatchedFuncHashSamples += FS.getTotalSamples();
2187 return;
2188 }
2189 for (const auto &I : FS.getCallsiteSamples())
2190 for (const auto &CS : I.second)
2191 countMismatchedSamples(CS.second);
2192}
2193
2194void SampleProfileMatcher::countProfileMismatches(
2195 const Function &F, const FunctionSamples &FS,
2196 const std::map<LineLocation, StringRef> &IRAnchors,
2197 const std::map<LineLocation, StringSet<>> &ProfileAnchors) {
2198 [[maybe_unused]] bool IsFuncHashMismatch = false;
2200 TotalFuncHashSamples += FS.getTotalSamples();
2201 TotalProfiledFunc++;
2202 const auto *FuncDesc = ProbeManager->getDesc(F);
2203 if (FuncDesc) {
2204 if (ProbeManager->profileIsHashMismatched(*FuncDesc, FS)) {
2205 NumMismatchedFuncHash++;
2206 IsFuncHashMismatch = true;
2207 }
2208 countMismatchedSamples(FS);
2209 }
2210 }
2211
2212 uint64_t FuncMismatchedCallsites = 0;
2213 uint64_t FuncProfiledCallsites = 0;
2214 countProfileCallsiteMismatches(FS, IRAnchors, ProfileAnchors,
2215 FuncMismatchedCallsites,
2216 FuncProfiledCallsites);
2217 TotalProfiledCallsites += FuncProfiledCallsites;
2218 NumMismatchedCallsites += FuncMismatchedCallsites;
2219 LLVM_DEBUG({
2220 if (FunctionSamples::ProfileIsProbeBased && !IsFuncHashMismatch &&
2221 FuncMismatchedCallsites)
2222 dbgs() << "Function checksum is matched but there are "
2223 << FuncMismatchedCallsites << "/" << FuncProfiledCallsites
2224 << " mismatched callsites.\n";
2225 });
2226}
2227
2228void SampleProfileMatcher::countProfileCallsiteMismatches(
2229 const FunctionSamples &FS,
2230 const std::map<LineLocation, StringRef> &IRAnchors,
2231 const std::map<LineLocation, StringSet<>> &ProfileAnchors,
2232 uint64_t &FuncMismatchedCallsites, uint64_t &FuncProfiledCallsites) {
2233
2234 // Check if there are any callsites in the profile that does not match to any
2235 // IR callsites, those callsite samples will be discarded.
2236 for (const auto &I : ProfileAnchors) {
2237 const auto &Loc = I.first;
2238 const auto &Callees = I.second;
2239 assert(!Callees.empty() && "Callees should not be empty");
2240
2241 StringRef IRCalleeName;
2242 const auto &IR = IRAnchors.find(Loc);
2243 if (IR != IRAnchors.end())
2244 IRCalleeName = IR->second;
2245
2246 // Compute number of samples in the original profile.
2247 uint64_t CallsiteSamples = 0;
2248 auto CTM = FS.findCallTargetMapAt(Loc);
2249 if (CTM) {
2250 for (const auto &I : CTM.get())
2251 CallsiteSamples += I.second;
2252 }
2253 const auto *FSMap = FS.findFunctionSamplesMapAt(Loc);
2254 if (FSMap) {
2255 for (const auto &I : *FSMap)
2256 CallsiteSamples += I.second.getTotalSamples();
2257 }
2258
2259 bool CallsiteIsMatched = false;
2260 // Since indirect call does not have CalleeName, check conservatively if
2261 // callsite in the profile is a callsite location. This is to reduce num of
2262 // false positive since otherwise all the indirect call samples will be
2263 // reported as mismatching.
2264 if (IRCalleeName == UnknownIndirectCallee)
2265 CallsiteIsMatched = true;
2266 else if (Callees.size() == 1 && Callees.count(IRCalleeName))
2267 CallsiteIsMatched = true;
2268
2269 FuncProfiledCallsites++;
2270 TotalCallsiteSamples += CallsiteSamples;
2271 if (!CallsiteIsMatched) {
2272 FuncMismatchedCallsites++;
2273 MismatchedCallsiteSamples += CallsiteSamples;
2274 }
2275 }
2276}
2277
2278void SampleProfileMatcher::findProfileAnchors(
2279 const FunctionSamples &FS,
2280 std::map<LineLocation, StringSet<>> &ProfileAnchors) {
2281 auto isInvalidLineOffset = [](uint32_t LineOffset) {
2282 return LineOffset & 0x8000;
2283 };
2284
2285 for (const auto &I : FS.getBodySamples()) {
2286 const LineLocation &Loc = I.first;
2287 if (isInvalidLineOffset(Loc.LineOffset))
2288 continue;
2289 for (const auto &I : I.second.getCallTargets()) {
2290 auto Ret = ProfileAnchors.try_emplace(Loc, StringSet<>());
2291 Ret.first->second.insert(I.first());
2292 }
2293 }
2294
2295 for (const auto &I : FS.getCallsiteSamples()) {
2296 const LineLocation &Loc = I.first;
2297 if (isInvalidLineOffset(Loc.LineOffset))
2298 continue;
2299 const auto &CalleeMap = I.second;
2300 for (const auto &I : CalleeMap) {
2301 auto Ret = ProfileAnchors.try_emplace(Loc, StringSet<>());
2302 Ret.first->second.insert(I.first);
2303 }
2304 }
2305}
2306
2307// Call target name anchor based profile fuzzy matching.
2308// Input:
2309// For IR locations, the anchor is the callee name of direct callsite; For
2310// profile locations, it's the call target name for BodySamples or inlinee's
2311// profile name for CallsiteSamples.
2312// Matching heuristic:
2313// First match all the anchors in lexical order, then split the non-anchor
2314// locations between the two anchors evenly, first half are matched based on the
2315// start anchor, second half are matched based on the end anchor.
2316// For example, given:
2317// IR locations: [1, 2(foo), 3, 5, 6(bar), 7]
2318// Profile locations: [1, 2, 3(foo), 4, 7, 8(bar), 9]
2319// The matching gives:
2320// [1, 2(foo), 3, 5, 6(bar), 7]
2321// | | | | | |
2322// [1, 2, 3(foo), 4, 7, 8(bar), 9]
2323// The output mapping: [2->3, 3->4, 5->7, 6->8, 7->9].
2324void SampleProfileMatcher::runStaleProfileMatching(
2325 const Function &F, const std::map<LineLocation, StringRef> &IRAnchors,
2326 const std::map<LineLocation, StringSet<>> &ProfileAnchors,
2327 LocToLocMap &IRToProfileLocationMap) {
2328 LLVM_DEBUG(dbgs() << "Run stale profile matching for " << F.getName()
2329 << "\n");
2330 assert(IRToProfileLocationMap.empty() &&
2331 "Run stale profile matching only once per function");
2332
2333 StringMap<std::set<LineLocation>> CalleeToCallsitesMap;
2334 for (const auto &I : ProfileAnchors) {
2335 const auto &Loc = I.first;
2336 const auto &Callees = I.second;
2337 // Filter out possible indirect calls, use direct callee name as anchor.
2338 if (Callees.size() == 1) {
2339 StringRef CalleeName = Callees.begin()->first();
2340 const auto &Candidates = CalleeToCallsitesMap.try_emplace(
2341 CalleeName, std::set<LineLocation>());
2342 Candidates.first->second.insert(Loc);
2343 }
2344 }
2345
2346 auto InsertMatching = [&](const LineLocation &From, const LineLocation &To) {
2347 // Skip the unchanged location mapping to save memory.
2348 if (From != To)
2349 IRToProfileLocationMap.insert({From, To});
2350 };
2351
2352 // Use function's beginning location as the initial anchor.
2353 int32_t LocationDelta = 0;
2354 SmallVector<LineLocation> LastMatchedNonAnchors;
2355
2356 for (const auto &IR : IRAnchors) {
2357 const auto &Loc = IR.first;
2358 StringRef CalleeName = IR.second;
2359 bool IsMatchedAnchor = false;
2360 // Match the anchor location in lexical order.
2361 if (!CalleeName.empty()) {
2362 auto CandidateAnchors = CalleeToCallsitesMap.find(CalleeName);
2363 if (CandidateAnchors != CalleeToCallsitesMap.end() &&
2364 !CandidateAnchors->second.empty()) {
2365 auto CI = CandidateAnchors->second.begin();
2366 const auto Candidate = *CI;
2367 CandidateAnchors->second.erase(CI);
2368 InsertMatching(Loc, Candidate);
2369 LLVM_DEBUG(dbgs() << "Callsite with callee:" << CalleeName
2370 << " is matched from " << Loc << " to " << Candidate
2371 << "\n");
2372 LocationDelta = Candidate.LineOffset - Loc.LineOffset;
2373
2374 // Match backwards for non-anchor locations.
2375 // The locations in LastMatchedNonAnchors have been matched forwards
2376 // based on the previous anchor, spilt it evenly and overwrite the
2377 // second half based on the current anchor.
2378 for (size_t I = (LastMatchedNonAnchors.size() + 1) / 2;
2379 I < LastMatchedNonAnchors.size(); I++) {
2380 const auto &L = LastMatchedNonAnchors[I];
2381 uint32_t CandidateLineOffset = L.LineOffset + LocationDelta;
2382 LineLocation Candidate(CandidateLineOffset, L.Discriminator);
2383 InsertMatching(L, Candidate);
2384 LLVM_DEBUG(dbgs() << "Location is rematched backwards from " << L
2385 << " to " << Candidate << "\n");
2386 }
2387
2388 IsMatchedAnchor = true;
2389 LastMatchedNonAnchors.clear();
2390 }
2391 }
2392
2393 // Match forwards for non-anchor locations.
2394 if (!IsMatchedAnchor) {
2395 uint32_t CandidateLineOffset = Loc.LineOffset + LocationDelta;
2396 LineLocation Candidate(CandidateLineOffset, Loc.Discriminator);
2397 InsertMatching(Loc, Candidate);
2398 LLVM_DEBUG(dbgs() << "Location is matched from " << Loc << " to "
2399 << Candidate << "\n");
2400 LastMatchedNonAnchors.emplace_back(Loc);
2401 }
2402 }
2403}
2404
2405void SampleProfileMatcher::runOnFunction(const Function &F) {
2406 // We need to use flattened function samples for matching.
2407 // Unlike IR, which includes all callsites from the source code, the callsites
2408 // in profile only show up when they are hit by samples, i,e. the profile
2409 // callsites in one context may differ from those in another context. To get
2410 // the maximum number of callsites, we merge the function profiles from all
2411 // contexts, aka, the flattened profile to find profile anchors.
2412 const auto *FSFlattened = getFlattenedSamplesFor(F);
2413 if (!FSFlattened)
2414 return;
2415
2416 // Anchors for IR. It's a map from IR location to callee name, callee name is
2417 // empty for non-call instruction and use a dummy name(UnknownIndirectCallee)
2418 // for unknown indrect callee name.
2419 std::map<LineLocation, StringRef> IRAnchors;
2420 findIRAnchors(F, IRAnchors);
2421 // Anchors for profile. It's a map from callsite location to a set of callee
2422 // name.
2423 std::map<LineLocation, StringSet<>> ProfileAnchors;
2424 findProfileAnchors(*FSFlattened, ProfileAnchors);
2425
2426 // Detect profile mismatch for profile staleness metrics report.
2427 // Skip reporting the metrics for imported functions.
2428 if (!GlobalValue::isAvailableExternallyLinkage(F.getLinkage()) &&
2430 // Use top-level nested FS for counting profile mismatch metrics since
2431 // currently once a callsite is mismatched, all its children profiles are
2432 // dropped.
2433 if (const auto *FS = Reader.getSamplesFor(F))
2434 countProfileMismatches(F, *FS, IRAnchors, ProfileAnchors);
2435 }
2436
2437 // Run profile matching for checksum mismatched profile, currently only
2438 // support for pseudo-probe.
2440 !ProbeManager->profileIsValid(F, *FSFlattened)) {
2441 // The matching result will be saved to IRToProfileLocationMap, create a new
2442 // map for each function.
2443 runStaleProfileMatching(F, IRAnchors, ProfileAnchors,
2444 getIRToProfileLocationMap(F));
2445 }
2446}
2447
2448void SampleProfileMatcher::runOnModule() {
2449 ProfileConverter::flattenProfile(Reader.getProfiles(), FlattenedProfiles,
2451 for (auto &F : M) {
2452 if (F.isDeclaration() || !F.hasFnAttribute("use-sample-profile"))
2453 continue;
2455 }
2457 distributeIRToProfileLocationMap();
2458
2461 errs() << "(" << NumMismatchedFuncHash << "/" << TotalProfiledFunc << ")"
2462 << " of functions' profile are invalid and "
2463 << " (" << MismatchedFuncHashSamples << "/" << TotalFuncHashSamples
2464 << ")"
2465 << " of samples are discarded due to function hash mismatch.\n";
2466 }
2467 errs() << "(" << NumMismatchedCallsites << "/" << TotalProfiledCallsites
2468 << ")"
2469 << " of callsites' profile are invalid and "
2470 << "(" << MismatchedCallsiteSamples << "/" << TotalCallsiteSamples
2471 << ")"
2472 << " of samples are discarded due to callsite location mismatch.\n";
2473 }
2474
2476 LLVMContext &Ctx = M.getContext();
2477 MDBuilder MDB(Ctx);
2478
2481 ProfStatsVec.emplace_back("NumMismatchedFuncHash", NumMismatchedFuncHash);
2482 ProfStatsVec.emplace_back("TotalProfiledFunc", TotalProfiledFunc);
2483 ProfStatsVec.emplace_back("MismatchedFuncHashSamples",
2484 MismatchedFuncHashSamples);
2485 ProfStatsVec.emplace_back("TotalFuncHashSamples", TotalFuncHashSamples);
2486 }
2487
2488 ProfStatsVec.emplace_back("NumMismatchedCallsites", NumMismatchedCallsites);
2489 ProfStatsVec.emplace_back("TotalProfiledCallsites", TotalProfiledCallsites);
2490 ProfStatsVec.emplace_back("MismatchedCallsiteSamples",
2491 MismatchedCallsiteSamples);
2492 ProfStatsVec.emplace_back("TotalCallsiteSamples", TotalCallsiteSamples);
2493
2494 auto *MD = MDB.createLLVMStats(ProfStatsVec);
2495 auto *NMD = M.getOrInsertNamedMetadata("llvm.stats");
2496 NMD->addOperand(MD);
2497 }
2498}
2499
2500void SampleProfileMatcher::distributeIRToProfileLocationMap(
2501 FunctionSamples &FS) {
2502 const auto ProfileMappings = FuncMappings.find(FS.getName());
2503 if (ProfileMappings != FuncMappings.end()) {
2504 FS.setIRToProfileLocationMap(&(ProfileMappings->second));
2505 }
2506
2507 for (auto &Inlinees : FS.getCallsiteSamples()) {
2508 for (auto FS : Inlinees.second) {
2509 distributeIRToProfileLocationMap(FS.second);
2510 }
2511 }
2512}
2513
2514// Use a central place to distribute the matching results. Outlined and inlined
2515// profile with the function name will be set to the same pointer.
2516void SampleProfileMatcher::distributeIRToProfileLocationMap() {
2517 for (auto &I : Reader.getProfiles()) {
2518 distributeIRToProfileLocationMap(I.second);
2519 }
2520}
2521
2522bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM,
2523 ProfileSummaryInfo *_PSI,
2524 LazyCallGraph &CG) {
2525 GUIDToFuncNameMapper Mapper(M, *Reader, GUIDToFuncNameMap);
2526
2527 PSI = _PSI;
2528 if (M.getProfileSummary(/* IsCS */ false) == nullptr) {
2529 M.setProfileSummary(Reader->getSummary().getMD(M.getContext()),
2531 PSI->refresh();
2532 }
2533 // Compute the total number of samples collected in this profile.
2534 for (const auto &I : Reader->getProfiles())
2535 TotalCollectedSamples += I.second.getTotalSamples();
2536
2537 auto Remapper = Reader->getRemapper();
2538 // Populate the symbol map.
2539 for (const auto &N_F : M.getValueSymbolTable()) {
2540 StringRef OrigName = N_F.getKey();
2541 Function *F = dyn_cast<Function>(N_F.getValue());
2542 if (F == nullptr || OrigName.empty())
2543 continue;
2544 SymbolMap[OrigName] = F;
2546 if (OrigName != NewName && !NewName.empty()) {
2547 auto r = SymbolMap.insert(std::make_pair(NewName, F));
2548 // Failiing to insert means there is already an entry in SymbolMap,
2549 // thus there are multiple functions that are mapped to the same
2550 // stripped name. In this case of name conflicting, set the value
2551 // to nullptr to avoid confusion.
2552 if (!r.second)
2553 r.first->second = nullptr;
2554 OrigName = NewName;
2555 }
2556 // Insert the remapped names into SymbolMap.
2557 if (Remapper) {
2558 if (auto MapName = Remapper->lookUpNameInProfile(OrigName)) {
2559 if (*MapName != OrigName && !MapName->empty())
2560 SymbolMap.insert(std::make_pair(*MapName, F));
2561 }
2562 }
2563 }
2564 assert(SymbolMap.count(StringRef()) == 0 &&
2565 "No empty StringRef should be added in SymbolMap");
2566
2569 MatchingManager->runOnModule();
2570 }
2571
2572 bool retval = false;
2573 for (auto *F : buildFunctionOrder(M, CG)) {
2574 assert(!F->isDeclaration());
2575 clearFunctionData();
2576 retval |= runOnFunction(*F, AM);
2577 }
2578
2579 // Account for cold calls not inlined....
2581 for (const std::pair<Function *, NotInlinedProfileInfo> &pair :
2582 notInlinedCallInfo)
2583 updateProfileCallee(pair.first, pair.second.entryCount);
2584
2585 return retval;
2586}
2587
2588bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) {
2589 LLVM_DEBUG(dbgs() << "\n\nProcessing Function " << F.getName() << "\n");
2590 DILocation2SampleMap.clear();
2591 // By default the entry count is initialized to -1, which will be treated
2592 // conservatively by getEntryCount as the same as unknown (None). This is
2593 // to avoid newly added code to be treated as cold. If we have samples
2594 // this will be overwritten in emitAnnotations.
2595 uint64_t initialEntryCount = -1;
2596
2597 ProfAccForSymsInList = ProfileAccurateForSymsInList && PSL;
2598 if (ProfileSampleAccurate || F.hasFnAttribute("profile-sample-accurate")) {
2599 // initialize all the function entry counts to 0. It means all the
2600 // functions without profile will be regarded as cold.
2601 initialEntryCount = 0;
2602 // profile-sample-accurate is a user assertion which has a higher precedence
2603 // than symbol list. When profile-sample-accurate is on, ignore symbol list.
2604 ProfAccForSymsInList = false;
2605 }
2606 CoverageTracker.setProfAccForSymsInList(ProfAccForSymsInList);
2607
2608 // PSL -- profile symbol list include all the symbols in sampled binary.
2609 // If ProfileAccurateForSymsInList is enabled, PSL is used to treat
2610 // old functions without samples being cold, without having to worry
2611 // about new and hot functions being mistakenly treated as cold.
2612 if (ProfAccForSymsInList) {
2613 // Initialize the entry count to 0 for functions in the list.
2614 if (PSL->contains(F.getName()))
2615 initialEntryCount = 0;
2616
2617 // Function in the symbol list but without sample will be regarded as
2618 // cold. To minimize the potential negative performance impact it could
2619 // have, we want to be a little conservative here saying if a function
2620 // shows up in the profile, no matter as outline function, inline instance
2621 // or call targets, treat the function as not being cold. This will handle
2622 // the cases such as most callsites of a function are inlined in sampled
2623 // binary but not inlined in current build (because of source code drift,
2624 // imprecise debug information, or the callsites are all cold individually
2625 // but not cold accumulatively...), so the outline function showing up as
2626 // cold in sampled binary will actually not be cold after current build.
2628 if (NamesInProfile.count(CanonName))
2629 initialEntryCount = -1;
2630 }
2631
2632 // Initialize entry count when the function has no existing entry
2633 // count value.
2634 if (!F.getEntryCount())
2635 F.setEntryCount(ProfileCount(initialEntryCount, Function::PCT_Real));
2636 std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;
2637 if (AM) {
2638 auto &FAM =
2640 .getManager();
2642 } else {
2643 OwnedORE = std::make_unique<OptimizationRemarkEmitter>(&F);
2644 ORE = OwnedORE.get();
2645 }
2646
2648 Samples = ContextTracker->getBaseSamplesFor(F);
2649 else {
2650 Samples = Reader->getSamplesFor(F);
2651 // Try search in previously inlined functions that were split or duplicated
2652 // into base.
2653 if (!Samples) {
2655 auto It = OutlineFunctionSamples.find(CanonName);
2656 if (It != OutlineFunctionSamples.end()) {
2657 Samples = &It->second;
2658 } else if (auto Remapper = Reader->getRemapper()) {
2659 if (auto RemppedName = Remapper->lookUpNameInProfile(CanonName)) {
2660 It = OutlineFunctionSamples.find(*RemppedName);
2661 if (It != OutlineFunctionSamples.end())
2662 Samples = &It->second;
2663 }
2664 }
2665 }
2666 }
2667
2668 if (Samples && !Samples->empty())
2669 return emitAnnotations(F);
2670 return false;
2671}
2673 std::string File, std::string RemappingFile, ThinOrFullLTOPhase LTOPhase,
2675 : ProfileFileName(File), ProfileRemappingFileName(RemappingFile),
2676 LTOPhase(LTOPhase), FS(std::move(FS)) {}
2677
2682
2683 auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
2685 };
2686 auto GetTTI = [&](Function &F) -> TargetTransformInfo & {
2688 };
2689 auto GetTLI = [&](Function &F) -> const TargetLibraryInfo & {
2691 };
2692
2693 if (!FS)
2695
2696 SampleProfileLoader SampleLoader(
2697 ProfileFileName.empty() ? SampleProfileFile : ProfileFileName,
2698 ProfileRemappingFileName.empty() ? SampleProfileRemappingFile
2699 : ProfileRemappingFileName,
2700 LTOPhase, FS, GetAssumptionCache, GetTTI, GetTLI);
2701
2702 if (!SampleLoader.doInitialization(M, &FAM))
2703 return PreservedAnalyses::all();
2704
2707 if (!SampleLoader.runOnModule(M, &AM, PSI, CG))
2708 return PreservedAnalyses::all();
2709
2710 return PreservedAnalyses::none();
2711}
This file defines the StringMap class.
BlockVerifier::State From
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:680
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
std::string Name
static bool runOnFunction(Function &F, bool PostInlining)
Provides ErrorOr<T> smart pointer.
static cl::opt< unsigned > SizeLimit("eif-limit", cl::init(6), cl::Hidden, cl::desc("Size limit in Hexagon early if-conversion"))
LVReader * CurrentReader
Definition: LVReader.cpp:153
Implements a lazy call graph analysis and related passes for the new pass manager.
Legalize the Machine IR a function s Machine IR
Definition: Legalizer.cpp:81
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
This file implements a map that provides insertion order iteration.
static const Function * getCalledFunction(const Value *V, bool &IsNoBuiltin)
Module.h This file contains the declarations for the Module class.
LLVMContext & Context
FunctionAnalysisManager FAM
This header defines various interfaces for pass management in LLVM.
This file defines the PriorityQueue class.
This builds on the llvm/ADT/GraphTraits.h file to find the strongly connected components (SCCs) of a ...
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file provides the interface for context-sensitive profile tracker used by CSSPGO.
This file provides the interface for the sampled PGO profile loader base implementation.
This file provides the utility functions for the sampled PGO loader base implementation.
This file provides the interface for the pseudo probe implementation for AutoFDO.
static cl::opt< std::string > SampleProfileFile("sample-profile-file", cl::init(""), cl::value_desc("filename"), cl::desc("Profile file loaded by -sample-profile"), cl::Hidden)
static cl::opt< bool > ProfileSampleBlockAccurate("profile-sample-block-accurate", cl::Hidden, cl::init(false), cl::desc("If the sample profile is accurate, we will mark all un-sampled " "branches and calls as having 0 samples. Otherwise, treat " "them conservatively as unknown. "))
static cl::opt< unsigned > MaxNumPromotions("sample-profile-icp-max-prom", cl::init(3), cl::Hidden, cl::desc("Max number of promotions for a single indirect " "call callsite in sample profile loader"))
static cl::opt< ReplayInlinerSettings::Fallback > ProfileInlineReplayFallback("sample-profile-inline-replay-fallback", cl::init(ReplayInlinerSettings::Fallback::Original), cl::values(clEnumValN(ReplayInlinerSettings::Fallback::Original, "Original", "All decisions not in replay send to original advisor (default)"), clEnumValN(ReplayInlinerSettings::Fallback::AlwaysInline, "AlwaysInline", "All decisions not in replay are inlined"), clEnumValN(ReplayInlinerSettings::Fallback::NeverInline, "NeverInline", "All decisions not in replay are not inlined")), cl::desc("How sample profile inline replay treats sites that don't come " "from the replay. Original: defers to original advisor, " "AlwaysInline: inline all sites not in replay, NeverInline: " "inline no sites not in replay"), cl::Hidden)
static cl::opt< bool > OverwriteExistingWeights("overwrite-existing-weights", cl::Hidden, cl::init(false), cl::desc("Ignore existing branch weights on IR and always overwrite."))
static void updateIDTMetaData(Instruction &Inst, const SmallVectorImpl< InstrProfValueData > &CallTargets, uint64_t Sum)
Update indirect call target profile metadata for Inst.
static cl::opt< bool > AnnotateSampleProfileInlinePhase("annotate-sample-profile-inline-phase", cl::Hidden, cl::init(false), cl::desc("Annotate LTO phase (prelink / postlink), or main (no LTO) for " "sample-profile inline pass name."))
static cl::opt< std::string > ProfileInlineReplayFile("sample-profile-inline-replay", cl::init(""), cl::value_desc("filename"), cl::desc("Optimization remarks file containing inline remarks to be replayed " "by inlining from sample profile loader."), cl::Hidden)
static cl::opt< bool > ProfileMergeInlinee("sample-profile-merge-inlinee", cl::Hidden, cl::init(true), cl::desc("Merge past inlinee's profile to outline version if sample " "profile loader decided not to inline a call site. It will " "only be enabled when top-down order of profile loading is " "enabled. "))
static cl::opt< bool > PersistProfileStaleness("persist-profile-staleness", cl::Hidden, cl::init(false), cl::desc("Compute stale profile statistical metrics and write it into the " "native object file(.llvm_stats section)."))
static bool doesHistoryAllowICP(const Instruction &Inst, StringRef Candidate)
Check whether the indirect call promotion history of Inst allows the promotion for Candidate.
static SmallVector< InstrProfValueData, 2 > GetSortedValueDataFromCallTargets(const SampleRecord::CallTargetMap &M)
Returns the sorted CallTargetMap M by count in descending order.
#define CSINLINE_DEBUG
static cl::opt< bool > UseProfiledCallGraph("use-profiled-call-graph", cl::init(true), cl::Hidden, cl::desc("Process functions in a top-down order " "defined by the profiled call graph when " "-sample-profile-top-down-load is on."))
static cl::opt< ReplayInlinerSettings::Scope > ProfileInlineReplayScope("sample-profile-inline-replay-scope", cl::init(ReplayInlinerSettings::Scope::Function), cl::values(clEnumValN(ReplayInlinerSettings::Scope::Function, "Function", "Replay on functions that have remarks associated " "with them (default)"), clEnumValN(ReplayInlinerSettings::Scope::Module, "Module", "Replay on the entire module")), cl::desc("Whether inline replay should be applied to the entire " "Module or just the Functions (default) that are present as " "callers in remarks during sample profile inlining."), cl::Hidden)
static cl::opt< unsigned > ProfileICPRelativeHotness("sample-profile-icp-relative-hotness", cl::Hidden, cl::init(25), cl::desc("Relative hotness percentage threshold for indirect " "call promotion in proirity-based sample profile loader inlining."))
Function::ProfileCount ProfileCount
static cl::opt< unsigned > ProfileICPRelativeHotnessSkip("sample-profile-icp-relative-hotness-skip", cl::Hidden, cl::init(1), cl::desc("Skip relative hotness check for ICP up to given number of targets."))
static cl::opt< bool > ReportProfileStaleness("report-profile-staleness", cl::Hidden, cl::init(false), cl::desc("Compute and report stale profile statistical metrics."))
static cl::opt< bool > UsePreInlinerDecision("sample-profile-use-preinliner", cl::Hidden, cl::desc("Use the preinliner decisions stored in profile context."))
static cl::opt< bool > ProfileAccurateForSymsInList("profile-accurate-for-symsinlist", cl::Hidden, cl::init(true), cl::desc("For symbols in profile symbol list, regard their profiles to " "be accurate. It may be overriden by profile-sample-accurate. "))
#define DEBUG_TYPE
static cl::opt< bool > DisableSampleLoaderInlining("disable-sample-loader-inlining", cl::Hidden, cl::init(false), cl::desc("If true, artifically skip inline transformation in sample-loader " "pass, and merge (or scale) profiles (as configured by " "--sample-profile-merge-inlinee)."))
static cl::opt< bool > ProfileSizeInline("sample-profile-inline-size", cl::Hidden, cl::init(false), cl::desc("Inline cold call sites in profile loader if it's beneficial " "for code size."))
static cl::opt< bool > SalvageStaleProfile("salvage-stale-profile", cl::Hidden, cl::init(false), cl::desc("Salvage stale profile by fuzzy matching and use the remapped " "location for sample profile query."))
static cl::opt< bool > ProfileTopDownLoad("sample-profile-top-down-load", cl::Hidden, cl::init(true), cl::desc("Do profile annotation and inlining for functions in top-down " "order of call graph during sample profile loading. It only " "works for new pass manager. "))
static cl::opt< bool > ProfileSampleAccurate("profile-sample-accurate", cl::Hidden, cl::init(false), cl::desc("If the sample profile is accurate, we will mark all un-sampled " "callsite and function as having 0 samples. Otherwise, treat " "un-sampled callsites and functions conservatively as unknown. "))
static cl::opt< bool > AllowRecursiveInline("sample-profile-recursive-inline", cl::Hidden, cl::desc("Allow sample loader inliner to inline recursive calls."))
static cl::opt< CallSiteFormat::Format > ProfileInlineReplayFormat("sample-profile-inline-replay-format", cl::init(CallSiteFormat::Format::LineColumnDiscriminator), cl::values(clEnumValN(CallSiteFormat::Format::Line, "Line", "<Line Number>"), clEnumValN(CallSiteFormat::Format::LineColumn, "LineColumn", "<Line Number>:<Column Number>"), clEnumValN(CallSiteFormat::Format::LineDiscriminator, "LineDiscriminator", "<Line Number>.<Discriminator>"), clEnumValN(CallSiteFormat::Format::LineColumnDiscriminator, "LineColumnDiscriminator", "<Line Number>:<Column Number>.<Discriminator> (default)")), cl::desc("How sample profile inline replay file is formatted"), cl::Hidden)
static cl::opt< std::string > SampleProfileRemappingFile("sample-profile-remapping-file", cl::init(""), cl::value_desc("filename"), cl::desc("Profile remapping file loaded by -sample-profile"), cl::Hidden)
static cl::opt< bool > CallsitePrioritizedInline("sample-profile-prioritized-inline", cl::Hidden, cl::desc("Use call site prioritized inlining for sample profile loader." "Currently only CSSPGO is supported."))
This file provides the interface for the sampled PGO loader pass.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
This pass exposes codegen information to IR-level passes.
Defines the virtual file system interface vfs::FileSystem.
Value * RHS
Value * LHS
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:620
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:774
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition: BasicBlock.h:56
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:112
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1190
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1412
This class represents a function call, abstracting a target machine's calling convention.
Debug location.
A debug info location.
Definition: DebugLoc.h:33
unsigned getLine() const
Definition: DebugLoc.cpp:24
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&... Args)
Definition: DenseMap.h:235
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:151
iterator end()
Definition: DenseMap.h:84
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
Diagnostic information for the sample profiler.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:166
Represents either an error or a value T.
Definition: ErrorOr.h:56
Class to represent profile counts.
Definition: Function.h:254
DISubprogram * getSubprogram() const
Get the attached subprogram.
Definition: Metadata.cpp:1727
bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition: Globals.cpp:273
static bool isAvailableExternallyLinkage(LinkageTypes Linkage)
Definition: GlobalValue.h:374
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:652
Represents the cost of inlining a function.
Definition: InlineCost.h:89
static InlineCost getNever(const char *Reason, std::optional< CostBenefitPair > CostBenefit=std::nullopt)
Definition: InlineCost.h:130
static InlineCost getAlways(const char *Reason, std::optional< CostBenefitPair > CostBenefit=std::nullopt)
Definition: InlineCost.h:125
static InlineCost get(int Cost, int Threshold, int StaticBonus=0)
Definition: InlineCost.h:119
This class captures the data input to the InlineFunction call, and records the auxiliary results prod...
Definition: Cloning.h:202
InlineResult is basically true or false.
Definition: InlineCost.h:179
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
Definition: PassManager.h:933
unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:392
bool extractProfTotalWeight(uint64_t &TotalVal) const
Retrieve total raw weight values of a branch.
Definition: Metadata.cpp:1641
const BasicBlock * getParent() const
Definition: Instruction.h:90
BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1521
A smart pointer to a reference-counted object that inherits from RefCountedBase or ThreadSafeRefCount...
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
An analysis pass which computes the call graph for a module.
A node in the call graph.
A RefSCC of the call graph.
An SCC of the call graph.
A lazily constructed view of the call graph of a module.
iterator_range< postorder_ref_scc_iterator > postorder_ref_sccs()
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
VectorType::iterator erase(typename VectorType::iterator Iterator)
Remove the element given by Iterator.
Definition: MapVector.h:172
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: MapVector.h:117
ValueT lookup(const KeyT &Key) const
Definition: MapVector.h:110
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
Diagnostic information for optimization analysis remarks.
Diagnostic information for applied optimization remarks.
PostDominatorTree Class - Concrete subclass of DominatorTree that is used to compute the post-dominat...
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:152
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition: PassManager.h:155
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:158
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Analysis providing profile information.
Metadata * getMD(LLVMContext &Context, bool AddPartialField=true, bool AddPartialProfileRatioField=true)
Return summary information as metadata.
bool profileIsHashMismatched(const PseudoProbeDescriptor &FuncDesc, const FunctionSamples &Samples) const
bool moduleIsProbed(const Module &M) const
bool profileIsValid(const Function &F, const FunctionSamples &Samples) const
const PseudoProbeDescriptor * getDesc(uint64_t GUID) const
Sample profile inference pass.
void computeDominanceAndLoopInfo(FunctionT &F)
virtual ErrorOr< uint64_t > getInstWeight(const InstructionT &Inst)
Get the weight for an instruction.
virtual const FunctionSamples * findFunctionSamples(const InstructionT &I) const
Get the FunctionSamples for an instruction.
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
SampleProfileLoaderPass(std::string File="", std::string RemappingFile="", ThinOrFullLTOPhase LTOPhase=ThinOrFullLTOPhase::None, IntrusiveRefCntPtr< vfs::FileSystem > FS=nullptr)
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:577
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:941
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:809
void push_back(const T &Elt)
Definition: SmallVector.h:416
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1200
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition: StringMap.h:112
iterator end()
Definition: StringMap.h:205
iterator find(StringRef Key)
Definition: StringMap.h:218
std::pair< iterator, bool > try_emplace(StringRef Key, ArgsTy &&...Args)
Emplace a new element for the specified key into the map if the key isn't already in the map.
Definition: StringMap.h:341
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
constexpr bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:134
iterator begin() const
Definition: StringRef.h:111
StringSet - A wrapper for StringMap that provides set-like functionality.
Definition: StringSet.h:23
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
LLVM Value Representation.
Definition: Value.h:74
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
Representation of the samples collected for a function.
Definition: SampleProf.h:751
static uint64_t getGUID(StringRef Name)
Definition: SampleProf.h:1203
void findInlinedFunctions(DenseSet< GlobalValue::GUID > &S, const StringMap< Function * > &SymbolMap, uint64_t Threshold) const
Recursively traverses all children, if the total sample count of the corresponding function is no les...
Definition: SampleProf.h:1041
static StringRef getCanonicalFnName(const Function &F)
Return the canonical name for a function, taking into account suffix elision policy attributes.
Definition: SampleProf.h:1087
StringRef getFuncName() const
Return the original function name.
Definition: SampleProf.h:1074
SampleContext & getContext() const
Definition: SampleProf.h:1183
sampleprof_error merge(const FunctionSamples &Other, uint64_t Weight=1)
Merge the samples in Other into this one.
Definition: SampleProf.h:1001
static LineLocation getCallSiteIdentifier(const DILocation *DIL, bool ProfileIsFS=false)
Returns a unique call site identifier for a given debug location of a call instruction.
Definition: SampleProf.cpp:220
uint64_t getHeadSamplesEstimate() const
Return an estimate of the sample count of the function entry basic block.
Definition: SampleProf.h:952
StringRef getName() const
Return the function name.
Definition: SampleProf.h:1071
const BodySampleMap & getBodySamples() const
Return all the samples collected in the body of the function.
Definition: SampleProf.h:976
static bool UseMD5
Whether the profile uses MD5 to represent string.
Definition: SampleProf.h:1188
static void flattenProfile(SampleProfileMap &ProfileMap, bool ProfileIsCS=false)
Definition: SampleProf.h:1496
bool hasAttribute(ContextAttributeMask A)
Definition: SampleProf.h:612
This class provides operator overloads to the map container using MD5 as the key type,...
Definition: SampleProf.h:1382
iterator find(const SampleContext &Ctx)
Definition: SampleProf.h:1393
Sample-based profile reader.
SampleProfileMap & getProfiles()
Return all the profiles.
bool profileIsProbeBased() const
Whether input profile is based on pseudo probes.
FunctionSamples * getSamplesFor(const Function &F)
Return the samples collected for function F.
bool profileIsPreInlined() const
Whether input profile contains ShouldBeInlined contexts.
std::error_code read()
The interface to read sample profiles from the associated file.
SampleProfileReaderItaniumRemapper * getRemapper()
virtual std::vector< StringRef > * getNameTable()
It includes all the names that have samples either in outline instance or inline instance.
ProfileSummary & getSummary() const
Return the profile summary.
bool profileIsCS() const
Whether input profile is fully context-sensitive.
virtual void setSkipFlatProf(bool Skip)
Don't read profile without context if the flag is set.
static ErrorOr< std::unique_ptr< SampleProfileReader > > create(const std::string Filename, LLVMContext &C, vfs::FileSystem &FS, FSDiscriminatorPass P=FSDiscriminatorPass::Base, const std::string RemapFilename="")
Create a sample profile reader appropriate to the file format.
virtual std::unique_ptr< ProfileSymbolList > getProfileSymbolList()
static const SortedCallTargetSet SortCallTargets(const CallTargetMap &Targets)
Sort call targets in descending order of call frequency.
Definition: SampleProf.h:420
static const CallTargetMap adjustCallTargets(const CallTargetMap &Targets, float DistributionFactor)
Prorate call targets by a distribution factor.
Definition: SampleProf.h:429
Enumerate the SCCs of a directed graph in reverse topological order of the SCC DAG.
Definition: SCCIterator.h:49
bool isAtEnd() const
Direct loop termination test which is more efficient than comparison with end().
Definition: SCCIterator.h:113
Sort the nodes of a directed SCC in the decreasing order of the edge weights.
Definition: SCCIterator.h:253
const CustomOperand< const MCSubtargetInfo & > Msg[]
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ FS
Definition: X86.h:207
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:705
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:445
void checkExpectAnnotations(Instruction &I, const ArrayRef< uint32_t > ExistingWeights, bool IsFrontend)
checkExpectAnnotations - compares PGO counters to the thresholds used for llvm.expect and warns if th...
Definition: MisExpect.cpp:202
DenseMap< SymbolStringPtr, ExecutorSymbolDef > SymbolMap
A map from symbol names (as SymbolStringPtrs) to JITSymbols (address/flags pairs).
Definition: Core.h:121
DiagnosticInfoOptimizationBase::Argument NV
CallBase & promoteIndirectCall(CallBase &CB, Function *F, uint64_t Count, uint64_t TotalCount, bool AttachProfToDirectCall, OptimizationRemarkEmitter *ORE)
NodeAddr< FuncNode * > Func
Definition: RDFGraph.h:393
std::unordered_map< LineLocation, LineLocation, LineLocationHash > LocToLocMap
Definition: SampleProf.h:744
std::map< std::string, FunctionSamples, std::less<> > FunctionSamplesMap
Definition: SampleProf.h:741
bool callsiteIsHot(const FunctionSamples *CallsiteFS, ProfileSummaryInfo *PSI, bool ProfAccForSymsInList)
Return true if the given callsite is hot wrt to hot cutoff threshold.
IntrusiveRefCntPtr< FileSystem > getRealFileSystem()
Gets an vfs::FileSystem for the 'real' file system, as seen by the operating system.
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
static bool isIndirectCall(const MachineInstr &MI)
bool getValueProfDataFromInst(const Instruction &Inst, InstrProfValueKind ValueKind, uint32_t MaxNumValueData, InstrProfValueData ValueData[], uint32_t &ActualNumValueData, uint64_t &TotalC, bool GetNoICPValue=false)
Extract the value profile data from Inst which is annotated with value profile meta data.
Definition: InstrProf.cpp:1163
bool isLegalToPromote(const CallBase &CB, Function *Callee, const char **FailureReason=nullptr)
Return true if the given indirect call site can be made to call Callee.
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1685
cl::opt< int > ProfileInlineLimitMin("sample-profile-inline-limit-min", cl::Hidden, cl::init(100), cl::desc("The lower bound of size growth limit for " "proirity-based sample profile loader inlining."))
bool succ_empty(const Instruction *I)
Definition: CFG.h:255
cl::opt< int > ProfileInlineGrowthLimit("sample-profile-inline-growth-limit", cl::Hidden, cl::init(12), cl::desc("The size growth ratio limit for proirity-based sample profile " "loader inlining."))
scc_iterator< T > scc_begin(const T &G)
Construct the begin iterator for a deduced graph type T.
Definition: SCCIterator.h:233
void setProbeDistributionFactor(Instruction &Inst, float Factor)
Definition: PseudoProbe.cpp:76
std::string AnnotateInlinePassName(InlineContext IC)
ThinOrFullLTOPhase
This enumerates the LLVM full LTO or ThinLTO optimization phases.
Definition: Pass.h:76
InlineCost getInlineCost(CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI, function_ref< AssumptionCache &(Function &)> GetAssumptionCache, function_ref< const TargetLibraryInfo &(Function &)> GetTLI, function_ref< BlockFrequencyInfo &(Function &)> GetBFI=nullptr, ProfileSummaryInfo *PSI=nullptr, OptimizationRemarkEmitter *ORE=nullptr)
Get an InlineCost object representing the cost of inlining this callsite.
cl::opt< bool > SampleProfileUseProfi
void annotateValueSite(Module &M, Instruction &Inst, const InstrProfRecord &InstrProfR, InstrProfValueKind ValueKind, uint32_t SiteIndx, uint32_t MaxMDCount=3)
Get the value profile data for value site SiteIdx from InstrProfR and annotate the instruction Inst w...
Definition: InstrProf.cpp:1118
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1652
llvm::cl::opt< bool > UseIterativeBFIInference
std::optional< PseudoProbe > extractProbe(const Instruction &Inst)
Definition: PseudoProbe.cpp:56
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void emitInlinedIntoBasedOnCost(OptimizationRemarkEmitter &ORE, DebugLoc DLoc, const BasicBlock *Block, const Function &Callee, const Function &Caller, const InlineCost &IC, bool ForProfileContext=false, const char *PassName=nullptr)
Emit ORE message based in cost (default heuristic).
cl::opt< bool > SortProfiledSCC("sort-profiled-scc-member", cl::init(true), cl::Hidden, cl::desc("Sort profiled recursion by edge weights."))
std::unique_ptr< InlineAdvisor > getReplayInlineAdvisor(Module &M, FunctionAnalysisManager &FAM, LLVMContext &Context, std::unique_ptr< InlineAdvisor > OriginalAdvisor, const ReplayInlinerSettings &ReplaySettings, bool EmitRemarks, InlineContext IC)
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
cl::opt< int > ProfileInlineLimitMax("sample-profile-inline-limit-max", cl::Hidden, cl::init(10000), cl::desc("The upper bound of size growth limit for " "proirity-based sample profile loader inlining."))
cl::opt< int > SampleHotCallSiteThreshold("sample-profile-hot-inline-threshold", cl::Hidden, cl::init(3000), cl::desc("Hot callsite threshold for proirity-based sample profile loader " "inlining."))
void updateProfileCallee(Function *Callee, int64_t EntryDelta, const ValueMap< const Value *, WeakTrackingVH > *VMap=nullptr)
Updates profile information by adjusting the entry count by adding EntryDelta then scaling callsite i...
InlineResult InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, bool MergeAttributes=false, AAResults *CalleeAAR=nullptr, bool InsertLifetime=true, Function *ForwardVarArgsTo=nullptr)
This function inlines the called function into the basic block of the caller.
InlineParams getInlineParams()
Generate the parameters to tune the inline cost analysis based only on the commandline options.
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1854
@ DS_Warning
cl::opt< bool > EnableExtTspBlockPlacement
const uint64_t NOMORE_ICP_MAGICNUM
Magic number in the value profile metadata showing a target has been promoted for the instruction and...
Definition: Metadata.h:56
cl::opt< int > SampleColdCallSiteThreshold("sample-profile-cold-inline-threshold", cl::Hidden, cl::init(45), cl::desc("Threshold for inlining cold callsites"))
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
#define N
Used in the streaming interface as the general argument type.
A wrapper of binary function with basic blocks and jumps.
Provides context on when an inline advisor is constructed in the pipeline (e.g., link phase,...
Definition: InlineAdvisor.h:60
Thresholds to tune inline cost analysis.
Definition: InlineCost.h:205
std::optional< bool > AllowRecursiveCall
Indicate whether we allow inlining for recursive call.
Definition: InlineCost.h:238
std::optional< bool > ComputeFullInlineCost
Compute inline cost even when the cost has exceeded the threshold.
Definition: InlineCost.h:232
Represents the relative location of an instruction.
Definition: SampleProf.h:289