LLVM 19.0.0git
SampleProfile.cpp
Go to the documentation of this file.
1//===- SampleProfile.cpp - Incorporate sample profiles into the IR --------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the SampleProfileLoader transformation. This pass
10// reads a profile file generated by a sampling profiler (e.g. Linux Perf -
11// http://perf.wiki.kernel.org/) and generates IR metadata to reflect the
12// profile information in the given profile.
13//
14// This pass generates branch weight annotations on the IR:
15//
16// - prof: Represents branch weights. This annotation is added to branches
17// to indicate the weights of each edge coming out of the branch.
18// The weight of each edge is the weight of the target block for
19// that edge. The weight of a block B is computed as the maximum
20// number of samples found in B.
21//
22//===----------------------------------------------------------------------===//
23
25#include "llvm/ADT/ArrayRef.h"
26#include "llvm/ADT/DenseMap.h"
27#include "llvm/ADT/DenseSet.h"
28#include "llvm/ADT/MapVector.h"
32#include "llvm/ADT/Statistic.h"
33#include "llvm/ADT/StringMap.h"
34#include "llvm/ADT/StringRef.h"
35#include "llvm/ADT/Twine.h"
46#include "llvm/IR/BasicBlock.h"
47#include "llvm/IR/DebugLoc.h"
49#include "llvm/IR/Function.h"
50#include "llvm/IR/GlobalValue.h"
51#include "llvm/IR/InstrTypes.h"
52#include "llvm/IR/Instruction.h"
55#include "llvm/IR/LLVMContext.h"
56#include "llvm/IR/MDBuilder.h"
57#include "llvm/IR/Module.h"
58#include "llvm/IR/PassManager.h"
60#include "llvm/IR/PseudoProbe.h"
67#include "llvm/Support/Debug.h"
71#include "llvm/Transforms/IPO.h"
81#include <algorithm>
82#include <cassert>
83#include <cstdint>
84#include <functional>
85#include <limits>
86#include <map>
87#include <memory>
88#include <queue>
89#include <string>
90#include <system_error>
91#include <utility>
92#include <vector>
93
94using namespace llvm;
95using namespace sampleprof;
96using namespace llvm::sampleprofutil;
98#define DEBUG_TYPE "sample-profile"
99#define CSINLINE_DEBUG DEBUG_TYPE "-inline"
100
101STATISTIC(NumCSInlined,
102 "Number of functions inlined with context sensitive profile");
103STATISTIC(NumCSNotInlined,
104 "Number of functions not inlined with context sensitive profile");
105STATISTIC(NumMismatchedProfile,
106 "Number of functions with CFG mismatched profile");
107STATISTIC(NumMatchedProfile, "Number of functions with CFG matched profile");
108STATISTIC(NumDuplicatedInlinesite,
109 "Number of inlined callsites with a partial distribution factor");
110
111STATISTIC(NumCSInlinedHitMinLimit,
112 "Number of functions with FDO inline stopped due to min size limit");
113STATISTIC(NumCSInlinedHitMaxLimit,
114 "Number of functions with FDO inline stopped due to max size limit");
116 NumCSInlinedHitGrowthLimit,
117 "Number of functions with FDO inline stopped due to growth size limit");
118
119// Command line option to specify the file to read samples from. This is
120// mainly used for debugging.
122 "sample-profile-file", cl::init(""), cl::value_desc("filename"),
123 cl::desc("Profile file loaded by -sample-profile"), cl::Hidden);
124
125// The named file contains a set of transformations that may have been applied
126// to the symbol names between the program from which the sample data was
127// collected and the current program's symbols.
129 "sample-profile-remapping-file", cl::init(""), cl::value_desc("filename"),
130 cl::desc("Profile remapping file loaded by -sample-profile"), cl::Hidden);
131
133 "salvage-stale-profile", cl::Hidden, cl::init(false),
134 cl::desc("Salvage stale profile by fuzzy matching and use the remapped "
135 "location for sample profile query."));
136
138 "report-profile-staleness", cl::Hidden, cl::init(false),
139 cl::desc("Compute and report stale profile statistical metrics."));
140
142 "persist-profile-staleness", cl::Hidden, cl::init(false),
143 cl::desc("Compute stale profile statistical metrics and write it into the "
144 "native object file(.llvm_stats section)."));
145
147 "profile-sample-accurate", cl::Hidden, cl::init(false),
148 cl::desc("If the sample profile is accurate, we will mark all un-sampled "
149 "callsite and function as having 0 samples. Otherwise, treat "
150 "un-sampled callsites and functions conservatively as unknown. "));
151
153 "profile-sample-block-accurate", cl::Hidden, cl::init(false),
154 cl::desc("If the sample profile is accurate, we will mark all un-sampled "
155 "branches and calls as having 0 samples. Otherwise, treat "
156 "them conservatively as unknown. "));
157
159 "profile-accurate-for-symsinlist", cl::Hidden, cl::init(true),
160 cl::desc("For symbols in profile symbol list, regard their profiles to "
161 "be accurate. It may be overriden by profile-sample-accurate. "));
162
164 "sample-profile-merge-inlinee", cl::Hidden, cl::init(true),
165 cl::desc("Merge past inlinee's profile to outline version if sample "
166 "profile loader decided not to inline a call site. It will "
167 "only be enabled when top-down order of profile loading is "
168 "enabled. "));
169
171 "sample-profile-top-down-load", cl::Hidden, cl::init(true),
172 cl::desc("Do profile annotation and inlining for functions in top-down "
173 "order of call graph during sample profile loading. It only "
174 "works for new pass manager. "));
175
176static cl::opt<bool>
177 UseProfiledCallGraph("use-profiled-call-graph", cl::init(true), cl::Hidden,
178 cl::desc("Process functions in a top-down order "
179 "defined by the profiled call graph when "
180 "-sample-profile-top-down-load is on."));
181
183 "sample-profile-inline-size", cl::Hidden, cl::init(false),
184 cl::desc("Inline cold call sites in profile loader if it's beneficial "
185 "for code size."));
186
187// Since profiles are consumed by many passes, turning on this option has
188// side effects. For instance, pre-link SCC inliner would see merged profiles
189// and inline the hot functions (that are skipped in this pass).
191 "disable-sample-loader-inlining", cl::Hidden, cl::init(false),
192 cl::desc("If true, artifically skip inline transformation in sample-loader "
193 "pass, and merge (or scale) profiles (as configured by "
194 "--sample-profile-merge-inlinee)."));
195
196namespace llvm {
198 SortProfiledSCC("sort-profiled-scc-member", cl::init(true), cl::Hidden,
199 cl::desc("Sort profiled recursion by edge weights."));
200
202 "sample-profile-inline-growth-limit", cl::Hidden, cl::init(12),
203 cl::desc("The size growth ratio limit for proirity-based sample profile "
204 "loader inlining."));
205
207 "sample-profile-inline-limit-min", cl::Hidden, cl::init(100),
208 cl::desc("The lower bound of size growth limit for "
209 "proirity-based sample profile loader inlining."));
210
212 "sample-profile-inline-limit-max", cl::Hidden, cl::init(10000),
213 cl::desc("The upper bound of size growth limit for "
214 "proirity-based sample profile loader inlining."));
215
217 "sample-profile-hot-inline-threshold", cl::Hidden, cl::init(3000),
218 cl::desc("Hot callsite threshold for proirity-based sample profile loader "
219 "inlining."));
220
222 "sample-profile-cold-inline-threshold", cl::Hidden, cl::init(45),
223 cl::desc("Threshold for inlining cold callsites"));
224} // namespace llvm
225
227 "sample-profile-icp-relative-hotness", cl::Hidden, cl::init(25),
228 cl::desc(
229 "Relative hotness percentage threshold for indirect "
230 "call promotion in proirity-based sample profile loader inlining."));
231
233 "sample-profile-icp-relative-hotness-skip", cl::Hidden, cl::init(1),
234 cl::desc(
235 "Skip relative hotness check for ICP up to given number of targets."));
236
238 "sample-profile-prioritized-inline", cl::Hidden,
239
240 cl::desc("Use call site prioritized inlining for sample profile loader."
241 "Currently only CSSPGO is supported."));
242
244 "sample-profile-use-preinliner", cl::Hidden,
245
246 cl::desc("Use the preinliner decisions stored in profile context."));
247
249 "sample-profile-recursive-inline", cl::Hidden,
250
251 cl::desc("Allow sample loader inliner to inline recursive calls."));
252
254 "sample-profile-inline-replay", cl::init(""), cl::value_desc("filename"),
255 cl::desc(
256 "Optimization remarks file containing inline remarks to be replayed "
257 "by inlining from sample profile loader."),
258 cl::Hidden);
259
261 "sample-profile-inline-replay-scope",
262 cl::init(ReplayInlinerSettings::Scope::Function),
263 cl::values(clEnumValN(ReplayInlinerSettings::Scope::Function, "Function",
264 "Replay on functions that have remarks associated "
265 "with them (default)"),
266 clEnumValN(ReplayInlinerSettings::Scope::Module, "Module",
267 "Replay on the entire module")),
268 cl::desc("Whether inline replay should be applied to the entire "
269 "Module or just the Functions (default) that are present as "
270 "callers in remarks during sample profile inlining."),
271 cl::Hidden);
272
274 "sample-profile-inline-replay-fallback",
275 cl::init(ReplayInlinerSettings::Fallback::Original),
278 ReplayInlinerSettings::Fallback::Original, "Original",
279 "All decisions not in replay send to original advisor (default)"),
280 clEnumValN(ReplayInlinerSettings::Fallback::AlwaysInline,
281 "AlwaysInline", "All decisions not in replay are inlined"),
282 clEnumValN(ReplayInlinerSettings::Fallback::NeverInline, "NeverInline",
283 "All decisions not in replay are not inlined")),
284 cl::desc("How sample profile inline replay treats sites that don't come "
285 "from the replay. Original: defers to original advisor, "
286 "AlwaysInline: inline all sites not in replay, NeverInline: "
287 "inline no sites not in replay"),
288 cl::Hidden);
289
291 "sample-profile-inline-replay-format",
292 cl::init(CallSiteFormat::Format::LineColumnDiscriminator),
294 clEnumValN(CallSiteFormat::Format::Line, "Line", "<Line Number>"),
295 clEnumValN(CallSiteFormat::Format::LineColumn, "LineColumn",
296 "<Line Number>:<Column Number>"),
297 clEnumValN(CallSiteFormat::Format::LineDiscriminator,
298 "LineDiscriminator", "<Line Number>.<Discriminator>"),
299 clEnumValN(CallSiteFormat::Format::LineColumnDiscriminator,
300 "LineColumnDiscriminator",
301 "<Line Number>:<Column Number>.<Discriminator> (default)")),
302 cl::desc("How sample profile inline replay file is formatted"), cl::Hidden);
303
305 MaxNumPromotions("sample-profile-icp-max-prom", cl::init(3), cl::Hidden,
306 cl::desc("Max number of promotions for a single indirect "
307 "call callsite in sample profile loader"));
308
310 "overwrite-existing-weights", cl::Hidden, cl::init(false),
311 cl::desc("Ignore existing branch weights on IR and always overwrite."));
312
314 "annotate-sample-profile-inline-phase", cl::Hidden, cl::init(false),
315 cl::desc("Annotate LTO phase (prelink / postlink), or main (no LTO) for "
316 "sample-profile inline pass name."));
317
318namespace llvm {
320}
321
322namespace {
323
324using BlockWeightMap = DenseMap<const BasicBlock *, uint64_t>;
325using EquivalenceClassMap = DenseMap<const BasicBlock *, const BasicBlock *>;
326using Edge = std::pair<const BasicBlock *, const BasicBlock *>;
327using EdgeWeightMap = DenseMap<Edge, uint64_t>;
328using BlockEdgeMap =
330
331class GUIDToFuncNameMapper {
332public:
333 GUIDToFuncNameMapper(Module &M, SampleProfileReader &Reader,
334 DenseMap<uint64_t, StringRef> &GUIDToFuncNameMap)
335 : CurrentReader(Reader), CurrentModule(M),
336 CurrentGUIDToFuncNameMap(GUIDToFuncNameMap) {
337 if (!CurrentReader.useMD5())
338 return;
339
340 for (const auto &F : CurrentModule) {
341 StringRef OrigName = F.getName();
342 CurrentGUIDToFuncNameMap.insert(
343 {Function::getGUID(OrigName), OrigName});
344
345 // Local to global var promotion used by optimization like thinlto
346 // will rename the var and add suffix like ".llvm.xxx" to the
347 // original local name. In sample profile, the suffixes of function
348 // names are all stripped. Since it is possible that the mapper is
349 // built in post-thin-link phase and var promotion has been done,
350 // we need to add the substring of function name without the suffix
351 // into the GUIDToFuncNameMap.
353 if (CanonName != OrigName)
354 CurrentGUIDToFuncNameMap.insert(
355 {Function::getGUID(CanonName), CanonName});
356 }
357
358 // Update GUIDToFuncNameMap for each function including inlinees.
359 SetGUIDToFuncNameMapForAll(&CurrentGUIDToFuncNameMap);
360 }
361
362 ~GUIDToFuncNameMapper() {
363 if (!CurrentReader.useMD5())
364 return;
365
366 CurrentGUIDToFuncNameMap.clear();
367
368 // Reset GUIDToFuncNameMap for of each function as they're no
369 // longer valid at this point.
370 SetGUIDToFuncNameMapForAll(nullptr);
371 }
372
373private:
374 void SetGUIDToFuncNameMapForAll(DenseMap<uint64_t, StringRef> *Map) {
375 std::queue<FunctionSamples *> FSToUpdate;
376 for (auto &IFS : CurrentReader.getProfiles()) {
377 FSToUpdate.push(&IFS.second);
378 }
379
380 while (!FSToUpdate.empty()) {
381 FunctionSamples *FS = FSToUpdate.front();
382 FSToUpdate.pop();
383 FS->GUIDToFuncNameMap = Map;
384 for (const auto &ICS : FS->getCallsiteSamples()) {
385 const FunctionSamplesMap &FSMap = ICS.second;
386 for (const auto &IFS : FSMap) {
387 FunctionSamples &FS = const_cast<FunctionSamples &>(IFS.second);
388 FSToUpdate.push(&FS);
389 }
390 }
391 }
392 }
393
395 Module &CurrentModule;
396 DenseMap<uint64_t, StringRef> &CurrentGUIDToFuncNameMap;
397};
398
399// Inline candidate used by iterative callsite prioritized inliner
400struct InlineCandidate {
401 CallBase *CallInstr;
402 const FunctionSamples *CalleeSamples;
403 // Prorated callsite count, which will be used to guide inlining. For example,
404 // if a callsite is duplicated in LTO prelink, then in LTO postlink the two
405 // copies will get their own distribution factors and their prorated counts
406 // will be used to decide if they should be inlined independently.
407 uint64_t CallsiteCount;
408 // Call site distribution factor to prorate the profile samples for a
409 // duplicated callsite. Default value is 1.0.
410 float CallsiteDistribution;
411};
412
413// Inline candidate comparer using call site weight
414struct CandidateComparer {
415 bool operator()(const InlineCandidate &LHS, const InlineCandidate &RHS) {
416 if (LHS.CallsiteCount != RHS.CallsiteCount)
417 return LHS.CallsiteCount < RHS.CallsiteCount;
418
419 const FunctionSamples *LCS = LHS.CalleeSamples;
420 const FunctionSamples *RCS = RHS.CalleeSamples;
421 assert(LCS && RCS && "Expect non-null FunctionSamples");
422
423 // Tie breaker using number of samples try to favor smaller functions first
424 if (LCS->getBodySamples().size() != RCS->getBodySamples().size())
425 return LCS->getBodySamples().size() > RCS->getBodySamples().size();
426
427 // Tie breaker using GUID so we have stable/deterministic inlining order
428 return LCS->getGUID() < RCS->getGUID();
429 }
430};
431
432using CandidateQueue =
434 CandidateComparer>;
435
436// Sample profile matching - fuzzy match.
437class SampleProfileMatcher {
438 Module &M;
439 SampleProfileReader &Reader;
440 const PseudoProbeManager *ProbeManager;
441 SampleProfileMap FlattenedProfiles;
442 // For each function, the matcher generates a map, of which each entry is a
443 // mapping from the source location of current build to the source location in
444 // the profile.
445 StringMap<LocToLocMap> FuncMappings;
446
447 // Match state for an anchor/callsite.
448 enum class MatchState {
449 Unknown = 0,
450 // Initial match between input profile and current IR.
451 InitialMatch = 1,
452 // Initial mismatch between input profile and current IR.
453 InitialMismatch = 2,
454 // InitialMatch stays matched after fuzzy profile matching.
455 UnchangedMatch = 3,
456 // InitialMismatch stays mismatched after fuzzy profile matching.
457 UnchangedMismatch = 4,
458 // InitialMismatch is recovered after fuzzy profile matching.
459 RecoveredMismatch = 5,
460 // InitialMatch is removed and becomes mismatched after fuzzy profile
461 // matching.
462 RemovedMatch = 6,
463 };
464
465 // For each function, store every callsite and its matching state into this
466 // map, of which each entry is a pair of callsite location and MatchState.
467 // This is used for profile staleness computation and report.
469 FuncCallsiteMatchStates;
470
471 // Profile mismatch statistics:
472 uint64_t TotalProfiledFunc = 0;
473 // Num of checksum-mismatched function.
474 uint64_t NumStaleProfileFunc = 0;
475 uint64_t TotalProfiledCallsites = 0;
476 uint64_t NumMismatchedCallsites = 0;
477 uint64_t NumRecoveredCallsites = 0;
478 // Total samples for all profiled functions.
479 uint64_t TotalFunctionSamples = 0;
480 // Total samples for all checksum-mismatched functions.
481 uint64_t MismatchedFunctionSamples = 0;
482 uint64_t MismatchedCallsiteSamples = 0;
483 uint64_t RecoveredCallsiteSamples = 0;
484
485 // A dummy name for unknown indirect callee, used to differentiate from a
486 // non-call instruction that also has an empty callee name.
487 static constexpr const char *UnknownIndirectCallee =
488 "unknown.indirect.callee";
489
490public:
491 SampleProfileMatcher(Module &M, SampleProfileReader &Reader,
492 const PseudoProbeManager *ProbeManager)
493 : M(M), Reader(Reader), ProbeManager(ProbeManager){};
494 void runOnModule();
495 void clearMatchingData() {
496 // Do not clear FuncMappings, it stores IRLoc to ProfLoc remappings which
497 // will be used for sample loader.
498 FuncCallsiteMatchStates.clear();
499 }
500
501private:
502 FunctionSamples *getFlattenedSamplesFor(const Function &F) {
504 auto It = FlattenedProfiles.find(FunctionId(CanonFName));
505 if (It != FlattenedProfiles.end())
506 return &It->second;
507 return nullptr;
508 }
509 void runOnFunction(const Function &F);
510 void findIRAnchors(const Function &F,
511 std::map<LineLocation, StringRef> &IRAnchors);
512 void findProfileAnchors(
513 const FunctionSamples &FS,
514 std::map<LineLocation, std::unordered_set<FunctionId>> &ProfileAnchors);
515 // Record the callsite match states for profile staleness report, the result
516 // is saved in FuncCallsiteMatchStates.
517 void recordCallsiteMatchStates(
518 const Function &F, const std::map<LineLocation, StringRef> &IRAnchors,
519 const std::map<LineLocation, std::unordered_set<FunctionId>>
520 &ProfileAnchors,
521 const LocToLocMap *IRToProfileLocationMap);
522
523 bool isMismatchState(const enum MatchState &State) {
524 return State == MatchState::InitialMismatch ||
525 State == MatchState::UnchangedMismatch ||
526 State == MatchState::RemovedMatch;
527 };
528
529 bool isInitialState(const enum MatchState &State) {
530 return State == MatchState::InitialMatch ||
531 State == MatchState::InitialMismatch;
532 };
533
534 bool isFinalState(const enum MatchState &State) {
535 return State == MatchState::UnchangedMatch ||
536 State == MatchState::UnchangedMismatch ||
537 State == MatchState::RecoveredMismatch ||
538 State == MatchState::RemovedMatch;
539 };
540
541 // Count the samples of checksum mismatched function for the top-level
542 // function and all inlinees.
543 void countMismatchedFuncSamples(const FunctionSamples &FS, bool IsTopLevel);
544 // Count the number of mismatched or recovered callsites.
545 void countMismatchCallsites(const FunctionSamples &FS);
546 // Count the samples of mismatched or recovered callsites for top-level
547 // function and all inlinees.
548 void countMismatchedCallsiteSamples(const FunctionSamples &FS);
549 void computeAndReportProfileStaleness();
550
551 LocToLocMap &getIRToProfileLocationMap(const Function &F) {
552 auto Ret = FuncMappings.try_emplace(
554 return Ret.first->second;
555 }
556 void distributeIRToProfileLocationMap();
557 void distributeIRToProfileLocationMap(FunctionSamples &FS);
558 void runStaleProfileMatching(
559 const Function &F, const std::map<LineLocation, StringRef> &IRAnchors,
560 const std::map<LineLocation, std::unordered_set<FunctionId>>
561 &ProfileAnchors,
562 LocToLocMap &IRToProfileLocationMap);
563 void reportOrPersistProfileStats();
564};
565
566/// Sample profile pass.
567///
568/// This pass reads profile data from the file specified by
569/// -sample-profile-file and annotates every affected function with the
570/// profile information found in that file.
571class SampleProfileLoader final : public SampleProfileLoaderBaseImpl<Function> {
572public:
573 SampleProfileLoader(
574 StringRef Name, StringRef RemapName, ThinOrFullLTOPhase LTOPhase,
576 std::function<AssumptionCache &(Function &)> GetAssumptionCache,
577 std::function<TargetTransformInfo &(Function &)> GetTargetTransformInfo,
578 std::function<const TargetLibraryInfo &(Function &)> GetTLI)
580 std::move(FS)),
581 GetAC(std::move(GetAssumptionCache)),
582 GetTTI(std::move(GetTargetTransformInfo)), GetTLI(std::move(GetTLI)),
583 LTOPhase(LTOPhase),
584 AnnotatedPassName(AnnotateSampleProfileInlinePhase
587 : CSINLINE_DEBUG) {}
588
589 bool doInitialization(Module &M, FunctionAnalysisManager *FAM = nullptr);
590 bool runOnModule(Module &M, ModuleAnalysisManager *AM,
592
593protected:
595 bool emitAnnotations(Function &F);
597 const FunctionSamples *findCalleeFunctionSamples(const CallBase &I) const;
598 const FunctionSamples *
599 findFunctionSamples(const Instruction &I) const override;
600 std::vector<const FunctionSamples *>
601 findIndirectCallFunctionSamples(const Instruction &I, uint64_t &Sum) const;
602 void findExternalInlineCandidate(CallBase *CB, const FunctionSamples *Samples,
603 DenseSet<GlobalValue::GUID> &InlinedGUIDs,
604 uint64_t Threshold);
605 // Attempt to promote indirect call and also inline the promoted call
606 bool tryPromoteAndInlineCandidate(
607 Function &F, InlineCandidate &Candidate, uint64_t SumOrigin,
608 uint64_t &Sum, SmallVector<CallBase *, 8> *InlinedCallSites = nullptr);
609
610 bool inlineHotFunctions(Function &F,
611 DenseSet<GlobalValue::GUID> &InlinedGUIDs);
612 std::optional<InlineCost> getExternalInlineAdvisorCost(CallBase &CB);
613 bool getExternalInlineAdvisorShouldInline(CallBase &CB);
614 InlineCost shouldInlineCandidate(InlineCandidate &Candidate);
615 bool getInlineCandidate(InlineCandidate *NewCandidate, CallBase *CB);
616 bool
617 tryInlineCandidate(InlineCandidate &Candidate,
618 SmallVector<CallBase *, 8> *InlinedCallSites = nullptr);
619 bool
620 inlineHotFunctionsWithPriority(Function &F,
621 DenseSet<GlobalValue::GUID> &InlinedGUIDs);
622 // Inline cold/small functions in addition to hot ones
623 bool shouldInlineColdCallee(CallBase &CallInst);
624 void emitOptimizationRemarksForInlineCandidates(
625 const SmallVectorImpl<CallBase *> &Candidates, const Function &F,
626 bool Hot);
627 void promoteMergeNotInlinedContextSamples(
629 const Function &F);
630 std::vector<Function *> buildFunctionOrder(Module &M, LazyCallGraph &CG);
631 std::unique_ptr<ProfiledCallGraph> buildProfiledCallGraph(Module &M);
632 void generateMDProfMetadata(Function &F);
633
634 /// Map from function name to Function *. Used to find the function from
635 /// the function name. If the function name contains suffix, additional
636 /// entry is added to map from the stripped name to the function if there
637 /// is one-to-one mapping.
639
640 std::function<AssumptionCache &(Function &)> GetAC;
641 std::function<TargetTransformInfo &(Function &)> GetTTI;
642 std::function<const TargetLibraryInfo &(Function &)> GetTLI;
643
644 /// Profile tracker for different context.
645 std::unique_ptr<SampleContextTracker> ContextTracker;
646
647 /// Flag indicating which LTO/ThinLTO phase the pass is invoked in.
648 ///
649 /// We need to know the LTO phase because for example in ThinLTOPrelink
650 /// phase, in annotation, we should not promote indirect calls. Instead,
651 /// we will mark GUIDs that needs to be annotated to the function.
652 const ThinOrFullLTOPhase LTOPhase;
653 const std::string AnnotatedPassName;
654
655 /// Profile symbol list tells whether a function name appears in the binary
656 /// used to generate the current profile.
657 std::unique_ptr<ProfileSymbolList> PSL;
658
659 /// Total number of samples collected in this profile.
660 ///
661 /// This is the sum of all the samples collected in all the functions executed
662 /// at runtime.
663 uint64_t TotalCollectedSamples = 0;
664
665 // Information recorded when we declined to inline a call site
666 // because we have determined it is too cold is accumulated for
667 // each callee function. Initially this is just the entry count.
668 struct NotInlinedProfileInfo {
669 uint64_t entryCount;
670 };
672
673 // GUIDToFuncNameMap saves the mapping from GUID to the symbol name, for
674 // all the function symbols defined or declared in current module.
675 DenseMap<uint64_t, StringRef> GUIDToFuncNameMap;
676
677 // All the Names used in FunctionSamples including outline function
678 // names, inline instance names and call target names.
679 StringSet<> NamesInProfile;
680 // MD5 version of NamesInProfile. Either NamesInProfile or GUIDsInProfile is
681 // populated, depends on whether the profile uses MD5. Because the name table
682 // generally contains several magnitude more entries than the number of
683 // functions, we do not want to convert all names from one form to another.
684 llvm::DenseSet<uint64_t> GUIDsInProfile;
685
686 // For symbols in the profile symbol list, whether to regard their profiles
687 // to be accurate. It is mainly decided by the existence of a profile symbol
688 // list and the -profile-accurate-for-symsinlist flag, but it can be
689 // overridden by -profile-sample-accurate or the profile-sample-accurate
690 // attribute.
691 bool ProfAccForSymsInList;
692
693 // External inline advisor used to replay inline decision from remarks.
694 std::unique_ptr<InlineAdvisor> ExternalInlineAdvisor;
695
696 // A helper to implement the sample profile matching algorithm.
697 std::unique_ptr<SampleProfileMatcher> MatchingManager;
698
699private:
700 const char *getAnnotatedRemarkPassName() const {
701 return AnnotatedPassName.c_str();
702 }
703};
704} // end anonymous namespace
705
706namespace llvm {
707template <>
709 return succ_empty(BB);
710}
711
712template <>
714 const std::vector<const BasicBlockT *> &BasicBlocks,
715 BlockEdgeMap &Successors, FlowFunction &Func) {
716 for (auto &Jump : Func.Jumps) {
717 const auto *BB = BasicBlocks[Jump.Source];
718 const auto *Succ = BasicBlocks[Jump.Target];
719 const Instruction *TI = BB->getTerminator();
720 // Check if a block ends with InvokeInst and mark non-taken branch unlikely.
721 // In that case block Succ should be a landing pad
722 if (Successors[BB].size() == 2 && Successors[BB].back() == Succ) {
723 if (isa<InvokeInst>(TI)) {
724 Jump.IsUnlikely = true;
725 }
726 }
727 const Instruction *SuccTI = Succ->getTerminator();
728 // Check if the target block contains UnreachableInst and mark it unlikely
729 if (SuccTI->getNumSuccessors() == 0) {
730 if (isa<UnreachableInst>(SuccTI)) {
731 Jump.IsUnlikely = true;
732 }
733 }
734 }
735}
736
737template <>
739 Function &F) {
740 DT.reset(new DominatorTree);
741 DT->recalculate(F);
742
743 PDT.reset(new PostDominatorTree(F));
744
745 LI.reset(new LoopInfo);
746 LI->analyze(*DT);
747}
748} // namespace llvm
749
750static bool skipProfileForFunction(const Function &F) {
751 return F.isDeclaration() || !F.hasFnAttribute("use-sample-profile");
752}
753
754ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) {
756 return getProbeWeight(Inst);
757
758 const DebugLoc &DLoc = Inst.getDebugLoc();
759 if (!DLoc)
760 return std::error_code();
761
762 // Ignore all intrinsics, phinodes and branch instructions.
763 // Branch and phinodes instruction usually contains debug info from sources
764 // outside of the residing basic block, thus we ignore them during annotation.
765 if (isa<BranchInst>(Inst) || isa<IntrinsicInst>(Inst) || isa<PHINode>(Inst))
766 return std::error_code();
767
768 // For non-CS profile, if a direct call/invoke instruction is inlined in
769 // profile (findCalleeFunctionSamples returns non-empty result), but not
770 // inlined here, it means that the inlined callsite has no sample, thus the
771 // call instruction should have 0 count.
772 // For CS profile, the callsite count of previously inlined callees is
773 // populated with the entry count of the callees.
775 if (const auto *CB = dyn_cast<CallBase>(&Inst))
776 if (!CB->isIndirectCall() && findCalleeFunctionSamples(*CB))
777 return 0;
778
779 return getInstWeightImpl(Inst);
780}
781
782/// Get the FunctionSamples for a call instruction.
783///
784/// The FunctionSamples of a call/invoke instruction \p Inst is the inlined
785/// instance in which that call instruction is calling to. It contains
786/// all samples that resides in the inlined instance. We first find the
787/// inlined instance in which the call instruction is from, then we
788/// traverse its children to find the callsite with the matching
789/// location.
790///
791/// \param Inst Call/Invoke instruction to query.
792///
793/// \returns The FunctionSamples pointer to the inlined instance.
794const FunctionSamples *
795SampleProfileLoader::findCalleeFunctionSamples(const CallBase &Inst) const {
796 const DILocation *DIL = Inst.getDebugLoc();
797 if (!DIL) {
798 return nullptr;
799 }
800
801 StringRef CalleeName;
802 if (Function *Callee = Inst.getCalledFunction())
803 CalleeName = Callee->getName();
804
806 return ContextTracker->getCalleeContextSamplesFor(Inst, CalleeName);
807
808 const FunctionSamples *FS = findFunctionSamples(Inst);
809 if (FS == nullptr)
810 return nullptr;
811
812 return FS->findFunctionSamplesAt(FunctionSamples::getCallSiteIdentifier(DIL),
813 CalleeName, Reader->getRemapper());
814}
815
816/// Returns a vector of FunctionSamples that are the indirect call targets
817/// of \p Inst. The vector is sorted by the total number of samples. Stores
818/// the total call count of the indirect call in \p Sum.
819std::vector<const FunctionSamples *>
820SampleProfileLoader::findIndirectCallFunctionSamples(
821 const Instruction &Inst, uint64_t &Sum) const {
822 const DILocation *DIL = Inst.getDebugLoc();
823 std::vector<const FunctionSamples *> R;
824
825 if (!DIL) {
826 return R;
827 }
828
829 auto FSCompare = [](const FunctionSamples *L, const FunctionSamples *R) {
830 assert(L && R && "Expect non-null FunctionSamples");
831 if (L->getHeadSamplesEstimate() != R->getHeadSamplesEstimate())
832 return L->getHeadSamplesEstimate() > R->getHeadSamplesEstimate();
833 return L->getGUID() < R->getGUID();
834 };
835
837 auto CalleeSamples =
838 ContextTracker->getIndirectCalleeContextSamplesFor(DIL);
839 if (CalleeSamples.empty())
840 return R;
841
842 // For CSSPGO, we only use target context profile's entry count
843 // as that already includes both inlined callee and non-inlined ones.
844 Sum = 0;
845 for (const auto *const FS : CalleeSamples) {
846 Sum += FS->getHeadSamplesEstimate();
847 R.push_back(FS);
848 }
849 llvm::sort(R, FSCompare);
850 return R;
851 }
852
853 const FunctionSamples *FS = findFunctionSamples(Inst);
854 if (FS == nullptr)
855 return R;
856
858 Sum = 0;
859 if (auto T = FS->findCallTargetMapAt(CallSite))
860 for (const auto &T_C : *T)
861 Sum += T_C.second;
862 if (const FunctionSamplesMap *M = FS->findFunctionSamplesMapAt(CallSite)) {
863 if (M->empty())
864 return R;
865 for (const auto &NameFS : *M) {
866 Sum += NameFS.second.getHeadSamplesEstimate();
867 R.push_back(&NameFS.second);
868 }
869 llvm::sort(R, FSCompare);
870 }
871 return R;
872}
873
// Returns the FunctionSamples that \p Inst belongs to, memoizing the lookup
// per DILocation in DILocation2SampleMap.
874const FunctionSamples *
875SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {
// NOTE(review): listing line 876 is missing and the `}` at 880 has no visible
// opener; a conditional guarding the probe check presumably sits there --
// confirm upstream. Within that block, an instruction without a pseudo probe
// yields nullptr.
877 std::optional<PseudoProbe> Probe = extractProbe(Inst);
878 if (!Probe)
879 return nullptr;
880 }
881
// Without a debug location, fall back to the member `Samples`.
882 const DILocation *DIL = Inst.getDebugLoc();
883 if (!DIL)
884 return Samples;
885
// try_emplace inserts a nullptr placeholder only if DIL is not cached yet;
// it.second is true exactly when the entry was newly inserted and still has
// to be computed.
886 auto it = DILocation2SampleMap.try_emplace(DIL,nullptr);
887 if (it.second) {
// NOTE(review): listing line 888 is missing; the `else` at 890 implies an
// `if` choosing between the ContextTracker lookup and the plain profile
// lookup -- confirm upstream.
889 it.first->second = ContextTracker->getContextSamplesFor(DIL);
890 else
891 it.first->second =
892 Samples->findFunctionSamples(DIL, Reader->getRemapper());
893 }
894 return it.first->second;
895}
896
897/// Check whether the indirect call promotion history of \p Inst allows
898/// the promotion for \p Candidate.
899/// If the profile count for the promotion candidate \p Candidate is
900/// NOMORE_ICP_MAGICNUM, it means \p Candidate has already been promoted
901/// for \p Inst. If we already have at least MaxNumPromotions
902/// NOMORE_ICP_MAGICNUM count values in the value profile of \p Inst, we
903/// cannot promote for \p Inst anymore.
904static bool doesHistoryAllowICP(const Instruction &Inst, StringRef Candidate) {
905 uint32_t NumVals = 0;
906 uint64_t TotalCount = 0;
907 std::unique_ptr<InstrProfValueData[]> ValueData =
908 std::make_unique<InstrProfValueData[]>(MaxNumPromotions);
909 bool Valid =
910 getValueProfDataFromInst(Inst, IPVK_IndirectCallTarget, MaxNumPromotions,
911 ValueData.get(), NumVals, TotalCount, true);
912 // No valid value profile so no promoted targets have been recorded
913 // before. Ok to do ICP.
914 if (!Valid)
915 return true;
916
917 unsigned NumPromoted = 0;
918 for (uint32_t I = 0; I < NumVals; I++) {
919 if (ValueData[I].Count != NOMORE_ICP_MAGICNUM)
920 continue;
921
922 // If the promotion candidate has NOMORE_ICP_MAGICNUM count in the
923 // metadata, it means the candidate has been promoted for this
924 // indirect call.
925 if (ValueData[I].Value == Function::getGUID(Candidate))
926 return false;
927 NumPromoted++;
928 // If already have MaxNumPromotions promotion, don't do it anymore.
929 if (NumPromoted == MaxNumPromotions)
930 return false;
931 }
932 return true;
933}
934
935/// Update indirect call target profile metadata for \p Inst.
936/// Usually \p Sum is the sum of counts of all the targets for \p Inst.
937/// If it is 0, it means updateIDTMetaData is used to mark a
938/// certain target to be promoted already. If it is not zero,
939/// we expect to use it to update the total count in the value profile.
940static void
// NOTE(review): listing line 941 is missing; it presumably carries the function
// name and the leading `Inst` parameter used throughout the body -- confirm
// upstream.
942 const SmallVectorImpl<InstrProfValueData> &CallTargets,
943 uint64_t Sum) {
944 // Bail out early if MaxNumPromotions is zero.
945 // This prevents allocating an array of zero length below.
946 //
947 // Note `updateIDTMetaData` is called in two places so check
948 // `MaxNumPromotions` inside it.
949 if (MaxNumPromotions == 0)
950 return;
951 uint32_t NumVals = 0;
952 // OldSum is the existing total count in the value profile data.
953 uint64_t OldSum = 0;
954 std::unique_ptr<InstrProfValueData[]> ValueData =
955 std::make_unique<InstrProfValueData[]>(MaxNumPromotions);
956 bool Valid =
957 getValueProfDataFromInst(Inst, IPVK_IndirectCallTarget, MaxNumPromotions,
958 ValueData.get(), NumVals, OldSum, true);
959
// ValueCountMap maps a target value to the count that will be written back.
960 DenseMap<uint64_t, uint64_t> ValueCountMap;
961 if (Sum == 0) {
// "Mark as promoted" mode: exactly one target carrying the magic count.
962 assert((CallTargets.size() == 1 &&
963 CallTargets[0].Count == NOMORE_ICP_MAGICNUM) &&
964 "If sum is 0, assume only one element in CallTargets "
965 "with count being NOMORE_ICP_MAGICNUM");
966 // Initialize ValueCountMap with existing value profile data.
967 if (Valid) {
968 for (uint32_t I = 0; I < NumVals; I++)
969 ValueCountMap[ValueData[I].Value] = ValueData[I].Count;
970 }
971 auto Pair =
972 ValueCountMap.try_emplace(CallTargets[0].Value, CallTargets[0].Count);
973 // If the target already exists in value profile, decrease the total
974 // count OldSum and reset the target's count to NOMORE_ICP_MAGICNUM.
975 if (!Pair.second) {
976 OldSum -= Pair.first->second;
977 Pair.first->second = NOMORE_ICP_MAGICNUM;
978 }
// The total written back is the (possibly reduced) pre-existing total.
979 Sum = OldSum;
980 } else {
981 // Initialize ValueCountMap with existing NOMORE_ICP_MAGICNUM
982 // counts in the value profile.
983 if (Valid) {
984 for (uint32_t I = 0; I < NumVals; I++) {
985 if (ValueData[I].Count == NOMORE_ICP_MAGICNUM)
986 ValueCountMap[ValueData[I].Value] = ValueData[I].Count;
987 }
988 }
989
// Add the new targets; try_emplace leaves an existing (already promoted)
// entry untouched, in which case its count is removed from Sum instead.
990 for (const auto &Data : CallTargets) {
991 auto Pair = ValueCountMap.try_emplace(Data.Value, Data.Count);
992 if (Pair.second)
993 continue;
994 // The target represented by Data.Value has already been promoted.
995 // Keep the count as NOMORE_ICP_MAGICNUM in the profile and decrease
996 // Sum by Data.Count.
997 assert(Sum >= Data.Count && "Sum should never be less than Data.Count");
998 Sum -= Data.Count;
999 }
1000 }
1001
// NOTE(review): listing line 1002 is missing; `NewCallTargets` is presumably
// declared there as a container of InstrProfValueData -- confirm upstream.
1003 for (const auto &ValueCount : ValueCountMap) {
1004 NewCallTargets.emplace_back(
1005 InstrProfValueData{ValueCount.first, ValueCount.second});
1006 }
1007
// Order by count descending, ties by value descending, so the output does not
// depend on the map's iteration order.
1008 llvm::sort(NewCallTargets,
1009 [](const InstrProfValueData &L, const InstrProfValueData &R) {
1010 if (L.Count != R.Count)
1011 return L.Count > R.Count;
1012 return L.Value > R.Value;
1013 });
1014
// At most MaxNumPromotions entries are passed on for annotation.
1015 uint32_t MaxMDCount =
1016 std::min(NewCallTargets.size(), static_cast<size_t>(MaxNumPromotions));
1017 annotateValueSite(*Inst.getParent()->getParent()->getParent(), Inst,
1018 NewCallTargets, Sum, IPVK_IndirectCallTarget, MaxMDCount);
1019}
1020
1021/// Attempt to promote indirect call and also inline the promoted call.
1022///
1023/// \param F Caller function.
1024/// \param Candidate ICP and inline candidate.
1025/// \param SumOrigin Original sum of target counts for indirect call before
1026/// promoting given candidate.
1027/// \param Sum Prorated sum of remaining target counts for indirect call
1028/// after promoting given candidate.
1029/// \param InlinedCallSite Output vector for new call sites exposed after
1030/// inlining.
1031bool SampleProfileLoader::tryPromoteAndInlineCandidate(
1032 Function &F, InlineCandidate &Candidate, uint64_t SumOrigin, uint64_t &Sum,
1033 SmallVector<CallBase *, 8> *InlinedCallSite) {
1034 // Bail out early if sample-loader inliner is disabled.
// NOTE(review): listing line 1035 is missing; the condition guarding the
// `return false` below presumably sits there -- confirm upstream.
1036 return false;
1037
1038 // Bail out early if MaxNumPromotions is zero.
1039 // This prevents allocating an array of zero length in callees below.
1040 if (MaxNumPromotions == 0)
1041 return false;
// Resolve the callee named by the candidate's profile to a function in
// SymbolMap; give up if there is no (non-null) entry.
1042 auto CalleeFunctionName = Candidate.CalleeSamples->getFunction();
1043 auto R = SymbolMap.find(CalleeFunctionName);
1044 if (R == SymbolMap.end() || !R->second)
1045 return false;
1046
1047 auto &CI = *Candidate.CallInstr;
1048 if (!doesHistoryAllowICP(CI, R->second->getName()))
1049 return false;
1050
1051 const char *Reason = "Callee function not available";
1052 // R->second != &F is to prevent promoting a recursive call.
1053 // If it is a recursive call, we do not inline it as it could bloat
1054 // the code exponentially. There is way to better handle this, e.g.
1055 // clone the caller first, and inline the cloned caller if it is
1056 // recursive. As llvm does not inline recursive calls, we will
1057 // simply ignore it instead of handling it explicitly.
1058 if (!R->second->isDeclaration() && R->second->getSubprogram() &&
1059 R->second->hasFnAttribute("use-sample-profile") &&
1060 R->second != &F && isLegalToPromote(CI, R->second, &Reason)) {
1061 // For promoted target, set its value with NOMORE_ICP_MAGICNUM count
1062 // in the value profile metadata so the target won't be promoted again.
1063 SmallVector<InstrProfValueData, 1> SortedCallTargets = {InstrProfValueData{
1064 Function::getGUID(R->second->getName()), NOMORE_ICP_MAGICNUM}};
1065 updateIDTMetaData(CI, SortedCallTargets, 0);
1066
// NOTE(review): DI is the address of promoteIndirectCall's result, so the
// `if (DI)` test below presumably always succeeds -- confirm.
1067 auto *DI = &pgo::promoteIndirectCall(
1068 CI, R->second, Candidate.CallsiteCount, Sum, false, ORE);
1069 if (DI) {
1070 Sum -= Candidate.CallsiteCount;
1071 // Do not prorate the indirect callsite distribution since the original
1072 // distribution will be used to scale down non-promoted profile target
1073 // counts later. By doing this we lose track of the real callsite count
1074 // for the leftover indirect callsite as a trade off for accurate call
1075 // target counts.
1076 // TODO: Ideally we would have two separate factors, one for call site
1077 // counts and one is used to prorate call target counts.
1078 // Do not update the promoted direct callsite distribution at this
1079 // point since the original distribution combined with the callee profile
1080 // will be used to prorate callsites from the callee if inlined. Once not
1081 // inlined, the direct callsite distribution should be prorated so that
1082 // it will reflect the real callsite counts.
// From here on the candidate refers to the promoted direct call.
1083 Candidate.CallInstr = DI;
1084 if (isa<CallInst>(DI) || isa<InvokeInst>(DI)) {
1085 bool Inlined = tryInlineCandidate(Candidate, InlinedCallSite);
1086 if (!Inlined) {
1087 // Prorate the direct callsite distribution so that it reflects real
1088 // callsite counts.
// NOTE(review): listing line 1089 is missing; the call whose arguments
// follow presumably starts there -- confirm upstream.
1090 *DI, static_cast<float>(Candidate.CallsiteCount) / SumOrigin);
1091 }
1092 return Inlined;
1093 }
1094 }
1095 } else {
1096 LLVM_DEBUG(dbgs() << "\nFailed to promote indirect call to "
// NOTE(review): listing line 1097 is missing; the opening of the expression
// closed by `getName())` below presumably sits there -- confirm upstream.
1098 Candidate.CallInstr->getName())<< " because "
1099 << Reason << "\n");
1100 }
1101 return false;
1102}
1103
// Decides whether the callee of \p CallInst may be inlined for size even
// though the call site is not hot. Only active when ProfileSizeInline is set.
1104bool SampleProfileLoader::shouldInlineColdCallee(CallBase &CallInst) {
1105 if (!ProfileSizeInline)
1106 return false;
1107
// NOTE(review): listing line 1108 is missing; `Callee` is presumably obtained
// from CallInst there -- confirm upstream.
1109 if (Callee == nullptr)
1110 return false;
1111
// NOTE(review): listing line 1112 is missing; `Cost` is presumably computed
// there by a call whose trailing arguments are on the next line -- confirm
// upstream.
1113 GetAC, GetTLI);
1114
// Explicit never/always verdicts take precedence over the threshold check.
1115 if (Cost.isNever())
1116 return false;
1117
1118 if (Cost.isAlways())
1119 return true;
1120
1121 return Cost.getCost() <= SampleColdCallSiteThreshold;
1122}
1123
1124void SampleProfileLoader::emitOptimizationRemarksForInlineCandidates(
1125 const SmallVectorImpl<CallBase *> &Candidates, const Function &F,
1126 bool Hot) {
1127 for (auto *I : Candidates) {
1128 Function *CalledFunction = I->getCalledFunction();
1129 if (CalledFunction) {
1130 ORE->emit(OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(),
1131 "InlineAttempt", I->getDebugLoc(),
1132 I->getParent())
1133 << "previous inlining reattempted for "
1134 << (Hot ? "hotness: '" : "size: '")
1135 << ore::NV("Callee", CalledFunction) << "' into '"
1136 << ore::NV("Caller", &F) << "'");
1137 }
1138 }
1139}
1140
// Adds to \p InlinedGUIDs the GUIDs of functions that are not defined in this
// module (absent from SymbolMap or only declared) and that the profile rooted
// at \p Samples suggests are worth importing, using \p Threshold as the
// hotness cut-off.
1141void SampleProfileLoader::findExternalInlineCandidate(
1142 CallBase *CB, const FunctionSamples *Samples,
1143 DenseSet<GlobalValue::GUID> &InlinedGUIDs, uint64_t Threshold) {
1144
1145 // If ExternalInlineAdvisor(ReplayInlineAdvisor) wants to inline an external
1146 // function make sure it's imported
1147 if (CB && getExternalInlineAdvisorShouldInline(*CB)) {
1148 // Samples may not exist for replayed function, if so
1149 // just add the direct GUID and move on
// NOTE(review): CB->getCalledFunction() is dereferenced without a null check;
// confirm this path is never reached for a call with no statically known
// callee.
1150 if (!Samples) {
1151 InlinedGUIDs.insert(
1152 Function::getGUID(CB->getCalledFunction()->getName()));
1153 return;
1154 }
1155 // Otherwise, drop the threshold to import everything that we can
1156 Threshold = 0;
1157 }
1158
1159 // In some rare cases, call instruction could be changed after being pushed
1160 // into inline candidate queue, this is because earlier inlining may expose
1161 // constant propagation which can change indirect call to direct call. When
1162 // this happens, we may fail to find matching function samples for the
1163 // candidate later, even if a match was found when the candidate was enqueued.
1164 if (!Samples)
1165 return;
1166
1167 // For AutoFDO profile, retrieve candidate profiles by walking over
1168 // the nested inlinee profiles.
// NOTE(review): listing line 1169 is missing and the `}` at 1172 has no
// visible opener; a conditional selecting the non-context-sensitive path
// presumably sits there -- confirm upstream.
1170 Samples->findInlinedFunctions(InlinedGUIDs, SymbolMap, Threshold);
1171 return;
1172 }
1173
// Breadth-first walk over the context trie, starting at the node for Samples.
1174 ContextTrieNode *Caller = ContextTracker->getContextNodeForProfile(Samples);
1175 std::queue<ContextTrieNode *> CalleeList;
1176 CalleeList.push(Caller);
1177 while (!CalleeList.empty()) {
1178 ContextTrieNode *Node = CalleeList.front();
1179 CalleeList.pop();
1180 FunctionSamples *CalleeSample = Node->getFunctionSamples();
1181 // For CSSPGO profile, retrieve candidate profile by walking over the
1182 // trie built for context profile. Note that also take call targets
1183 // even if callee doesn't have a corresponding context profile.
// A node without samples is skipped, and so are its children, because the
// push loop at the bottom of the body is not reached.
1184 if (!CalleeSample)
1185 continue;
1186
1187 // If pre-inliner decision is used, honor that for importing as well.
// NOTE(review): listing lines 1189-1190 are missing; the initializer of
// `PreInline` presumably sits there -- confirm upstream.
1188 bool PreInline =
1191 if (!PreInline && CalleeSample->getHeadSamplesEstimate() < Threshold)
1192 continue;
1193
1194 Function *Func = SymbolMap.lookup(CalleeSample->getFunction());
1195 // Add to the import list only when it's defined out of module.
1196 if (!Func || Func->isDeclaration())
1197 InlinedGUIDs.insert(CalleeSample->getGUID());
1198
1199 // Import hot CallTargets, which may not be available in IR because full
1200 // profile annotation cannot be done until backend compilation in ThinLTO.
1201 for (const auto &BS : CalleeSample->getBodySamples())
1202 for (const auto &TS : BS.second.getCallTargets())
1203 if (TS.second > Threshold) {
1204 const Function *Callee = SymbolMap.lookup(TS.first);
1205 if (!Callee || Callee->isDeclaration())
1206 InlinedGUIDs.insert(TS.first.getHashCode());
1207 }
1208
1209 // Import hot child context profile associated with callees. Note that this
1210 // may have some overlap with the call target loop above, but doing this
1211 // based child context profile again effectively allow us to use the max of
1212 // entry count and call target count to determine importing.
1213 for (auto &Child : Node->getAllChildContext()) {
1214 ContextTrieNode *CalleeNode = &Child.second;
1215 CalleeList.push(CalleeNode);
1216 }
1217 }
1218}
1219
1220/// Iteratively inline hot callsites of a function.
1221///
1222/// Iteratively traverse all callsites of the function \p F, so as to
1223/// find out callsites with corresponding inline instances.
1224///
1225/// For such callsites,
1226/// - If it is hot enough, inline the callsites and adds callsites of the callee
1227/// into the caller. If the call is an indirect call, first promote
1228/// it to direct call. Each indirect call is limited with a single target.
1229///
1230/// - If a callsite is not inlined, merge its profile to the outline
1231/// version (if --sample-profile-merge-inlinee is true), or scale the
1232/// counters of standalone function based on the profile of inlined
1233/// instances (if --sample-profile-merge-inlinee is false).
1234///
1235/// Later passes may consume the updated profiles.
1236///
1237/// \param F function to perform iterative inlining.
1238/// \param InlinedGUIDs a set to be updated to include all GUIDs that are
1239/// inlined in the profiled binary.
1240///
1241/// \returns True if there is any inline happened.
1242bool SampleProfileLoader::inlineHotFunctions(
1243 Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
1244 // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure
1245 // Profile symbol list is ignored when profile-sample-accurate is on.
// NOTE(review): listing line 1247 is missing; the first operand of the inner
// parenthesised condition presumably sits there -- confirm upstream.
1246 assert((!ProfAccForSymsInList ||
1248 !F.hasFnAttribute("profile-sample-accurate"))) &&
1249 "ProfAccForSymsInList should be false when profile-sample-accurate "
1250 "is enabled");
1251
// Call sites that had a profile but were not inlined; consumed at the end.
1252 MapVector<CallBase *, const FunctionSamples *> LocalNotInlinedCallSites;
1253 bool Changed = false;
1254 bool LocalChanged = true;
// Repeat until a full sweep over F inlines nothing new.
1255 while (LocalChanged) {
1256 LocalChanged = false;
// NOTE(review): listing line 1257 is missing; `CIS`, the per-sweep list of
// call sites to attempt, is presumably declared there -- confirm upstream.
1258 for (auto &BB : F) {
1259 bool Hot = false;
1260 SmallVector<CallBase *, 10> AllCandidates;
1261 SmallVector<CallBase *, 10> ColdCandidates;
1262 for (auto &I : BB) {
1263 const FunctionSamples *FS = nullptr;
1264 if (auto *CB = dyn_cast<CallBase>(&I)) {
1265 if (!isa<IntrinsicInst>(I)) {
1266 if ((FS = findCalleeFunctionSamples(*CB))) {
1267 assert((!FunctionSamples::UseMD5 || FS->GUIDToFuncNameMap) &&
1268 "GUIDToFuncNameMap has to be populated");
1269 AllCandidates.push_back(CB);
// NOTE(review): listing line 1271 is missing; the second operand of the
// `||` below presumably sits there -- confirm upstream.
1270 if (FS->getHeadSamplesEstimate() > 0 ||
1272 LocalNotInlinedCallSites.insert({CB, FS});
1273 if (callsiteIsHot(FS, PSI, ProfAccForSymsInList))
1274 Hot = true;
1275 else if (shouldInlineColdCallee(*CB))
1276 ColdCandidates.push_back(CB);
1277 } else if (getExternalInlineAdvisorShouldInline(*CB)) {
1278 AllCandidates.push_back(CB);
1279 }
1280 }
1281 }
1282 }
// One hot call site in the block (or an external advisor) makes every
// profiled call in the block a candidate; otherwise only the cold calls that
// passed shouldInlineColdCallee are attempted.
1283 if (Hot || ExternalInlineAdvisor) {
1284 CIS.insert(CIS.begin(), AllCandidates.begin(), AllCandidates.end());
1285 emitOptimizationRemarksForInlineCandidates(AllCandidates, F, true);
1286 } else {
1287 CIS.insert(CIS.begin(), ColdCandidates.begin(), ColdCandidates.end());
1288 emitOptimizationRemarksForInlineCandidates(ColdCandidates, F, false);
1289 }
1290 }
1291 for (CallBase *I : CIS) {
1292 Function *CalledFunction = I->getCalledFunction();
1293 InlineCandidate Candidate = {I, LocalNotInlinedCallSites.lookup(I),
1294 0 /* dummy count */,
1295 1.0 /* dummy distribution factor */};
1296 // Do not inline recursive calls.
1297 if (CalledFunction == &F)
1298 continue;
1299 if (I->isIndirectCall()) {
// Sum is filled in by findIndirectCallFunctionSamples whenever it returns a
// non-empty result, which is the only case in which the loop body reads it.
1300 uint64_t Sum;
1301 for (const auto *FS : findIndirectCallFunctionSamples(*I, Sum)) {
1302 uint64_t SumOrigin = Sum;
// In the ThinLTO pre-link phase only import candidates are recorded; no
// promotion or inlining is attempted for indirect calls.
1303 if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
1304 findExternalInlineCandidate(I, FS, InlinedGUIDs,
1305 PSI->getOrCompHotCountThreshold());
1306 continue;
1307 }
1308 if (!callsiteIsHot(FS, PSI, ProfAccForSymsInList))
1309 continue;
1310
1311 Candidate = {I, FS, FS->getHeadSamplesEstimate(), 1.0};
1312 if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum)) {
1313 LocalNotInlinedCallSites.erase(I);
1314 LocalChanged = true;
1315 }
1316 }
1317 } else if (CalledFunction && CalledFunction->getSubprogram() &&
1318 !CalledFunction->isDeclaration()) {
1319 if (tryInlineCandidate(Candidate)) {
1320 LocalNotInlinedCallSites.erase(I);
1321 LocalChanged = true;
1322 }
1323 } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
1324 findExternalInlineCandidate(I, findCalleeFunctionSamples(*I),
1325 InlinedGUIDs,
1326 PSI->getOrCompHotCountThreshold());
1327 }
1328 }
1329 Changed |= LocalChanged;
1330 }
1331
1332 // For CS profile, profile for not inlined context will be merged when
1333 // base profile is being retrieved.
// NOTE(review): listing line 1334 is missing; a condition guarding the call
// below presumably sits there -- confirm upstream.
1335 promoteMergeNotInlinedContextSamples(LocalNotInlinedCallSites, F);
1336 return Changed;
1337}
1338
// Tries to inline the direct call described by \p Candidate. On success,
// optionally reports the call sites exposed by inlining through
// \p InlinedCallSites and returns true; returns false otherwise.
1339bool SampleProfileLoader::tryInlineCandidate(
1340 InlineCandidate &Candidate, SmallVector<CallBase *, 8> *InlinedCallSites) {
1341 // Do not attempt to inline a candidate if
1342 // --disable-sample-loader-inlining is true.
// NOTE(review): listing line 1343 is missing; the condition guarding the
// `return false` below presumably sits there -- confirm upstream.
1344 return false;
1345
// Capture everything needed for remarks up front: the call instruction is
// gone once InlineFunction succeeds.
1346 CallBase &CB = *Candidate.CallInstr;
1347 Function *CalledFunction = CB.getCalledFunction();
1348 assert(CalledFunction && "Expect a callee with definition");
1349 DebugLoc DLoc = CB.getDebugLoc();
1350 BasicBlock *BB = CB.getParent();
1351
1352 InlineCost Cost = shouldInlineCandidate(Candidate);
1353 if (Cost.isNever()) {
1354 ORE->emit(OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(),
1355 "InlineFail", DLoc, BB)
1356 << "incompatible inlining");
1357 return false;
1358 }
1359
// A cost that converts to false means "do not inline"; no remark is emitted.
1360 if (!Cost)
1361 return false;
1362
1363 InlineFunctionInfo IFI(GetAC);
1364 IFI.UpdateProfile = false;
1365 InlineResult IR = InlineFunction(CB, IFI,
1366 /*MergeAttributes=*/true);
1367 if (!IR.isSuccess())
1368 return false;
1369
1370 // The call to InlineFunction erases I, so we can't pass it here.
1371 emitInlinedIntoBasedOnCost(*ORE, DLoc, BB, *CalledFunction, *BB->getParent(),
1372 Cost, true, getAnnotatedRemarkPassName());
1373
1374 // Now populate the list of newly exposed call sites.
1375 if (InlinedCallSites) {
1376 InlinedCallSites->clear();
1377 for (auto &I : IFI.InlinedCallSites)
1378 InlinedCallSites->push_back(I);
1379 }
1380
// NOTE(review): listing line 1381 is missing; a condition guarding the
// ContextTracker call below presumably sits there -- confirm upstream.
1382 ContextTracker->markContextSamplesInlined(Candidate.CalleeSamples);
1383 ++NumCSInlined;
1384
1385 // Prorate inlined probes for a duplicated inlining callsite which probably
1386 // has a distribution less than 100%. Samples for an inlinee should be
1387 // distributed among the copies of the original callsite based on each
1388 // callsite's distribution factor for counts accuracy. Note that an inlined
1389 // probe may come with its own distribution factor if it has been duplicated
1390 // in the inlinee body. The two factor are multiplied to reflect the
1391 // aggregation of duplication.
1392 if (Candidate.CallsiteDistribution < 1) {
1393 for (auto &I : IFI.InlinedCallSites) {
1394 if (std::optional<PseudoProbe> Probe = extractProbe(*I))
1395 setProbeDistributionFactor(*I, Probe->Factor *
1396 Candidate.CallsiteDistribution);
1397 }
1398 NumDuplicatedInlinesite++;
1399 }
1400
1401 return true;
1402}
1403
1404bool SampleProfileLoader::getInlineCandidate(InlineCandidate *NewCandidate,
1405 CallBase *CB) {
1406 assert(CB && "Expect non-null call instruction");
1407
1408 if (isa<IntrinsicInst>(CB))
1409 return false;
1410
1411 // Find the callee's profile. For indirect call, find hottest target profile.
1412 const FunctionSamples *CalleeSamples = findCalleeFunctionSamples(*CB);
1413 // If ExternalInlineAdvisor wants to inline this site, do so even
1414 // if Samples are not present.
1415 if (!CalleeSamples && !getExternalInlineAdvisorShouldInline(*CB))
1416 return false;
1417
1418 float Factor = 1.0;
1419 if (std::optional<PseudoProbe> Probe = extractProbe(*CB))
1420 Factor = Probe->Factor;
1421
1422 uint64_t CallsiteCount =
1423 CalleeSamples ? CalleeSamples->getHeadSamplesEstimate() * Factor : 0;
1424 *NewCandidate = {CB, CalleeSamples, CallsiteCount, Factor};
1425 return true;
1426}
1427
1428std::optional<InlineCost>
1429SampleProfileLoader::getExternalInlineAdvisorCost(CallBase &CB) {
1430 std::unique_ptr<InlineAdvice> Advice = nullptr;
1431 if (ExternalInlineAdvisor) {
1432 Advice = ExternalInlineAdvisor->getAdvice(CB);
1433 if (Advice) {
1434 if (!Advice->isInliningRecommended()) {
1435 Advice->recordUnattemptedInlining();
1436 return InlineCost::getNever("not previously inlined");
1437 }
1438 Advice->recordInlining();
1439 return InlineCost::getAlways("previously inlined");
1440 }
1441 }
1442
1443 return {};
1444}
1445
1446bool SampleProfileLoader::getExternalInlineAdvisorShouldInline(CallBase &CB) {
1447 std::optional<InlineCost> Cost = getExternalInlineAdvisorCost(CB);
1448 return Cost ? !!*Cost : false;
1449}
1450
// Computes the inline cost used to decide whether \p Candidate is inlined.
// NOTE(review): listing line 1451 is missing; the return type presumably sits
// there (the body returns InlineCost values) -- confirm upstream.
1452SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) {
// A verdict from the external advisor, when available, overrides everything.
1453 if (std::optional<InlineCost> ReplayCost =
1454 getExternalInlineAdvisorCost(*Candidate.CallInstr))
1455 return *ReplayCost;
1456 // Adjust threshold based on call site hotness, only do this for callsite
1457 // prioritized inliner because otherwise cost-benefit check is done earlier.
1458 int SampleThreshold = SampleColdCallSiteThreshold;
// NOTE(review): listing line 1459 is missing and the `}` at 1464 has no
// visible opener; the conditional described by the comment above presumably
// sits there -- confirm upstream.
1460 if (Candidate.CallsiteCount > PSI->getHotCountThreshold())
1461 SampleThreshold = SampleHotCallSiteThreshold;
1462 else if (!ProfileSizeInline)
1463 return InlineCost::getNever("cold callsite");
1464 }
1465
1466 Function *Callee = Candidate.CallInstr->getCalledFunction();
1467 assert(Callee && "Expect a definition for inline candidate of direct call");
1468
1469 InlineParams Params = getInlineParams();
1470 // We will ignore the threshold from inline cost, so always get full cost.
1471 Params.ComputeFullInlineCost = true;
// NOTE(review): listing line 1472 is missing; presumably one more Params
// setting -- confirm upstream.
1473 // Checks if there is anything in the reachable portion of the callee at
1474 // this callsite that makes this inlining potentially illegal. Need to
1475 // set ComputeFullInlineCost, otherwise getInlineCost may return early
1476 // when cost exceeds threshold without checking all IRs in the callee.
1477 // The actual cost does not matter because we only check isNever() to
1478 // see if it is legal to inline the callsite.
1479 InlineCost Cost = getInlineCost(*Candidate.CallInstr, Callee, Params,
1480 GetTTI(*Callee), GetAC, GetTLI);
1481
1482 // Honor always inline and never inline from call analyzer
1483 if (Cost.isNever() || Cost.isAlways())
1484 return Cost;
1485
1486 // With CSSPGO, the preinliner in llvm-profgen can estimate global inline
1487 // decisions based on hotness as well as accurate function byte sizes for
1488 // given context using function/inlinee sizes from previous build. It
1489 // stores the decision in profile, and also adjust/merge context profile
1490 // aiming at better context-sensitive post-inline profile quality, assuming
1491 // all inline decision estimates are going to be honored by compiler. Here
1492 // we replay that inline decision under `sample-profile-use-preinliner`.
1493 // Note that we don't need to handle negative decision from preinliner as
1494 // context profile for not inlined calls are merged by preinliner already.
1495 if (UsePreInlinerDecision && Candidate.CalleeSamples) {
1496 // Once two node are merged due to promotion, we're losing some context
1497 // so the original context-sensitive preinliner decision should be ignored
1498 // for SyntheticContext.
1499 SampleContext &Context = Candidate.CalleeSamples->getContext();
1500 if (!Context.hasState(SyntheticContext) &&
1501 Context.hasAttribute(ContextShouldBeInlined))
1502 return InlineCost::getAlways("preinliner");
1503 }
1504
1505 // For old FDO inliner, we inline the call site as long as cost is not
1506 // "Never". The cost-benefit check is done earlier.
// NOTE(review): listing line 1507 is missing and the `}` at 1509 has no
// visible opener; the condition selecting the "old FDO inliner" case
// presumably sits there -- confirm upstream.
1508 return InlineCost::get(Cost.getCost(), INT_MAX);
1509 }
1510
1511 // Otherwise only use the cost from call analyzer, but overwrite threshold with
1512 // Sample PGO threshold.
1513 return InlineCost::get(Cost.getCost(), SampleThreshold);
1514}
1515
// Priority-queue driven counterpart of inlineHotFunctions: call sites of \p F
// are processed from a CandidateQueue until it is empty or \p F reaches the
// computed size limit. Returns true if anything was inlined.
1516bool SampleProfileLoader::inlineHotFunctionsWithPriority(
1517 Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
1518 // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure
1519 // Profile symbol list is ignored when profile-sample-accurate is on.
// NOTE(review): listing line 1521 is missing; the first operand of the inner
// parenthesised condition presumably sits there -- confirm upstream.
1520 assert((!ProfAccForSymsInList ||
1522 !F.hasFnAttribute("profile-sample-accurate"))) &&
1523 "ProfAccForSymsInList should be false when profile-sample-accurate "
1524 "is enabled");
1525
1526 // Populating worklist with initial call sites from root inliner, along
1527 // with call site weights.
1528 CandidateQueue CQueue;
1529 InlineCandidate NewCandidate;
1530 for (auto &BB : F) {
1531 for (auto &I : BB) {
1532 auto *CB = dyn_cast<CallBase>(&I);
1533 if (!CB)
1534 continue;
1535 if (getInlineCandidate(&NewCandidate, CB))
1536 CQueue.push(NewCandidate);
1537 }
1538 }
1539
1540 // Cap the size growth from profile guided inlining. This is needed even
1541 // though cost of each inline candidate already accounts for callee size,
1542 // because with top-down inlining, we can grow inliner size significantly
1543 // with large number of smaller inlinees each pass the cost check.
// NOTE(review): listing line 1544 is missing; the assert whose message
// follows presumably starts there -- confirm upstream.
1545 "Max inline size limit should not be smaller than min inline size "
1546 "limit.");
// The limit is the current size scaled by the growth factor, clamped to
// [ProfileInlineLimitMin, ProfileInlineLimitMax]; an external advisor lifts
// the limit entirely.
1547 unsigned SizeLimit = F.getInstructionCount() * ProfileInlineGrowthLimit;
1548 SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax);
1549 SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin);
1550 if (ExternalInlineAdvisor)
1551 SizeLimit = std::numeric_limits<unsigned>::max();
1552
1553 MapVector<CallBase *, const FunctionSamples *> LocalNotInlinedCallSites;
1554
1555 // Perform iterative BFS call site prioritized inlining
1556 bool Changed = false;
1557 while (!CQueue.empty() && F.getInstructionCount() < SizeLimit) {
1558 InlineCandidate Candidate = CQueue.top();
1559 CQueue.pop();
1560 CallBase *I = Candidate.CallInstr;
1561 Function *CalledFunction = I->getCalledFunction();
1562
// Recursive calls are never inlined.
1563 if (CalledFunction == &F)
1564 continue;
1565 if (I->isIndirectCall()) {
1566 uint64_t Sum = 0;
1567 auto CalleeSamples = findIndirectCallFunctionSamples(*I, Sum);
// SumOrigin keeps the unscaled total; Sum is scaled by this call site's
// distribution factor and updated as targets are promoted.
1568 uint64_t SumOrigin = Sum;
1569 Sum *= Candidate.CallsiteDistribution;
1570 unsigned ICPCount = 0;
1571 for (const auto *FS : CalleeSamples) {
1572 // TODO: Consider disable pre-lTO ICP for MonoLTO as well
1573 if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
1574 findExternalInlineCandidate(I, FS, InlinedGUIDs,
1575 PSI->getOrCompHotCountThreshold());
1576 continue;
1577 }
1578 uint64_t EntryCountDistributed =
1579 FS->getHeadSamplesEstimate() * Candidate.CallsiteDistribution;
1580 // In addition to regular inline cost check, we also need to make sure
1581 // ICP isn't introducing excessive speculative checks even if individual
1582 // target looks beneficial to promote and inline. That means we should
1583 // only do ICP when there's a small number dominant targets.
1584 if (ICPCount >= ProfileICPRelativeHotnessSkip &&
1585 EntryCountDistributed * 100 < SumOrigin * ProfileICPRelativeHotness)
1586 break;
1587 // TODO: Fix CallAnalyzer to handle all indirect calls.
1588 // For indirect call, we don't run CallAnalyzer to get InlineCost
1589 // before actual inlining. This is because we could see two different
1590 // types from the same definition, which makes CallAnalyzer choke as
1591 // it's expecting matching parameter type on both caller and callee
1592 // side. See example from PR18962 for the triggering cases (the bug was
1593 // fixed, but we generate different types).
// Targets arrive ordered by descending head-sample estimate (see
// findIndirectCallFunctionSamples), so the first non-hot target ends the scan.
1594 if (!PSI->isHotCount(EntryCountDistributed))
1595 break;
1596 SmallVector<CallBase *, 8> InlinedCallSites;
1597 // Attach function profile for promoted indirect callee, and update
1598 // call site count for the promoted inline candidate too.
1599 Candidate = {I, FS, EntryCountDistributed,
1600 Candidate.CallsiteDistribution};
1601 if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum,
1602 &InlinedCallSites)) {
// Newly exposed call sites become candidates themselves.
1603 for (auto *CB : InlinedCallSites) {
1604 if (getInlineCandidate(&NewCandidate, CB))
1605 CQueue.emplace(NewCandidate);
1606 }
1607 ICPCount++;
1608 Changed = true;
1609 } else if (!ContextTracker) {
1610 LocalNotInlinedCallSites.insert({I, FS});
1611 }
1612 }
1613 } else if (CalledFunction && CalledFunction->getSubprogram() &&
1614 !CalledFunction->isDeclaration()) {
1615 SmallVector<CallBase *, 8> InlinedCallSites;
1616 if (tryInlineCandidate(Candidate, &InlinedCallSites)) {
1617 for (auto *CB : InlinedCallSites) {
1618 if (getInlineCandidate(&NewCandidate, CB))
1619 CQueue.emplace(NewCandidate);
1620 }
1621 Changed = true;
1622 } else if (!ContextTracker) {
1623 LocalNotInlinedCallSites.insert({I, Candidate.CalleeSamples});
1624 }
1625 } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
1626 findExternalInlineCandidate(I, findCalleeFunctionSamples(*I),
1627 InlinedGUIDs,
1628 PSI->getOrCompHotCountThreshold());
1629 }
1630 }
1631
// Candidates left in the queue mean the loop stopped on the size limit;
// record which of the three limits was the effective one.
1632 if (!CQueue.empty()) {
1633 if (SizeLimit == (unsigned)ProfileInlineLimitMax)
1634 ++NumCSInlinedHitMaxLimit;
1635 else if (SizeLimit == (unsigned)ProfileInlineLimitMin)
1636 ++NumCSInlinedHitMinLimit;
1637 else
1638 ++NumCSInlinedHitGrowthLimit;
1639 }
1640
1641 // For CS profile, profile for not inlined context will be merged when
1642 // base profile is being retrieved.
// NOTE(review): listing line 1643 is missing; a condition guarding the call
// below presumably sits there -- confirm upstream.
1644 promoteMergeNotInlinedContextSamples(LocalNotInlinedCallSites, F);
1645 return Changed;
1646}
1647
// For every recorded call site of \p F that had a profile but was not
// inlined, emits a "NotInline" remark and folds the inlinee profile either
// into the callee's outline profile (ProfileMergeInlinee) or into
// notInlinedCallInfo.
1648void SampleProfileLoader::promoteMergeNotInlinedContextSamples(
// NOTE(review): listing line 1649 is missing; the `NonInlinedCallSites`
// parameter iterated below is presumably declared there -- confirm upstream.
1650 const Function &F) {
1651 // Accumulate not inlined callsite information into notInlinedSamples
1652 for (const auto &Pair : NonInlinedCallSites) {
1653 CallBase *I = Pair.first;
1654 Function *Callee = I->getCalledFunction();
// Only calls to functions defined in this module are processed.
1655 if (!Callee || Callee->isDeclaration())
1656 continue;
1657
1658 ORE->emit(
1659 OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(), "NotInline",
1660 I->getDebugLoc(), I->getParent())
1661 << "previous inlining not repeated: '" << ore::NV("Callee", Callee)
1662 << "' into '" << ore::NV("Caller", &F) << "'");
1663
1664 ++NumCSNotInlined;
// NOTE(review): FS is dereferenced without a null check. getInlineCandidate
// can yield a candidate with null CalleeSamples, which
// inlineHotFunctionsWithPriority may store in its not-inlined map -- confirm
// that combination cannot reach this point.
1665 const FunctionSamples *FS = Pair.second;
1666 if (FS->getTotalSamples() == 0 && FS->getHeadSamplesEstimate() == 0) {
1667 continue;
1668 }
1669
1670 // Do not merge a context that is already duplicated into the base profile.
1671 if (FS->getContext().hasAttribute(sampleprof::ContextDuplicatedIntoBase))
1672 continue;
1673
1674 if (ProfileMergeInlinee) {
1675 // A function call can be replicated by optimizations like callsite
1676 // splitting or jump threading and the replicates end up sharing the
1677 // sample nested callee profile instead of slicing the original
1678 // inlinee's profile. We want to do merge exactly once by filtering out
1679 // callee profiles with a non-zero head sample count.
1680 if (FS->getHeadSamples() == 0) {
1681 // Use entry samples as head samples during the merge, as inlinees
1682 // don't have head samples.
// The const_cast mutates the shared profile; the non-zero head count it
// leaves behind is what makes the enclosing check skip later replicas.
1683 const_cast<FunctionSamples *>(FS)->addHeadSamples(
1684 FS->getHeadSamplesEstimate());
1685
1686 // Note that we have to do the merge right after processing function.
1687 // This allows OutlineFS's profile to be used for annotation during
1688 // top-down processing of functions' annotation.
1689 FunctionSamples *OutlineFS = Reader->getSamplesFor(*Callee);
1690 // If outlined function does not exist in the profile, add it to a
1691 // separate map so that it does not rehash the original profile.
// NOTE(review): listing line 1694 is missing; the subscript key and closing
// `];` of the statement below presumably sit there -- confirm upstream.
1692 if (!OutlineFS)
1693 OutlineFS = &OutlineFunctionSamples[
1695 OutlineFS->merge(*FS, 1);
1696 // Set outlined profile to be synthetic to not bias the inliner.
1697 OutlineFS->SetContextSynthetic();
1698 }
1699 } else {
// Otherwise only accumulate the entry count per callee.
1700 auto pair =
1701 notInlinedCallInfo.try_emplace(Callee, NotInlinedProfileInfo{0});
1702 pair.first->second.entryCount += FS->getHeadSamplesEstimate();
1703 }
1704 }
1705}
1706
1707/// Returns the sorted CallTargetMap \p M by count in descending order.
/// Each returned InstrProfValueData is built from the hash code of a call
/// target and the value associated with that target in the sorted sequence.
1711 for (const auto &I : SampleRecord::SortCallTargets(M)) {
// I.first identifies the call target; only its hash code is stored.
1712 R.emplace_back(
1713 InstrProfValueData{I.first.getHashCode(), I.second});
1714 }
1715 return R;
1716}
1717
1718// Generate MD_prof metadata for every branch instruction using the
1719// edge weights computed during propagation.
//
// In addition to terminators, call and invoke instructions in blocks with a
// non-zero block weight are annotated here: indirect calls get value profile
// metadata built from the profiled call targets, and other non-intrinsic calls
// get the weight of their enclosing block.
1720void SampleProfileLoader::generateMDProfMetadata(Function &F) {
1721 // Generate MD_prof metadata for every branch instruction using the
1722 // edge weights computed during propagation.
1723 LLVM_DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n");
1724 LLVMContext &Ctx = F.getContext();
1725 MDBuilder MDB(Ctx);
1726 for (auto &BI : F) {
1727 BasicBlock *BB = &BI;
1728
1729 if (BlockWeights[BB]) {
1730 for (auto &I : *BB) {
1731 if (!isa<CallInst>(I) && !isa<InvokeInst>(I))
1732 continue;
// No statically known callee: annotate from the profiled call targets.
1733 if (!cast<CallBase>(I).getCalledFunction()) {
1734 const DebugLoc &DLoc = I.getDebugLoc();
1735 if (!DLoc)
1736 continue;
1737 const DILocation *DIL = DLoc;
1738 const FunctionSamples *FS = findFunctionSamples(I);
1739 if (!FS)
1740 continue;
1743 FS->findCallTargetMapAt(CallSite);
1744 if (!T || T.get().empty())
1745 continue;
1747 // Prorate the callsite counts based on the pre-ICP distribution
1748 // factor to reflect what is already done to the callsite before
1749 // ICP, such as callsite cloning.
1750 if (std::optional<PseudoProbe> Probe = extractProbe(I)) {
1751 if (Probe->Factor < 1)
1752 T = SampleRecord::adjustCallTargets(T.get(), Probe->Factor);
1753 }
1754 }
1755 SmallVector<InstrProfValueData, 2> SortedCallTargets =
1757 uint64_t Sum = 0;
1758 for (const auto &C : T.get())
1759 Sum += C.second;
1760 // With CSSPGO all indirect call targets are counted towards the
1761 // original indirect call site in the profile, including both
1762 // inlined and non-inlined targets.
1764 if (const FunctionSamplesMap *M =
1765 FS->findFunctionSamplesMapAt(CallSite)) {
1766 for (const auto &NameFS : *M)
1767 Sum += NameFS.second.getHeadSamplesEstimate();
1768 }
1769 }
1770 if (Sum)
1771 updateIDTMetaData(I, SortedCallTargets, Sum);
1772 else if (OverwriteExistingWeights)
1773 I.setMetadata(LLVMContext::MD_prof, nullptr);
1774 } else if (!isa<IntrinsicInst>(&I)) {
// The 64-bit block weight is narrowed to 32 bits for the call weight.
1775 setBranchWeights(I, {static_cast<uint32_t>(BlockWeights[BB])});
1776 }
1777 }
1779 // Set profile metadata (possibly annotated by LTO prelink) to zero or
1780 // clear it for cold code.
1781 for (auto &I : *BB) {
1782 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
1783 if (cast<CallBase>(I).isIndirectCall()) {
1784 I.setMetadata(LLVMContext::MD_prof, nullptr);
1785 } else {
1787 }
1788 }
1789 }
1790 }
1791
1792 Instruction *TI = BB->getTerminator();
// A terminator with a single successor needs no branch weights.
1793 if (TI->getNumSuccessors() == 1)
1794 continue;
1795 if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI) &&
1796 !isa<IndirectBrInst>(TI))
1797 continue;
1798
1799 DebugLoc BranchLoc = TI->getDebugLoc();
1800 LLVM_DEBUG(dbgs() << "\nGetting weights for branch at line "
1801 << ((BranchLoc) ? Twine(BranchLoc.getLine())
1802 : Twine("<UNKNOWN LOCATION>"))
1803 << ".\n");
1805 uint32_t MaxWeight = 0;
// MaxDestInst is left uninitialized on purpose: it is assigned together with
// MaxWeight below and is only read when MaxWeight > 0.
1806 Instruction *MaxDestInst;
1807 // Since profi treats multiple edges (multiway branches) as a single edge,
1808 // we need to distribute the computed weight among the branches. We do
1809 // this by evenly splitting the edge weight among destinations.
1811 std::vector<uint64_t> EdgeIndex;
1813 EdgeIndex.resize(TI->getNumSuccessors());
1814 for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {
1815 const BasicBlock *Succ = TI->getSuccessor(I);
// EdgeIndex[I] is the ordinal of this edge among edges to the same successor.
1816 EdgeIndex[I] = EdgeMultiplicity[Succ];
1817 EdgeMultiplicity[Succ]++;
1818 }
1819 }
1820 for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {
1821 BasicBlock *Succ = TI->getSuccessor(I);
1822 Edge E = std::make_pair(BB, Succ);
1823 uint64_t Weight = EdgeWeights[E];
1824 LLVM_DEBUG(dbgs() << "\t"; printEdgeWeight(dbgs(), E));
1825 // Use uint32_t saturated arithmetic to adjust the incoming weights,
1826 // if needed. Sample counts in profiles are 64-bit unsigned values,
1827 // but internally branch weights are expressed as 32-bit values.
1828 if (Weight > std::numeric_limits<uint32_t>::max()) {
1829 LLVM_DEBUG(dbgs() << " (saturated due to uint32_t overflow)");
1830 Weight = std::numeric_limits<uint32_t>::max();
1831 }
1832 if (!SampleProfileUseProfi) {
1833 // Weight is added by one to avoid propagation errors introduced by
1834 // 0 weights.
1835 Weights.push_back(static_cast<uint32_t>(Weight + 1));
1836 } else {
1837 // Profi creates proper weights that do not require "+1" adjustments but
1838 // we evenly split the weight among branches with the same destination.
1839 uint64_t W = Weight / EdgeMultiplicity[Succ];
1840 // Rounding up, if needed, so that first branches are hotter.
1841 if (EdgeIndex[I] < Weight % EdgeMultiplicity[Succ])
1842 W++;
1843 Weights.push_back(static_cast<uint32_t>(W));
1844 }
// Track the hottest successor for the "PopularDest" remark emitted below.
1845 if (Weight != 0) {
1846 if (Weight > MaxWeight) {
1847 MaxWeight = Weight;
1848 MaxDestInst = Succ->getFirstNonPHIOrDbgOrLifetime();
1849 }
1850 }
1851 }
1852
1853 misexpect::checkExpectAnnotations(*TI, Weights, /*IsFrontend=*/false);
1854
1855 uint64_t TempWeight;
1856 // Only set weights if there is at least one non-zero weight.
1857 // In any other case, let the analyzer set weights.
1858 // Do not set weights if the weights are present unless under
1859 // OverwriteExistingWeights. In ThinLTO, the profile annotation is done
1860 // twice. If the first annotation already set the weights, the second pass
1861 // does not need to set it. With OverwriteExistingWeights, Blocks with zero
1862 // weight should have their existing metadata (possibly annotated by LTO
1863 // prelink) cleared.
1864 if (MaxWeight > 0 &&
1865 (!TI->extractProfTotalWeight(TempWeight) || OverwriteExistingWeights)) {
1866 LLVM_DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n");
1867 setBranchWeights(*TI, Weights);
1868 ORE->emit([&]() {
1869 return OptimizationRemark(DEBUG_TYPE, "PopularDest", MaxDestInst)
1870 << "most popular destination for conditional branches at "
1871 << ore::NV("CondBranchesLoc", BranchLoc);
1872 });
1873 } else {
1875 TI->setMetadata(LLVMContext::MD_prof, nullptr);
1876 LLVM_DEBUG(dbgs() << "CLEARED. All branch weights are zero.\n");
1877 } else {
1878 LLVM_DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n");
1879 }
1880 }
1881 }
1882}
1883
1884/// Once all the branch weights are computed, we emit the MD_prof
1885/// metadata on BB using the computed values for each of its branches.
1886///
/// The function first validates the profile (returning false when the
/// probe-based profile is invalid for \p F, or when getFunctionLoc(F) is 0),
/// then runs sample-profile inlining, computes and propagates weights, and
/// finally generates the MD_prof metadata if anything changed. Coverage
/// remarks are emitted regardless of whether anything changed.
///
1887/// \param F The function to query.
1888///
1889/// \returns true if \p F was modified. Returns false, otherwise.
1890bool SampleProfileLoader::emitAnnotations(Function &F) {
1891 bool Changed = false;
1892
1894 if (!ProbeManager->profileIsValid(F, *Samples)) {
1895 LLVM_DEBUG(
1896 dbgs() << "Profile is invalid due to CFG mismatch for Function "
1897 << F.getName() << "\n");
1898 ++NumMismatchedProfile;
1900 return false;
1901 }
1902 ++NumMatchedProfile;
1903 } else {
// A function location of 0 means there is nothing to annotate against.
1904 if (getFunctionLoc(F) == 0)
1905 return false;
1906
1907 LLVM_DEBUG(dbgs() << "Line number for the first instruction in "
1908 << F.getName() << ": " << getFunctionLoc(F) << "\n");
1909 }
1910
// GUIDs collected during inlining are passed on to weight computation.
1911 DenseSet<GlobalValue::GUID> InlinedGUIDs;
1913 Changed |= inlineHotFunctionsWithPriority(F, InlinedGUIDs);
1914 else
1915 Changed |= inlineHotFunctions(F, InlinedGUIDs);
1916
1917 Changed |= computeAndPropagateWeights(F, InlinedGUIDs);
1918
// Metadata is only generated when inlining or propagation changed something.
1919 if (Changed)
1920 generateMDProfMetadata(F);
1921
1922 emitCoverageRemarks(F);
1923 return Changed;
1924}
1925
/// Build a call graph from profile data for module \p M.
///
/// The graph is constructed either from the context tracker or from the
/// profiles held by the reader, and functions of \p M are then added to it so
/// that functions without a profile are still visited.
///
/// \returns the newly created profiled call graph.
1926std::unique_ptr<ProfiledCallGraph>
1927SampleProfileLoader::buildProfiledCallGraph(Module &M) {
1928 std::unique_ptr<ProfiledCallGraph> ProfiledCG;
1930 ProfiledCG = std::make_unique<ProfiledCallGraph>(*ContextTracker);
1931 else
1932 ProfiledCG = std::make_unique<ProfiledCallGraph>(Reader->getProfiles());
1933
1934 // Add all functions into the profiled call graph even if they are not in
1935 // the profile. This makes sure functions missing from the profile still
1936 // get a chance to be processed.
1937 for (Function &F : M) {
1939 continue;
1940 ProfiledCG->addProfiledFunction(
1942 }
1943
1944 return ProfiledCG;
1945}
1946
/// Compute the order in which the functions of \p M are processed.
///
/// Without ProfileTopDownLoad the functions are returned in module order (and
/// ProfileMergeInlinee is turned off). Otherwise the order is derived from the
/// SCCs of either the profiled call graph or the static call graph \p CG, and
/// the collected list is reversed before being returned.
///
/// \returns the list of functions in processing order.
1947std::vector<Function *>
1948SampleProfileLoader::buildFunctionOrder(Module &M, LazyCallGraph &CG) {
1949 std::vector<Function *> FunctionOrderList;
1950 FunctionOrderList.reserve(M.size());
1951
1953 errs() << "WARNING: -use-profiled-call-graph ignored, should be used "
1954 "together with -sample-profile-top-down-load.\n";
1955
1956 if (!ProfileTopDownLoad) {
1957 if (ProfileMergeInlinee) {
1958 // Disable ProfileMergeInlinee if profile is not loaded in top down order,
1959 // because the profile for a function may be used for the profile
1960 // annotation of its outline copy before the profile merging of its
1961 // non-inlined inline instances, and that is not the way how
1962 // ProfileMergeInlinee is supposed to work.
1963 ProfileMergeInlinee = false;
1964 }
1965
1966 for (Function &F : M)
1968 FunctionOrderList.push_back(&F);
1969 return FunctionOrderList;
1970 }
1971
1973 !UseProfiledCallGraph.getNumOccurrences())) {
1974 // Use profiled call edges to augment the top-down order. There are cases
1975 // that the top-down order computed based on the static call graph doesn't
1976 // reflect real execution order. For example
1977 //
1978 // 1. Incomplete static call graph due to unknown indirect call targets.
1979 // Adjusting the order by considering indirect call edges from the
1980 // profile can enable the inlining of indirect call targets by allowing
1981 // the caller processed before them.
1982 // 2. Mutual call edges in an SCC. The static processing order computed for
1983 // an SCC may not reflect the call contexts in the context-sensitive
1984 // profile, thus may cause potential inlining to be overlooked. The
1985 // function order in one SCC is being adjusted to a top-down order based
1986 // on the profile to favor more inlining. This is only a problem with CS
1987 // profile.
1988 // 3. Transitive indirect call edges due to inlining. When a callee function
1989 // (say B) is inlined into a caller function (say A) in LTO prelink,
1990 // every call edge originated from the callee B will be transferred to
1991 // the caller A. If any transferred edge (say A->C) is indirect, the
1992 // original profiled indirect edge B->C, even if considered, would not
1993 // enforce a top-down order from the caller A to the potential indirect
1994 // call target C in LTO postlink since the inlined callee B is gone from
1995 // the static call graph.
1996 // 4. #3 can happen even for direct call targets, due to functions defined
1997 // in header files. A header function (say A), when included into source
1998 // files, is defined multiple times but only one definition survives due
1999 // to ODR. Therefore, the LTO prelink inlining done on those dropped
2000 // definitions can be useless based on a local file scope. More
2001 // importantly, the inlinee (say B), once fully inlined to a
2002 // to-be-dropped A, will have no profile to consume when its outlined
2003 // version is compiled. This can lead to a profile-less prelink
2004 // compilation for the outlined version of B which may be called from
2005 // external modules. While this isn't easy to fix, we rely on the
2006 // postlink AutoFDO pipeline to optimize B. Since the survived copy of
2007 // the A can be inlined in its local scope in prelink, it may not exist
2008 // in the merged IR in postlink, and we'll need the profiled call edges
2009 // to enforce a top-down order for the rest of the functions.
2010 //
2011 // Considering those cases, a profiled call graph completely independent of
2012 // the static call graph is constructed based on profile data, where
2013 // function objects are not even needed to handle case #3 and case #4.
2014 //
2015 // Note that static callgraph edges are completely ignored since they
2016 // can be conflicting with profiled edges for cyclic SCCs and may result in
2017 // an SCC order incompatible with profile-defined one. Using strictly
2018 // profile order ensures a maximum inlining experience. On the other hand,
2019 // static call edges are not so important when they don't correspond to a
2020 // context in the profile.
2021
2022 std::unique_ptr<ProfiledCallGraph> ProfiledCG = buildProfiledCallGraph(M);
2023 scc_iterator<ProfiledCallGraph *> CGI = scc_begin(ProfiledCG.get());
2024 while (!CGI.isAtEnd()) {
2025 auto Range = *CGI;
2026 if (SortProfiledSCC) {
2027 // Sort nodes in one SCC based on callsite hotness.
2029 Range = *SI;
2030 }
2031 for (auto *Node : Range) {
// Graph nodes carry names only; map each back to a function via SymbolMap.
2032 Function *F = SymbolMap.lookup(Node->Name);
2033 if (F && !skipProfileForFunction(*F))
2034 FunctionOrderList.push_back(F);
2035 }
2036 ++CGI;
2037 }
2038 } else {
2039 CG.buildRefSCCs();
2040 for (LazyCallGraph::RefSCC &RC : CG.postorder_ref_sccs()) {
2041 for (LazyCallGraph::SCC &C : RC) {
2042 for (LazyCallGraph::Node &N : C) {
2043 Function &F = N.getFunction();
2045 FunctionOrderList.push_back(&F);
2046 }
2047 }
2048 }
2049 }
2050
// The list was collected from SCC traversal; reverse it for processing.
2051 std::reverse(FunctionOrderList.begin(), FunctionOrderList.end());
2052
2053 LLVM_DEBUG({
2054 dbgs() << "Function processing order:\n";
2055 for (auto F : FunctionOrderList) {
2056 dbgs() << F->getName() << "\n";
2057 }
2058 });
2059
2060 return FunctionOrderList;
2061}
2062
/// Create the profile reader for module \p M, read the profile, and set up the
/// loader state that depends on the kind of profile that was read.
///
/// \returns false if the profile cannot be opened or read, or if a probe-based
/// profile is used with a module that is not probed; true otherwise.
2063bool SampleProfileLoader::doInitialization(Module &M,
2065 auto &Ctx = M.getContext();
2066
2067 auto ReaderOrErr = SampleProfileReader::create(
2068 Filename, Ctx, *FS, FSDiscriminatorPass::Base, RemappingFilename);
2069 if (std::error_code EC = ReaderOrErr.getError()) {
2070 std::string Msg = "Could not open profile: " + EC.message();
2071 Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
2072 return false;
2073 }
2074 Reader = std::move(ReaderOrErr.get());
2075 Reader->setSkipFlatProf(LTOPhase == ThinOrFullLTOPhase::ThinLTOPostLink);
2076 // set module before reading the profile so reader may be able to only
2077 // read the function profiles which are used by the current module.
2078 Reader->setModule(&M);
2079 if (std::error_code EC = Reader->read()) {
2080 std::string Msg = "profile reading failed: " + EC.message();
2081 Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
2082 return false;
2083 }
2084
2085 PSL = Reader->getProfileSymbolList();
2086
2087 // While profile-sample-accurate is on, ignore symbol list.
2088 ProfAccForSymsInList =
2090 if (ProfAccForSymsInList) {
// Rebuild the name/GUID sets from the reader's name table.
2091 NamesInProfile.clear();
2092 GUIDsInProfile.clear();
2093 if (auto NameTable = Reader->getNameTable()) {
2095 for (auto Name : *NameTable)
2096 GUIDsInProfile.insert(Name.getHashCode());
2097 } else {
2098 for (auto Name : *NameTable)
2099 NamesInProfile.insert(Name.stringRef());
2100 }
2101 }
2102 CoverageTracker.setProfAccForSymsInList(true);
2103 }
2104
2105 if (FAM && !ProfileInlineReplayFile.empty()) {
2106 ExternalInlineAdvisor = getReplayInlineAdvisor(
2107 M, *FAM, Ctx, /*OriginalAdvisor=*/nullptr,
2112 /*EmitRemarks=*/false, InlineContext{LTOPhase, InlinePass::ReplaySampleProfileInliner});
2113 }
2114
2115 // Apply tweaks if context-sensitive or probe-based profile is available.
// Each option is only changed when it was not given explicitly
// (getNumOccurrences() == 0).
2116 if (Reader->profileIsCS() || Reader->profileIsPreInlined() ||
2117 Reader->profileIsProbeBased()) {
2118 if (!UseIterativeBFIInference.getNumOccurrences())
2120 if (!SampleProfileUseProfi.getNumOccurrences())
2121 SampleProfileUseProfi = true;
2122 if (!EnableExtTspBlockPlacement.getNumOccurrences())
2124 // Enable priority-based inliner and size inline by default for CSSPGO.
2125 if (!ProfileSizeInline.getNumOccurrences())
2126 ProfileSizeInline = true;
2127 if (!CallsitePrioritizedInline.getNumOccurrences())
2129 // For CSSPGO, we also allow recursive inline to best use context profile.
2130 if (!AllowRecursiveInline.getNumOccurrences())
2131 AllowRecursiveInline = true;
2132
2133 if (Reader->profileIsPreInlined()) {
2134 if (!UsePreInlinerDecision.getNumOccurrences())
2135 UsePreInlinerDecision = true;
2136 }
2137
2138 // Enable stale profile matching by default for probe-based profile.
2139 // Currently the matching relies on if the checksum mismatch is detected,
2140 // which is currently only available for pseudo-probe mode. Removing the
2141 // checksum check could cause regressions for some cases, so further tuning
2142 // might be needed if we want to enable it for all cases.
2143 if (Reader->profileIsProbeBased() &&
2144 !SalvageStaleProfile.getNumOccurrences()) {
2145 SalvageStaleProfile = true;
2146 }
2147
2148 if (!Reader->profileIsCS()) {
2149 // Non-CS profile should be fine without a function size budget for the
2150 // inliner since the contexts in the profile are either all from inlining
2151 // in the previous build or pre-computed by the preinliner with a size
2152 // cap, thus they are bounded.
2153 if (!ProfileInlineLimitMin.getNumOccurrences())
2154 ProfileInlineLimitMin = std::numeric_limits<unsigned>::max();
2155 if (!ProfileInlineLimitMax.getNumOccurrences())
2156 ProfileInlineLimitMax = std::numeric_limits<unsigned>::max();
2157 }
2158 }
2159
2160 if (Reader->profileIsCS()) {
2161 // Tracker for profiles under different context
2162 ContextTracker = std::make_unique<SampleContextTracker>(
2163 Reader->getProfiles(), &GUIDToFuncNameMap);
2164 }
2165
2166 // Load pseudo probe descriptors for probe-based function samples.
2167 if (Reader->profileIsProbeBased()) {
2168 ProbeManager = std::make_unique<PseudoProbeManager>(M);
2169 if (!ProbeManager->moduleIsProbed(M)) {
2170 const char *Msg =
2171 "Pseudo-probe-based profile requires SampleProfileProbePass";
2172 Ctx.diagnose(DiagnosticInfoSampleProfile(M.getModuleIdentifier(), Msg,
2173 DS_Warning));
2174 return false;
2175 }
2176 }
2177
2180 MatchingManager =
2181 std::make_unique<SampleProfileMatcher>(M, *Reader, ProbeManager.get());
2182 }
2183
2184 return true;
2185}
2186
/// Collect the profile matching anchors of \p F from its IR.
///
/// Instructions without a debug location are ignored. Each collected anchor
/// maps a location to a callee name; the name is empty for a basic block probe
/// and UnknownIndirectCallee for a call without a known callee. Inlined code is
/// flattened to the call site of its top-level inline frame.
///
/// \param F The function whose instructions are scanned.
/// \param IRAnchors Output map that receives the anchors.
2187void SampleProfileMatcher::findIRAnchors(
2188 const Function &F, std::map<LineLocation, StringRef> &IRAnchors) {
2189 // For inlined code, recover the original callsite and callee by finding the
2190 // top-level inline frame. e.g. For frame stack "main:1 @ foo:2 @ bar:3", the
2191 // top-level frame is "main:1", the callsite is "1" and the callee is "foo".
2192 auto FindTopLevelInlinedCallsite = [](const DILocation *DIL) {
2193 assert((DIL && DIL->getInlinedAt()) && "No inlined callsite");
2194 const DILocation *PrevDIL = nullptr;
// Walk the inlinedAt chain; on exit DIL is the outermost frame and PrevDIL
// is the frame directly inlined into it.
2195 do {
2196 PrevDIL = DIL;
2197 DIL = DIL->getInlinedAt();
2198 } while (DIL->getInlinedAt());
2199
2201 StringRef CalleeName = PrevDIL->getSubprogramLinkageName();
2202 return std::make_pair(Callsite, CalleeName);
2203 };
2204
// Canonical name of a direct callee, or UnknownIndirectCallee otherwise.
2205 auto GetCanonicalCalleeName = [](const CallBase *CB) {
2206 StringRef CalleeName = UnknownIndirectCallee;
2207 if (Function *Callee = CB->getCalledFunction())
2208 CalleeName = FunctionSamples::getCanonicalFnName(Callee->getName());
2209 return CalleeName;
2210 };
2211
2212 // Extract profile matching anchors in the IR.
2213 for (auto &BB : F) {
2214 for (auto &I : BB) {
2215 DILocation *DIL = I.getDebugLoc();
2216 if (!DIL)
2217 continue;
2218
2220 if (auto Probe = extractProbe(I)) {
2221 // Flatten inlined IR for the matching.
2222 if (DIL->getInlinedAt()) {
2223 IRAnchors.emplace(FindTopLevelInlinedCallsite(DIL));
2224 } else {
2225 // Use empty StringRef for basic block probe.
2226 StringRef CalleeName;
2227 if (const auto *CB = dyn_cast<CallBase>(&I)) {
2228 // Skip the probe inst whose callee name is "llvm.pseudoprobe".
2229 if (!isa<IntrinsicInst>(&I))
2230 CalleeName = GetCanonicalCalleeName(CB);
2231 }
// Probe anchors are keyed by probe id with a zero discriminator.
2232 IRAnchors.emplace(LineLocation(Probe->Id, 0), CalleeName);
2233 }
2234 }
2235 } else {
2236 // TODO: For line-number based profile(AutoFDO), currently only support
2237 // find callsite anchors. In future, we need to parse all the non-call
2238 // instructions to extract the line locations for profile matching.
2239 if (!isa<CallBase>(&I) || isa<IntrinsicInst>(&I))
2240 continue;
2241
2242 if (DIL->getInlinedAt()) {
2243 IRAnchors.emplace(FindTopLevelInlinedCallsite(DIL));
2244 } else {
// I is known to be a CallBase here (checked above), so dyn_cast is non-null.
2246 StringRef CalleeName = GetCanonicalCalleeName(dyn_cast<CallBase>(&I));
2247 IRAnchors.emplace(Callsite, CalleeName);
2248 }
2249 }
2250 }
2251 }
2252}
2253
2254void SampleProfileMatcher::findProfileAnchors(
2255 const FunctionSamples &FS,
2256 std::map<LineLocation, std::unordered_set<FunctionId>> &ProfileAnchors) {
2257 auto isInvalidLineOffset = [](uint32_t LineOffset) {
2258 return LineOffset & 0x8000;
2259 };
2260
2261 for (const auto &I : FS.getBodySamples()) {
2262 const LineLocation &Loc = I.first;
2263 if (isInvalidLineOffset(Loc.LineOffset))
2264 continue;
2265 for (const auto &I : I.second.getCallTargets()) {
2266 auto Ret = ProfileAnchors.try_emplace(Loc,
2267 std::unordered_set<FunctionId>());
2268 Ret.first->second.insert(I.first);
2269 }
2270 }
2271
2272 for (const auto &I : FS.getCallsiteSamples()) {
2273 const LineLocation &Loc = I.first;
2274 if (isInvalidLineOffset(Loc.LineOffset))
2275 continue;
2276 const auto &CalleeMap = I.second;
2277 for (const auto &I : CalleeMap) {
2278 auto Ret = ProfileAnchors.try_emplace(Loc,
2279 std::unordered_set<FunctionId>());
2280 Ret.first->second.insert(I.first);
2281 }
2282 }
2283}
2284
2285// Call target name anchor based profile fuzzy matching.
2286// Input:
2287// For IR locations, the anchor is the callee name of direct callsite; For
2288// profile locations, it's the call target name for BodySamples or inlinee's
2289// profile name for CallsiteSamples.
2290// Matching heuristic:
2291// First match all the anchors in lexical order, then split the non-anchor
2292// locations between the two anchors evenly, first half are matched based on the
2293// start anchor, second half are matched based on the end anchor.
2294// For example, given:
2295// IR locations: [1, 2(foo), 3, 5, 6(bar), 7]
2296// Profile locations: [1, 2, 3(foo), 4, 7, 8(bar), 9]
2297// The matching gives:
2298// [1, 2(foo), 3, 5, 6(bar), 7]
2299// | | | | | |
2300// [1, 2, 3(foo), 4, 7, 8(bar), 9]
2301// The output mapping: [2->3, 3->4, 5->7, 6->8, 7->9].
2302void SampleProfileMatcher::runStaleProfileMatching(
2303 const Function &F, const std::map<LineLocation, StringRef> &IRAnchors,
2304 const std::map<LineLocation, std::unordered_set<FunctionId>>
2305 &ProfileAnchors,
2306 LocToLocMap &IRToProfileLocationMap) {
2307 LLVM_DEBUG(dbgs() << "Run stale profile matching for " << F.getName()
2308 << "\n");
2309 assert(IRToProfileLocationMap.empty() &&
2310 "Run stale profile matching only once per function");
2311
2312 std::unordered_map<FunctionId, std::set<LineLocation>>
2313 CalleeToCallsitesMap;
2314 for (const auto &I : ProfileAnchors) {
2315 const auto &Loc = I.first;
2316 const auto &Callees = I.second;
2317 // Filter out possible indirect calls, use direct callee name as anchor.
2318 if (Callees.size() == 1) {
2319 FunctionId CalleeName = *Callees.begin();
2320 const auto &Candidates = CalleeToCallsitesMap.try_emplace(
2321 CalleeName, std::set<LineLocation>());
2322 Candidates.first->second.insert(Loc);
2323 }
2324 }
2325
2326 auto InsertMatching = [&](const LineLocation &From, const LineLocation &To) {
2327 // Skip the unchanged location mapping to save memory.
2328 if (From != To)
2329 IRToProfileLocationMap.insert({From, To});
2330 };
2331
2332 // Use function's beginning location as the initial anchor.
2333 int32_t LocationDelta = 0;
2334 SmallVector<LineLocation> LastMatchedNonAnchors;
2335
2336 for (const auto &IR : IRAnchors) {
2337 const auto &Loc = IR.first;
2338 auto CalleeName = IR.second;
2339 bool IsMatchedAnchor = false;
2340 // Match the anchor location in lexical order.
2341 if (!CalleeName.empty()) {
2342 auto CandidateAnchors = CalleeToCallsitesMap.find(
2343 getRepInFormat(CalleeName));
2344 if (CandidateAnchors != CalleeToCallsitesMap.end() &&
2345 !CandidateAnchors->second.empty()) {
2346 auto CI = CandidateAnchors->second.begin();
2347 const auto Candidate = *CI;
2348 CandidateAnchors->second.erase(CI);
2349 InsertMatching(Loc, Candidate);
2350 LLVM_DEBUG(dbgs() << "Callsite with callee:" << CalleeName
2351 << " is matched from " << Loc << " to " << Candidate
2352 << "\n");
2353 LocationDelta = Candidate.LineOffset - Loc.LineOffset;
2354
2355 // Match backwards for non-anchor locations.
2356 // The locations in LastMatchedNonAnchors have been matched forwards
2357 // based on the previous anchor, spilt it evenly and overwrite the
2358 // second half based on the current anchor.
2359 for (size_t I = (LastMatchedNonAnchors.size() + 1) / 2;
2360 I < LastMatchedNonAnchors.size(); I++) {
2361 const auto &L = LastMatchedNonAnchors[I];
2362 uint32_t CandidateLineOffset = L.LineOffset + LocationDelta;
2363 LineLocation Candidate(CandidateLineOffset, L.Discriminator);
2364 InsertMatching(L, Candidate);
2365 LLVM_DEBUG(dbgs() << "Location is rematched backwards from " << L
2366 << " to " << Candidate << "\n");
2367 }
2368
2369 IsMatchedAnchor = true;
2370 LastMatchedNonAnchors.clear();
2371 }
2372 }
2373
2374 // Match forwards for non-anchor locations.
2375 if (!IsMatchedAnchor) {
2376 uint32_t CandidateLineOffset = Loc.LineOffset + LocationDelta;
2377 LineLocation Candidate(CandidateLineOffset, Loc.Discriminator);
2378 InsertMatching(Loc, Candidate);
2379 LLVM_DEBUG(dbgs() << "Location is matched from " << Loc << " to "
2380 << Candidate << "\n");
2381 LastMatchedNonAnchors.emplace_back(Loc);
2382 }
2383 }
2384}
2385
/// Run profile matching for \p F.
///
/// Does nothing when no flattened profile exists for \p F. Otherwise the IR
/// and profile anchors are collected, call-site match states are recorded, and
/// stale profile matching is run when the profile is not valid for \p F.
2386void SampleProfileMatcher::runOnFunction(const Function &F) {
2387 // We need to use flattened function samples for matching.
2388 // Unlike IR, which includes all callsites from the source code, the callsites
2389 // in profile only show up when they are hit by samples, i.e. the profile
2390 // callsites in one context may differ from those in another context. To get
2391 // the maximum number of callsites, we merge the function profiles from all
2392 // contexts, aka, the flattened profile to find profile anchors.
2393 const auto *FSFlattened = getFlattenedSamplesFor(F);
2394 if (!FSFlattened)
2395 return;
2396
2397 // Anchors for IR. It's a map from IR location to callee name, callee name is
2398 // empty for non-call instruction and use a dummy name(UnknownIndirectCallee)
2399 // for unknown indirect callee name.
2400 std::map<LineLocation, StringRef> IRAnchors;
2401 findIRAnchors(F, IRAnchors);
2402 // Anchors for profile. It's a map from callsite location to a set of callee
2403 // name.
2404 std::map<LineLocation, std::unordered_set<FunctionId>> ProfileAnchors;
2405 findProfileAnchors(*FSFlattened, ProfileAnchors);
2406
2407 // Compute the callsite match states for profile staleness report.
// A null location map marks this as the pre-match recording.
2409 recordCallsiteMatchStates(F, IRAnchors, ProfileAnchors, nullptr);
2410
2411 // Run profile matching for checksum mismatched profile, currently only
2412 // support for pseudo-probe.
2414 !ProbeManager->profileIsValid(F, *FSFlattened)) {
2415 // The matching result will be saved to IRToProfileLocationMap, create a new
2416 // map for each function.
2417 auto &IRToProfileLocationMap = getIRToProfileLocationMap(F);
2418 runStaleProfileMatching(F, IRAnchors, ProfileAnchors,
2419 IRToProfileLocationMap);
2420 // Find and update callsite match states after matching.
2422 recordCallsiteMatchStates(F, IRAnchors, ProfileAnchors,
2423 &IRToProfileLocationMap);
2424 }
2425}
2426
2427void SampleProfileMatcher::recordCallsiteMatchStates(
2428 const Function &F, const std::map<LineLocation, StringRef> &IRAnchors,
2429 const std::map<LineLocation, std::unordered_set<FunctionId>>
2430 &ProfileAnchors,
2431 const LocToLocMap *IRToProfileLocationMap) {
2432 bool IsPostMatch = IRToProfileLocationMap != nullptr;
2433 auto &CallsiteMatchStates =
2434 FuncCallsiteMatchStates[FunctionSamples::getCanonicalFnName(F.getName())];
2435
2436 auto MapIRLocToProfileLoc = [&](const LineLocation &IRLoc) {
2437 // IRToProfileLocationMap is null in pre-match phrase.
2438 if (!IRToProfileLocationMap)
2439 return IRLoc;
2440 const auto &ProfileLoc = IRToProfileLocationMap->find(IRLoc);
2441 if (ProfileLoc != IRToProfileLocationMap->end())
2442 return ProfileLoc->second;
2443 else
2444 return IRLoc;
2445 };
2446
2447 for (const auto &I : IRAnchors) {
2448 // After fuzzy profile matching, use the matching result to remap the
2449 // current IR callsite.
2450 const auto &ProfileLoc = MapIRLocToProfileLoc(I.first);
2451 const auto &IRCalleeName = I.second;
2452 const auto &It = ProfileAnchors.find(ProfileLoc);
2453 if (It == ProfileAnchors.end())
2454 continue;
2455 const auto &Callees = It->second;
2456
2457 bool IsCallsiteMatched = false;
2458 // Since indirect call does not have CalleeName, check conservatively if
2459 // callsite in the profile is a callsite location. This is to reduce num of
2460 // false positive since otherwise all the indirect call samples will be
2461 // reported as mismatching.
2462 if (IRCalleeName == SampleProfileMatcher::UnknownIndirectCallee)
2463 IsCallsiteMatched = true;
2464 else if (Callees.size() == 1 && Callees.count(getRepInFormat(IRCalleeName)))
2465 IsCallsiteMatched = true;
2466
2467 if (IsCallsiteMatched) {
2468 auto It = CallsiteMatchStates.find(ProfileLoc);
2469 if (It == CallsiteMatchStates.end())
2470 CallsiteMatchStates.emplace(ProfileLoc, MatchState::InitialMatch);
2471 else if (IsPostMatch) {
2472 if (It->second == MatchState::InitialMatch)
2473 It->second = MatchState::UnchangedMatch;
2474 else if (It->second == MatchState::InitialMismatch)
2475 It->second = MatchState::RecoveredMismatch;
2476 }
2477 }
2478 }
2479
2480 // Check if there are any callsites in the profile that does not match to any
2481 // IR callsites.
2482 for (const auto &I : ProfileAnchors) {
2483 const auto &Loc = I.first;
2484 [[maybe_unused]] const auto &Callees = I.second;
2485 assert(!Callees.empty() && "Callees should not be empty");
2486 auto It = CallsiteMatchStates.find(Loc);
2487 if (It == CallsiteMatchStates.end())
2488 CallsiteMatchStates.emplace(Loc, MatchState::InitialMismatch);
2489 else if (IsPostMatch) {
2490 // Update the state if it's not matched(UnchangedMatch or
2491 // RecoveredMismatch).
2492 if (It->second == MatchState::InitialMismatch)
2493 It->second = MatchState::UnchangedMismatch;
2494 else if (It->second == MatchState::InitialMatch)
2495 It->second = MatchState::RemovedMatch;
2496 }
2497 }
2498}
2499
2500void SampleProfileMatcher::countMismatchedFuncSamples(const FunctionSamples &FS,
2501 bool IsTopLevel) {
2502 const auto *FuncDesc = ProbeManager->getDesc(FS.getGUID());
2503 // Skip the function that is external or renamed.
2504 if (!FuncDesc)
2505 return;
2506
2507 if (ProbeManager->profileIsHashMismatched(*FuncDesc, FS)) {
2508 if (IsTopLevel)
2509 NumStaleProfileFunc++;
2510 // Given currently all probe ids are after block probe ids, once the
2511 // checksum is mismatched, it's likely all the callites are mismatched and
2512 // dropped. We conservatively count all the samples as mismatched and stop
2513 // counting the inlinees' profiles.
2514 MismatchedFunctionSamples += FS.getTotalSamples();
2515 return;
2516 }
2517
2518 // Even the current-level function checksum is matched, it's possible that the
2519 // nested inlinees' checksums are mismatched that affect the inlinee's sample
2520 // loading, we need to go deeper to check the inlinees' function samples.
2521 // Similarly, count all the samples as mismatched if the inlinee's checksum is
2522 // mismatched using this recursive function.
2523 for (const auto &I : FS.getCallsiteSamples())
2524 for (const auto &CS : I.second)
2525 countMismatchedFuncSamples(CS.second, false);
2526}
2527
2528void SampleProfileMatcher::countMismatchedCallsiteSamples(
2529 const FunctionSamples &FS) {
2530 auto It = FuncCallsiteMatchStates.find(FS.getFuncName());
2531 // Skip it if no mismatched callsite or this is an external function.
2532 if (It == FuncCallsiteMatchStates.end() || It->second.empty())
2533 return;
2534 const auto &CallsiteMatchStates = It->second;
2535
2536 auto findMatchState = [&](const LineLocation &Loc) {
2537 auto It = CallsiteMatchStates.find(Loc);
2538 if (It == CallsiteMatchStates.end())
2539 return MatchState::Unknown;
2540 return It->second;
2541 };
2542
2543 auto AttributeMismatchedSamples = [&](const enum MatchState &State,
2544 uint64_t Samples) {
2545 if (isMismatchState(State))
2546 MismatchedCallsiteSamples += Samples;
2547 else if (State == MatchState::RecoveredMismatch)
2548 RecoveredCallsiteSamples += Samples;
2549 };
2550
2551 // The non-inlined callsites are saved in the body samples of function
2552 // profile, go through it to count the non-inlined callsite samples.
2553 for (const auto &I : FS.getBodySamples())
2554 AttributeMismatchedSamples(findMatchState(I.first), I.second.getSamples());
2555
2556 // Count the inlined callsite samples.
2557 for (const auto &I : FS.getCallsiteSamples()) {
2558 auto State = findMatchState(I.first);
2559 uint64_t CallsiteSamples = 0;
2560 for (const auto &CS : I.second)
2561 CallsiteSamples += CS.second.getTotalSamples();
2562 AttributeMismatchedSamples(State, CallsiteSamples);
2563
2564 if (isMismatchState(State))
2565 continue;
2566
2567 // When the current level of inlined call site matches the profiled call
2568 // site, we need to go deeper along the inline tree to count mismatches from
2569 // lower level inlinees.
2570 for (const auto &CS : I.second)
2571 countMismatchedCallsiteSamples(CS.second);
2572 }
2573}
2574
2575void SampleProfileMatcher::countMismatchCallsites(const FunctionSamples &FS) {
2576 auto It = FuncCallsiteMatchStates.find(FS.getFuncName());
2577 // Skip it if no mismatched callsite or this is an external function.
2578 if (It == FuncCallsiteMatchStates.end() || It->second.empty())
2579 return;
2580 const auto &MatchStates = It->second;
2581 [[maybe_unused]] bool OnInitialState =
2582 isInitialState(MatchStates.begin()->second);
2583 for (const auto &I : MatchStates) {
2584 TotalProfiledCallsites++;
2585 assert(
2586 (OnInitialState ? isInitialState(I.second) : isFinalState(I.second)) &&
2587 "Profile matching state is inconsistent");
2588
2589 if (isMismatchState(I.second))
2590 NumMismatchedCallsites++;
2591 else if (I.second == MatchState::RecoveredMismatch)
2592 NumRecoveredCallsites++;
2593 }
2594}
2595
/// Gather the profile staleness counters over the functions of the module that
/// have a profile, print a textual summary to errs(), and attach the counters
/// to the module as an operand of the "llvm.stats" named metadata.
///
/// NOTE(review): several guarding conditions (original lines 2597, 2602, 2606,
/// 2615, 2623-2624, 2644 and 2648-2649) are absent from this listing, so which
/// of the steps below are conditional, and on what, cannot be confirmed here.
2596void SampleProfileMatcher::computeAndReportProfileStaleness() {
// NOTE(review): the condition guarding this early return is not visible.
2598 return;
2599
2600 // Count profile mismatches for profile staleness report.
2601 for (const auto &F : M) {
// NOTE(review): the condition guarding this skip is not visible.
2603 continue;
2604 // As the stats will be merged by linker, skip reporting the metrics for
2605 // imported functions to avoid repeated counting.
2607 continue;
// Functions without a profile in the reader do not take part in the stats.
2608 const auto *FS = Reader.getSamplesFor(F);
2609 if (!FS)
2610 continue;
2611 TotalProfiledFunc++;
2612 TotalFunctionSamples += FS->getTotalSamples();
2613
2614 // Checksum mismatch is only used in pseudo-probe mode.
2616 countMismatchedFuncSamples(*FS, true);
2617
2618 // Count mismatches and samples for callsite.
2619 countMismatchCallsites(*FS);
2620 countMismatchedCallsiteSamples(*FS);
2621 }
2622
// Textual report. The first message covers function hash mismatches, the
// second callsite location mismatches (mismatched plus recovered), and the
// third the share of those that stale profile matching recovered.
2625 errs() << "(" << NumStaleProfileFunc << "/" << TotalProfiledFunc << ")"
2626 << " of functions' profile are invalid and "
2627 << " (" << MismatchedFunctionSamples << "/" << TotalFunctionSamples
2628 << ") of samples are discarded due to function hash mismatch.\n";
2629 }
2630 errs() << "(" << (NumMismatchedCallsites + NumRecoveredCallsites) << "/"
2631 << TotalProfiledCallsites << ")"
2632 << " of callsites' profile are invalid and "
2633 << "(" << (MismatchedCallsiteSamples + RecoveredCallsiteSamples)
2634 << "/" << TotalFunctionSamples << ")"
2635 << " of samples are discarded due to callsite location mismatch.\n";
2636 errs() << "(" << NumRecoveredCallsites << "/"
2637 << (NumRecoveredCallsites + NumMismatchedCallsites) << ")"
2638 << " of callsites and "
2639 << "(" << RecoveredCallsiteSamples << "/"
2640 << (RecoveredCallsiteSamples + MismatchedCallsiteSamples) << ")"
2641 << " of samples are recovered by stale profile matching.\n";
2642 }
2643
// Persist the same counters as (name, value) pairs in module metadata.
2645 LLVMContext &Ctx = M.getContext();
2646 MDBuilder MDB(Ctx);
2647
2650 ProfStatsVec.emplace_back("NumStaleProfileFunc", NumStaleProfileFunc);
2651 ProfStatsVec.emplace_back("TotalProfiledFunc", TotalProfiledFunc);
2652 ProfStatsVec.emplace_back("MismatchedFunctionSamples",
2653 MismatchedFunctionSamples);
2654 ProfStatsVec.emplace_back("TotalFunctionSamples", TotalFunctionSamples);
2655 }
2656
2657 ProfStatsVec.emplace_back("NumMismatchedCallsites", NumMismatchedCallsites);
2658 ProfStatsVec.emplace_back("NumRecoveredCallsites", NumRecoveredCallsites);
2659 ProfStatsVec.emplace_back("TotalProfiledCallsites", TotalProfiledCallsites);
2660 ProfStatsVec.emplace_back("MismatchedCallsiteSamples",
2661 MismatchedCallsiteSamples);
2662 ProfStatsVec.emplace_back("RecoveredCallsiteSamples",
2663 RecoveredCallsiteSamples);
2664
// Build one stats node from the collected pairs and append it to llvm.stats.
2665 auto *MD = MDB.createLLVMStats(ProfStatsVec);
2666 auto *NMD = M.getOrInsertNamedMetadata("llvm.stats");
2667 NMD->addOperand(MD);
2668 }
2669}
2670
/// Module-level driver of the matcher: flattens the reader's profiles into
/// FlattenedProfiles, iterates over the functions of the module, distributes
/// the IR-to-profile location maps, and finally computes and reports the
/// profile staleness metrics.
///
/// NOTE(review): original lines 2673, 2675, 2677 and 2679 are absent from this
/// listing, so the remaining flattenProfile arguments, the per-function skip
/// condition and the per-function work cannot be confirmed here.
2671void SampleProfileMatcher::runOnModule() {
2672 ProfileConverter::flattenProfile(Reader.getProfiles(), FlattenedProfiles,
2674 for (auto &F : M) {
2676 continue;
2678 }
2680 distributeIRToProfileLocationMap();
2681
2682 computeAndReportProfileStaleness();
2683}
2684
2685void SampleProfileMatcher::distributeIRToProfileLocationMap(
2686 FunctionSamples &FS) {
2687 const auto ProfileMappings = FuncMappings.find(FS.getFuncName());
2688 if (ProfileMappings != FuncMappings.end()) {
2689 FS.setIRToProfileLocationMap(&(ProfileMappings->second));
2690 }
2691
2692 for (auto &Inlinees : FS.getCallsiteSamples()) {
2693 for (auto FS : Inlinees.second) {
2694 distributeIRToProfileLocationMap(FS.second);
2695 }
2696 }
2697}
2698
2699// Use a central place to distribute the matching results. Outlined and inlined
2700// profile with the function name will be set to the same pointer.
2701void SampleProfileMatcher::distributeIRToProfileLocationMap() {
2702 for (auto &I : Reader.getProfiles()) {
2703 distributeIRToProfileLocationMap(I.second);
2704 }
2705}
2706
/// Module-level entry of the loader: installs the profile summary when the
/// module has none, sums up the collected samples, builds SymbolMap from the
/// module's symbol table, runs the matching manager, then processes functions
/// in the order given by buildFunctionOrder.
///
/// \param M     Module being annotated.
/// \param AM    Analysis manager forwarded to runOnFunction.
/// \param _PSI  Profile summary info; stored in the PSI member.
/// \param CG    Call graph forwarded to buildFunctionOrder.
/// \returns the OR of the runOnFunction results.
///
/// NOTE(review): original lines 2715, 2730, 2752-2753 and 2766 are absent from
/// this listing (the end of the setProfileSummary call, the definition of
/// NewName, and two guarding conditions), so those parts cannot be confirmed.
2707bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM,
2708 ProfileSummaryInfo *_PSI,
2709 LazyCallGraph &CG) {
2710 GUIDToFuncNameMapper Mapper(M, *Reader, GUIDToFuncNameMap);
2711
2712 PSI = _PSI;
// Only install the reader's summary when the module does not carry one yet.
2713 if (M.getProfileSummary(/* IsCS */ false) == nullptr) {
2714 M.setProfileSummary(Reader->getSummary().getMD(M.getContext()),
2716 PSI->refresh();
2717 }
2718 // Compute the total number of samples collected in this profile.
2719 for (const auto &I : Reader->getProfiles())
2720 TotalCollectedSamples += I.second.getTotalSamples();
2721
2722 auto Remapper = Reader->getRemapper();
2723 // Populate the symbol map.
2724 for (const auto &N_F : M.getValueSymbolTable()) {
2725 StringRef OrigName = N_F.getKey();
2726 Function *F = dyn_cast<Function>(N_F.getValue());
// Skip symbols that are not functions and symbols without a name.
2727 if (F == nullptr || OrigName.empty())
2728 continue;
2729 SymbolMap[FunctionId(OrigName)] = F;
// NOTE(review): the definition of NewName (original line 2730) is not visible.
2731 if (OrigName != NewName && !NewName.empty()) {
2732 auto r = SymbolMap.emplace(FunctionId(NewName), F);
2733 // Failing to insert means there is already an entry in SymbolMap,
2734 // thus there are multiple functions that are mapped to the same
2735 // stripped name. In this case of name conflicting, set the value
2736 // to nullptr to avoid confusion.
2737 if (!r.second)
2738 r.first->second = nullptr;
2739 OrigName = NewName;
2740 }
2741 // Insert the remapped names into SymbolMap.
2742 if (Remapper) {
2743 if (auto MapName = Remapper->lookUpNameInProfile(OrigName)) {
2744 if (*MapName != OrigName && !MapName->empty())
2745 SymbolMap.emplace(FunctionId(*MapName), F);
2746 }
2747 }
2748 }
2749 assert(SymbolMap.count(FunctionId()) == 0 &&
2750 "No empty StringRef should be added in SymbolMap");
2751
// NOTE(review): the condition guarding the matching step is not visible.
2754 MatchingManager->runOnModule();
2755 MatchingManager->clearMatchingData();
2756 }
2757
2758 bool retval = false;
// Per-function state is reset before each function is processed.
2759 for (auto *F : buildFunctionOrder(M, CG)) {
2760 assert(!F->isDeclaration());
2761 clearFunctionData();
2762 retval |= runOnFunction(*F, AM);
2763 }
2764
2765 // Account for cold calls not inlined....
// NOTE(review): the condition guarding this loop is not visible.
2767 for (const std::pair<Function *, NotInlinedProfileInfo> &pair :
2768 notInlinedCallInfo)
2769 updateProfileCallee(pair.first, pair.second.entryCount);
2770
2771 return retval;
2772}
2773
/// Per-function entry of the loader: chooses an initial entry count for \p F,
/// sets up the optimization remark emitter, looks up the samples for \p F and,
/// when non-empty samples exist, emits the annotations.
///
/// \param F   Function to process.
/// \param AM  Analysis manager; when null a locally owned
///            OptimizationRemarkEmitter is created instead.
/// \returns the result of emitAnnotations, or false when \p F has no samples
///          or only empty samples.
///
/// NOTE(review): original lines 2813-2814, 2827, 2829, 2835 and 2842 are
/// absent from this listing (the definition of CanonName, parts of two
/// conditions, and the ORE lookup through the analysis manager), so those
/// parts cannot be confirmed from here.
2774bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) {
2775 LLVM_DEBUG(dbgs() << "\n\nProcessing Function " << F.getName() << "\n");
2776 DILocation2SampleMap.clear();
2777 // By default the entry count is initialized to -1, which will be treated
2778 // conservatively by getEntryCount as the same as unknown (None). This is
2779 // to avoid newly added code to be treated as cold. If we have samples
2780 // this will be overwritten in emitAnnotations.
2781 uint64_t initialEntryCount = -1;
2782
2783 ProfAccForSymsInList = ProfileAccurateForSymsInList && PSL;
2784 if (ProfileSampleAccurate || F.hasFnAttribute("profile-sample-accurate")) {
2785 // initialize all the function entry counts to 0. It means all the
2786 // functions without profile will be regarded as cold.
2787 initialEntryCount = 0;
2788 // profile-sample-accurate is a user assertion which has a higher precedence
2789 // than symbol list. When profile-sample-accurate is on, ignore symbol list.
2790 ProfAccForSymsInList = false;
2791 }
2792 CoverageTracker.setProfAccForSymsInList(ProfAccForSymsInList);
2793
2794 // PSL -- profile symbol list include all the symbols in sampled binary.
2795 // If ProfileAccurateForSymsInList is enabled, PSL is used to treat
2796 // old functions without samples being cold, without having to worry
2797 // about new and hot functions being mistakenly treated as cold.
2798 if (ProfAccForSymsInList) {
2799 // Initialize the entry count to 0 for functions in the list.
2800 if (PSL->contains(F.getName()))
2801 initialEntryCount = 0;
2802
2803 // Function in the symbol list but without sample will be regarded as
2804 // cold. To minimize the potential negative performance impact it could
2805 // have, we want to be a little conservative here saying if a function
2806 // shows up in the profile, no matter as outline function, inline instance
2807 // or call targets, treat the function as not being cold. This will handle
2808 // the cases such as most callsites of a function are inlined in sampled
2809 // binary but not inlined in current build (because of source code drift,
2810 // imprecise debug information, or the callsites are all cold individually
2811 // but not cold accumulatively...), so the outline function showing up as
2812 // cold in sampled binary will actually not be cold after current build.
2815 GUIDsInProfile.count(Function::getGUID(CanonName))) ||
2816 (!FunctionSamples::UseMD5 && NamesInProfile.count(CanonName)))
2817 initialEntryCount = -1;
2818 }
2819
2820 // Initialize entry count when the function has no existing entry
2821 // count value.
2822 if (!F.getEntryCount())
2823 F.setEntryCount(ProfileCount(initialEntryCount, Function::PCT_Real));
// Keeps the locally created remark emitter alive until this function returns.
2824 std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;
2825 if (AM) {
2826 auto &FAM =
2828 .getManager();
2830 } else {
2831 OwnedORE = std::make_unique<OptimizationRemarkEmitter>(&F);
2832 ORE = OwnedORE.get();
2833 }
2834
// NOTE(review): the condition selecting the ContextTracker path is not visible.
2836 Samples = ContextTracker->getBaseSamplesFor(F);
2837 else {
2838 Samples = Reader->getSamplesFor(F);
2839 // Try search in previously inlined functions that were split or duplicated
2840 // into base.
2841 if (!Samples) {
2843 auto It = OutlineFunctionSamples.find(FunctionId(CanonName));
2844 if (It != OutlineFunctionSamples.end()) {
2845 Samples = &It->second;
2846 } else if (auto Remapper = Reader->getRemapper()) {
// Fall back to the remapped name when the canonical name is not found.
2847 if (auto RemppedName = Remapper->lookUpNameInProfile(CanonName)) {
2848 It = OutlineFunctionSamples.find(FunctionId(*RemppedName));
2849 if (It != OutlineFunctionSamples.end())
2850 Samples = &It->second;
2851 }
2852 }
2853 }
2854 }
2855
// Annotate only when samples were found and they are not empty.
2856 if (Samples && !Samples->empty())
2857 return emitAnnotations(F);
2858 return false;
2859}
// NOTE(review): presumably the SampleProfileLoaderPass constructor; its first
// line and one parameter line (original lines 2860 and 2862) are absent from
// this listing. The visible initializers store the profile file name, the
// remapping file name and the LTO phase, and move FS into the member.
2861 std::string File, std::string RemappingFile, ThinOrFullLTOPhase LTOPhase,
2863 : ProfileFileName(File), ProfileRemappingFileName(RemappingFile),
2864 LTOPhase(LTOPhase), FS(std::move(FS)) {}
2865
2870
// NOTE(review): presumably the body of SampleProfileLoaderPass::run; the
// signature and several statements (original lines 2866-2869, 2872, 2875,
// 2878, 2882 and 2893-2894) are absent from this listing.
// The three lambdas below are the per-function accessors handed to the
// SampleProfileLoader constructor; their bodies are not visible here.
2871 auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
2873 };
2874 auto GetTTI = [&](Function &F) -> TargetTransformInfo & {
2876 };
2877 auto GetTLI = [&](Function &F) -> const TargetLibraryInfo & {
2879 };
2880
2881 if (!FS)
2883
// File names given to the pass win; the command-line options are used only
// when the corresponding pass member is empty.
2884 SampleProfileLoader SampleLoader(
2885 ProfileFileName.empty() ? SampleProfileFile : ProfileFileName,
2886 ProfileRemappingFileName.empty() ? SampleProfileRemappingFile
2887 : ProfileRemappingFileName,
2888 LTOPhase, FS, GetAssumptionCache, GetTTI, GetTLI);
2889
// When initialization or the module run reports false, every analysis is
// reported as preserved.
2890 if (!SampleLoader.doInitialization(M, &FAM))
2891 return PreservedAnalyses::all();
2892
2895 if (!SampleLoader.runOnModule(M, &AM, PSI, CG))
2896 return PreservedAnalyses::all();
2897
2898 return PreservedAnalyses::none();
2899}
This file defines the StringMap class.
BlockVerifier::State From
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:693
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
std::string Name
static bool runOnFunction(Function &F, bool PostInlining)
Provides ErrorOr<T> smart pointer.
static cl::opt< unsigned > SizeLimit("eif-limit", cl::init(6), cl::Hidden, cl::desc("Size limit in Hexagon early if-conversion"))
LVReader * CurrentReader
Definition: LVReader.cpp:153
Implements a lazy call graph analysis and related passes for the new pass manager.
Legalize the Machine IR a function s Machine IR
Definition: Legalizer.cpp:81
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
This file implements a map that provides insertion order iteration.
static const Function * getCalledFunction(const Value *V, bool &IsNoBuiltin)
Module.h This file contains the declarations for the Module class.
LLVMContext & Context
FunctionAnalysisManager FAM
This header defines various interfaces for pass management in LLVM.
This file defines the PriorityQueue class.
This file contains the declarations for profiling metadata utility functions.
This builds on the llvm/ADT/GraphTraits.h file to find the strongly connected components (SCCs) of a ...
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file provides the interface for context-sensitive profile tracker used by CSSPGO.
This file provides the interface for the sampled PGO profile loader base implementation.
This file provides the utility functions for the sampled PGO loader base implementation.
This file provides the interface for the pseudo probe implementation for AutoFDO.
static cl::opt< std::string > SampleProfileFile("sample-profile-file", cl::init(""), cl::value_desc("filename"), cl::desc("Profile file loaded by -sample-profile"), cl::Hidden)
static cl::opt< bool > ProfileSampleBlockAccurate("profile-sample-block-accurate", cl::Hidden, cl::init(false), cl::desc("If the sample profile is accurate, we will mark all un-sampled " "branches and calls as having 0 samples. Otherwise, treat " "them conservatively as unknown. "))
static cl::opt< unsigned > MaxNumPromotions("sample-profile-icp-max-prom", cl::init(3), cl::Hidden, cl::desc("Max number of promotions for a single indirect " "call callsite in sample profile loader"))
static cl::opt< ReplayInlinerSettings::Fallback > ProfileInlineReplayFallback("sample-profile-inline-replay-fallback", cl::init(ReplayInlinerSettings::Fallback::Original), cl::values(clEnumValN(ReplayInlinerSettings::Fallback::Original, "Original", "All decisions not in replay send to original advisor (default)"), clEnumValN(ReplayInlinerSettings::Fallback::AlwaysInline, "AlwaysInline", "All decisions not in replay are inlined"), clEnumValN(ReplayInlinerSettings::Fallback::NeverInline, "NeverInline", "All decisions not in replay are not inlined")), cl::desc("How sample profile inline replay treats sites that don't come " "from the replay. Original: defers to original advisor, " "AlwaysInline: inline all sites not in replay, NeverInline: " "inline no sites not in replay"), cl::Hidden)
static cl::opt< bool > OverwriteExistingWeights("overwrite-existing-weights", cl::Hidden, cl::init(false), cl::desc("Ignore existing branch weights on IR and always overwrite."))
static void updateIDTMetaData(Instruction &Inst, const SmallVectorImpl< InstrProfValueData > &CallTargets, uint64_t Sum)
Update indirect call target profile metadata for Inst.
static cl::opt< bool > AnnotateSampleProfileInlinePhase("annotate-sample-profile-inline-phase", cl::Hidden, cl::init(false), cl::desc("Annotate LTO phase (prelink / postlink), or main (no LTO) for " "sample-profile inline pass name."))
static cl::opt< std::string > ProfileInlineReplayFile("sample-profile-inline-replay", cl::init(""), cl::value_desc("filename"), cl::desc("Optimization remarks file containing inline remarks to be replayed " "by inlining from sample profile loader."), cl::Hidden)
static cl::opt< bool > ProfileMergeInlinee("sample-profile-merge-inlinee", cl::Hidden, cl::init(true), cl::desc("Merge past inlinee's profile to outline version if sample " "profile loader decided not to inline a call site. It will " "only be enabled when top-down order of profile loading is " "enabled. "))
static cl::opt< bool > PersistProfileStaleness("persist-profile-staleness", cl::Hidden, cl::init(false), cl::desc("Compute stale profile statistical metrics and write it into the " "native object file(.llvm_stats section)."))
static bool doesHistoryAllowICP(const Instruction &Inst, StringRef Candidate)
Check whether the indirect call promotion history of Inst allows the promotion for Candidate.
static SmallVector< InstrProfValueData, 2 > GetSortedValueDataFromCallTargets(const SampleRecord::CallTargetMap &M)
Returns the sorted CallTargetMap M by count in descending order.
#define CSINLINE_DEBUG
static cl::opt< bool > UseProfiledCallGraph("use-profiled-call-graph", cl::init(true), cl::Hidden, cl::desc("Process functions in a top-down order " "defined by the profiled call graph when " "-sample-profile-top-down-load is on."))
static cl::opt< ReplayInlinerSettings::Scope > ProfileInlineReplayScope("sample-profile-inline-replay-scope", cl::init(ReplayInlinerSettings::Scope::Function), cl::values(clEnumValN(ReplayInlinerSettings::Scope::Function, "Function", "Replay on functions that have remarks associated " "with them (default)"), clEnumValN(ReplayInlinerSettings::Scope::Module, "Module", "Replay on the entire module")), cl::desc("Whether inline replay should be applied to the entire " "Module or just the Functions (default) that are present as " "callers in remarks during sample profile inlining."), cl::Hidden)
static cl::opt< unsigned > ProfileICPRelativeHotness("sample-profile-icp-relative-hotness", cl::Hidden, cl::init(25), cl::desc("Relative hotness percentage threshold for indirect " "call promotion in proirity-based sample profile loader inlining."))
Function::ProfileCount ProfileCount
static cl::opt< unsigned > ProfileICPRelativeHotnessSkip("sample-profile-icp-relative-hotness-skip", cl::Hidden, cl::init(1), cl::desc("Skip relative hotness check for ICP up to given number of targets."))
static cl::opt< bool > ReportProfileStaleness("report-profile-staleness", cl::Hidden, cl::init(false), cl::desc("Compute and report stale profile statistical metrics."))
static cl::opt< bool > UsePreInlinerDecision("sample-profile-use-preinliner", cl::Hidden, cl::desc("Use the preinliner decisions stored in profile context."))
static cl::opt< bool > ProfileAccurateForSymsInList("profile-accurate-for-symsinlist", cl::Hidden, cl::init(true), cl::desc("For symbols in profile symbol list, regard their profiles to " "be accurate. It may be overriden by profile-sample-accurate. "))
#define DEBUG_TYPE
static cl::opt< bool > DisableSampleLoaderInlining("disable-sample-loader-inlining", cl::Hidden, cl::init(false), cl::desc("If true, artifically skip inline transformation in sample-loader " "pass, and merge (or scale) profiles (as configured by " "--sample-profile-merge-inlinee)."))
static cl::opt< bool > ProfileSizeInline("sample-profile-inline-size", cl::Hidden, cl::init(false), cl::desc("Inline cold call sites in profile loader if it's beneficial " "for code size."))
static cl::opt< bool > SalvageStaleProfile("salvage-stale-profile", cl::Hidden, cl::init(false), cl::desc("Salvage stale profile by fuzzy matching and use the remapped " "location for sample profile query."))
static cl::opt< bool > ProfileTopDownLoad("sample-profile-top-down-load", cl::Hidden, cl::init(true), cl::desc("Do profile annotation and inlining for functions in top-down " "order of call graph during sample profile loading. It only " "works for new pass manager. "))
static cl::opt< bool > ProfileSampleAccurate("profile-sample-accurate", cl::Hidden, cl::init(false), cl::desc("If the sample profile is accurate, we will mark all un-sampled " "callsite and function as having 0 samples. Otherwise, treat " "un-sampled callsites and functions conservatively as unknown. "))
static cl::opt< bool > AllowRecursiveInline("sample-profile-recursive-inline", cl::Hidden, cl::desc("Allow sample loader inliner to inline recursive calls."))
static cl::opt< CallSiteFormat::Format > ProfileInlineReplayFormat("sample-profile-inline-replay-format", cl::init(CallSiteFormat::Format::LineColumnDiscriminator), cl::values(clEnumValN(CallSiteFormat::Format::Line, "Line", "<Line Number>"), clEnumValN(CallSiteFormat::Format::LineColumn, "LineColumn", "<Line Number>:<Column Number>"), clEnumValN(CallSiteFormat::Format::LineDiscriminator, "LineDiscriminator", "<Line Number>.<Discriminator>"), clEnumValN(CallSiteFormat::Format::LineColumnDiscriminator, "LineColumnDiscriminator", "<Line Number>:<Column Number>.<Discriminator> (default)")), cl::desc("How sample profile inline replay file is formatted"), cl::Hidden)
static cl::opt< std::string > SampleProfileRemappingFile("sample-profile-remapping-file", cl::init(""), cl::value_desc("filename"), cl::desc("Profile remapping file loaded by -sample-profile"), cl::Hidden)
static bool skipProfileForFunction(const Function &F)
static cl::opt< bool > CallsitePrioritizedInline("sample-profile-prioritized-inline", cl::Hidden, cl::desc("Use call site prioritized inlining for sample profile loader." "Currently only CSSPGO is supported."))
This file provides the interface for the sampled PGO loader pass.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
This pass exposes codegen information to IR-level passes.
Defines the virtual file system interface vfs::FileSystem.
Value * RHS
Value * LHS
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:348
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:500
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:214
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1259
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1481
This class represents a function call, abstracting a target machine's calling convention.
Debug location.
A debug info location.
Definition: DebugLoc.h:33
unsigned getLine() const
Definition: DebugLoc.cpp:24
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&... Args)
Definition: DenseMap.h:235
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:151
iterator end()
Definition: DenseMap.h:84
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
Diagnostic information for the sample profiler.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
Represents either an error or a value T.
Definition: ErrorOr.h:56
Class to represent profile counts.
Definition: Function.h:277
DISubprogram * getSubprogram() const
Get the attached subprogram.
Definition: Metadata.cpp:1828
bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition: Globals.cpp:274
static bool isAvailableExternallyLinkage(LinkageTypes Linkage)
Definition: GlobalValue.h:378
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:655
Represents the cost of inlining a function.
Definition: InlineCost.h:89
static InlineCost getNever(const char *Reason, std::optional< CostBenefitPair > CostBenefit=std::nullopt)
Definition: InlineCost.h:130
static InlineCost getAlways(const char *Reason, std::optional< CostBenefitPair > CostBenefit=std::nullopt)
Definition: InlineCost.h:125
static InlineCost get(int Cost, int Threshold, int StaticBonus=0)
Definition: InlineCost.h:119
This class captures the data input to the InlineFunction call, and records the auxiliary results prod...
Definition: Cloning.h:202
InlineResult is basically true or false.
Definition: InlineCost.h:179
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
Definition: PassManager.h:658
unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:452
bool extractProfTotalWeight(uint64_t &TotalVal) const
Retrieve total raw weight values of a branch.
Definition: Metadata.cpp:1742
const BasicBlock * getParent() const
Definition: Instruction.h:150
BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1633
A smart pointer to a reference-counted object that inherits from RefCountedBase or ThreadSafeRefCount...
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
An analysis pass which computes the call graph for a module.
A node in the call graph.
A RefSCC of the call graph.
An SCC of the call graph.
A lazily constructed view of the call graph of a module.
iterator_range< postorder_ref_scc_iterator > postorder_ref_sccs()
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
VectorType::iterator erase(typename VectorType::iterator Iterator)
Remove the element given by Iterator.
Definition: MapVector.h:193
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: MapVector.h:141
ValueT lookup(const KeyT &Key) const
Definition: MapVector.h:110
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
Diagnostic information for optimization analysis remarks.
Diagnostic information for applied optimization remarks.
PostDominatorTree Class - Concrete subclass of DominatorTree that is used to compute the post-dominat...
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:109
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition: Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:115
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Analysis providing profile information.
Metadata * getMD(LLVMContext &Context, bool AddPartialField=true, bool AddPartialProfileRatioField=true)
Return summary information as metadata.
bool profileIsHashMismatched(const PseudoProbeDescriptor &FuncDesc, const FunctionSamples &Samples) const
bool moduleIsProbed(const Module &M) const
bool profileIsValid(const Function &F, const FunctionSamples &Samples) const
const PseudoProbeDescriptor * getDesc(uint64_t GUID) const
Sample profile inference pass.
void computeDominanceAndLoopInfo(FunctionT &F)
virtual ErrorOr< uint64_t > getInstWeight(const InstructionT &Inst)
Get the weight for an instruction.
virtual const FunctionSamples * findFunctionSamples(const InstructionT &I) const
Get the FunctionSamples for an instruction.
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
SampleProfileLoaderPass(std::string File="", std::string RemappingFile="", ThinOrFullLTOPhase LTOPhase=ThinOrFullLTOPhase::None, IntrusiveRefCntPtr< vfs::FileSystem > FS=nullptr)
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:818
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition: StringMap.h:128
iterator end()
Definition: StringMap.h:221
iterator find(StringRef Key)
Definition: StringMap.h:234
std::pair< iterator, bool > try_emplace(StringRef Key, ArgsTy &&...Args)
Emplace a new element for the specified key into the map if the key isn't already in the map.
Definition: StringMap.h:367
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
constexpr bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:134
iterator begin() const
Definition: StringRef.h:111
StringSet - A wrapper for StringMap that provides set-like functionality.
Definition: StringSet.h:23
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
LLVM Value Representation.
Definition: Value.h:74
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
This class represents a function that is read from a sample profile.
Definition: FunctionId.h:36
Representation of the samples collected for a function.
Definition: SampleProf.h:744
void findInlinedFunctions(DenseSet< GlobalValue::GUID > &S, const HashKeyMap< std::unordered_map, FunctionId, Function * > &SymbolMap, uint64_t Threshold) const
Recursively traverses all children; if the total sample count of the corresponding function is no less than Threshold, adds its corresponding GUID to S.
Definition: SampleProf.h:1036
FunctionId getFunction() const
Return the function name.
Definition: SampleProf.h:1069
static StringRef getCanonicalFnName(const Function &F)
Return the canonical name for a function, taking into account suffix elision policy attributes.
Definition: SampleProf.h:1085
SampleContext & getContext() const
Definition: SampleProf.h:1185
sampleprof_error merge(const FunctionSamples &Other, uint64_t Weight=1)
Merge the samples in Other into this one.
Definition: SampleProf.h:996
static LineLocation getCallSiteIdentifier(const DILocation *DIL, bool ProfileIsFS=false)
Returns a unique call site identifier for a given debug location of a call instruction.
Definition: SampleProf.cpp:221
uint64_t getHeadSamplesEstimate() const
Return an estimate of the sample count of the function entry basic block.
Definition: SampleProf.h:947
uint64_t getGUID() const
Return the GUID of the context's name.
Definition: SampleProf.h:1204
const BodySampleMap & getBodySamples() const
Return all the samples collected in the body of the function.
Definition: SampleProf.h:971
static bool UseMD5
Whether the profile uses MD5 to represent string.
Definition: SampleProf.h:1190
This class is a wrapper to associative container MapT<KeyT, ValueT> using the hash value of the original key as the new key.
Definition: HashKeyMap.h:53
static void flattenProfile(SampleProfileMap &ProfileMap, bool ProfileIsCS=false)
Definition: SampleProf.h:1417
bool hasAttribute(ContextAttributeMask A)
Definition: SampleProf.h:607
This class provides operator overloads to the map container using MD5 as the key type,...
Definition: SampleProf.h:1306
iterator find(const SampleContext &Ctx)
Definition: SampleProf.h:1317
Sample-based profile reader.
SampleProfileMap & getProfiles()
Return all the profiles.
bool profileIsProbeBased() const
Whether input profile is based on pseudo probes.
FunctionSamples * getSamplesFor(const Function &F)
Return the samples collected for function F.
bool profileIsPreInlined() const
Whether input profile contains ShouldBeInlined contexts.
std::error_code read()
The interface to read sample profiles from the associated file.
SampleProfileReaderItaniumRemapper * getRemapper()
ProfileSummary & getSummary() const
Return the profile summary.
virtual std::vector< FunctionId > * getNameTable()
It includes all the names that have samples either in outline instance or inline instance.
bool profileIsCS() const
Whether input profile is fully context-sensitive.
virtual void setSkipFlatProf(bool Skip)
Don't read profile without context if the flag is set.
static ErrorOr< std::unique_ptr< SampleProfileReader > > create(const std::string Filename, LLVMContext &C, vfs::FileSystem &FS, FSDiscriminatorPass P=FSDiscriminatorPass::Base, const std::string RemapFilename="")
Create a sample profile reader appropriate to the file format.
virtual std::unique_ptr< ProfileSymbolList > getProfileSymbolList()
std::unordered_map< FunctionId, uint64_t > CallTargetMap
Definition: SampleProf.h:338
static const SortedCallTargetSet SortCallTargets(const CallTargetMap &Targets)
Sort call targets in descending order of call frequency.
Definition: SampleProf.h:406
static const CallTargetMap adjustCallTargets(const CallTargetMap &Targets, float DistributionFactor)
Prorate call targets by a distribution factor.
Definition: SampleProf.h:415
Enumerate the SCCs of a directed graph in reverse topological order of the SCC DAG.
Definition: SCCIterator.h:49
bool isAtEnd() const
Direct loop termination test which is more efficient than comparison with end().
Definition: SCCIterator.h:113
Sort the nodes of a directed SCC in the decreasing order of the edge weights.
Definition: SCCIterator.h:253
const CustomOperand< const MCSubtargetInfo & > Msg[]
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ FS
Definition: X86.h:206
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to the ValuesClass constructor.
Definition: CommandLine.h:718
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
void checkExpectAnnotations(Instruction &I, const ArrayRef< uint32_t > ExistingWeights, bool IsFrontend)
checkExpectAnnotations - compares PGO counters to the thresholds used for llvm.expect and warns if the PGO counters are outside of the expected range.
Definition: MisExpect.cpp:202
DenseMap< SymbolStringPtr, ExecutorSymbolDef > SymbolMap
A map from symbol names (as SymbolStringPtrs) to JITSymbols (address/flags pairs).
Definition: Core.h:121
DiagnosticInfoOptimizationBase::Argument NV
CallBase & promoteIndirectCall(CallBase &CB, Function *F, uint64_t Count, uint64_t TotalCount, bool AttachProfToDirectCall, OptimizationRemarkEmitter *ORE)
NodeAddr< FuncNode * > Func
Definition: RDFGraph.h:393
static FunctionId getRepInFormat(StringRef Name)
Get the proper representation of a string according to whether the current Format uses MD5 to represent the string.
Definition: SampleProf.h:1292
std::unordered_map< LineLocation, LineLocation, LineLocationHash > LocToLocMap
Definition: SampleProf.h:737
std::map< FunctionId, FunctionSamples > FunctionSamplesMap
Definition: SampleProf.h:734
bool callsiteIsHot(const FunctionSamples *CallsiteFS, ProfileSummaryInfo *PSI, bool ProfAccForSymsInList)
Return true if the given callsite is hot with respect to the hot cutoff threshold.
IntrusiveRefCntPtr< FileSystem > getRealFileSystem()
Gets a vfs::FileSystem for the 'real' file system, as seen by the operating system.
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool getValueProfDataFromInst(const Instruction &Inst, InstrProfValueKind ValueKind, uint32_t MaxNumValueData, InstrProfValueData ValueData[], uint32_t &ActualNumValueData, uint64_t &TotalC, bool GetNoICPValue=false)
Extract the value profile data from Inst which is annotated with value profile meta data.
Definition: InstrProf.cpp:1219
bool isLegalToPromote(const CallBase &CB, Function *Callee, const char **FailureReason=nullptr)
Return true if the given indirect call site can be made to call Callee.
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1689
cl::opt< int > ProfileInlineLimitMin
bool succ_empty(const Instruction *I)
Definition: CFG.h:255
scc_iterator< T > scc_begin(const T &G)
Construct the begin iterator for a deduced graph type T.
Definition: SCCIterator.h:233
void setProbeDistributionFactor(Instruction &Inst, float Factor)
Definition: PseudoProbe.cpp:76
void setBranchWeights(Instruction &I, ArrayRef< uint32_t > Weights)
Create a new branch_weights metadata node and add or overwrite a prof metadata reference to instruction I.
std::string AnnotateInlinePassName(InlineContext IC)
ThinOrFullLTOPhase
This enumerates the LLVM full LTO or ThinLTO optimization phases.
Definition: Pass.h:76
InlineCost getInlineCost(CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI, function_ref< AssumptionCache &(Function &)> GetAssumptionCache, function_ref< const TargetLibraryInfo &(Function &)> GetTLI, function_ref< BlockFrequencyInfo &(Function &)> GetBFI=nullptr, ProfileSummaryInfo *PSI=nullptr, OptimizationRemarkEmitter *ORE=nullptr)
Get an InlineCost object representing the cost of inlining this callsite.
cl::opt< bool > SampleProfileUseProfi
void annotateValueSite(Module &M, Instruction &Inst, const InstrProfRecord &InstrProfR, InstrProfValueKind ValueKind, uint32_t SiteIndx, uint32_t MaxMDCount=3)
Get the value profile data for value site SiteIdx from InstrProfR and annotate the instruction Inst with the value profile metadata.
Definition: InstrProf.cpp:1174
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1656
llvm::cl::opt< bool > UseIterativeBFIInference
std::optional< PseudoProbe > extractProbe(const Instruction &Inst)
Definition: PseudoProbe.cpp:56
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void emitInlinedIntoBasedOnCost(OptimizationRemarkEmitter &ORE, DebugLoc DLoc, const BasicBlock *Block, const Function &Callee, const Function &Caller, const InlineCost &IC, bool ForProfileContext=false, const char *PassName=nullptr)
Emit ORE message based on cost (default heuristic).
std::unique_ptr< InlineAdvisor > getReplayInlineAdvisor(Module &M, FunctionAnalysisManager &FAM, LLVMContext &Context, std::unique_ptr< InlineAdvisor > OriginalAdvisor, const ReplayInlinerSettings &ReplaySettings, bool EmitRemarks, InlineContext IC)
cl::opt< int > SampleHotCallSiteThreshold
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
void updateProfileCallee(Function *Callee, int64_t EntryDelta, const ValueMap< const Value *, WeakTrackingVH > *VMap=nullptr)
Updates profile information by adjusting the entry count by adding EntryDelta then scaling callsite information by the new count divided by the old count.
cl::opt< int > SampleColdCallSiteThreshold
InlineResult InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, bool MergeAttributes=false, AAResults *CalleeAAR=nullptr, bool InsertLifetime=true, Function *ForwardVarArgsTo=nullptr)
This function inlines the called function into the basic block of the caller.
InlineParams getInlineParams()
Generate the parameters to tune the inline cost analysis based only on the commandline options.
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1858
@ DS_Warning
cl::opt< bool > SortProfiledSCC
cl::opt< int > ProfileInlineLimitMax
cl::opt< bool > EnableExtTspBlockPlacement
const uint64_t NOMORE_ICP_MAGICNUM
Magic number in the value profile metadata showing a target has been promoted for the instruction and shouldn't be promoted again.
Definition: Metadata.h:57
cl::opt< int > ProfileInlineGrowthLimit
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
#define N
Used in the streaming interface as the general argument type.
A wrapper of binary function with basic blocks and jumps.
Provides context on when an inline advisor is constructed in the pipeline (e.g., link phase,...
Definition: InlineAdvisor.h:59
Thresholds to tune inline cost analysis.
Definition: InlineCost.h:205
std::optional< bool > AllowRecursiveCall
Indicate whether we allow inlining for recursive call.
Definition: InlineCost.h:238
std::optional< bool > ComputeFullInlineCost
Compute inline cost even when the cost has exceeded the threshold.
Definition: InlineCost.h:232
Represents the relative location of an instruction.
Definition: SampleProf.h:280