LLVM 17.0.0git
SampleProfile.cpp
Go to the documentation of this file.
1//===- SampleProfile.cpp - Incorporate sample profiles into the IR --------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the SampleProfileLoader transformation. This pass
10// reads a profile file generated by a sampling profiler (e.g. Linux Perf -
11// http://perf.wiki.kernel.org/) and generates IR metadata to reflect the
12// profile information in the given profile.
13//
14// This pass generates branch weight annotations on the IR:
15//
16// - prof: Represents branch weights. This annotation is added to branches
17// to indicate the weights of each edge coming out of the branch.
18// The weight of each edge is the weight of the target block for
19// that edge. The weight of a block B is computed as the maximum
20// number of samples found in B.
21//
22//===----------------------------------------------------------------------===//
23
25#include "llvm/ADT/ArrayRef.h"
26#include "llvm/ADT/DenseMap.h"
27#include "llvm/ADT/DenseSet.h"
28#include "llvm/ADT/MapVector.h"
32#include "llvm/ADT/Statistic.h"
33#include "llvm/ADT/StringMap.h"
34#include "llvm/ADT/StringRef.h"
35#include "llvm/ADT/Twine.h"
46#include "llvm/IR/BasicBlock.h"
47#include "llvm/IR/DebugLoc.h"
49#include "llvm/IR/Function.h"
50#include "llvm/IR/GlobalValue.h"
51#include "llvm/IR/InstrTypes.h"
52#include "llvm/IR/Instruction.h"
55#include "llvm/IR/LLVMContext.h"
56#include "llvm/IR/MDBuilder.h"
57#include "llvm/IR/Module.h"
58#include "llvm/IR/PassManager.h"
59#include "llvm/IR/PseudoProbe.h"
62#include "llvm/Pass.h"
68#include "llvm/Support/Debug.h"
72#include "llvm/Transforms/IPO.h"
82#include <algorithm>
83#include <cassert>
84#include <cstdint>
85#include <functional>
86#include <limits>
87#include <map>
88#include <memory>
89#include <queue>
90#include <string>
91#include <system_error>
92#include <utility>
93#include <vector>
94
95using namespace llvm;
96using namespace sampleprof;
97using namespace llvm::sampleprofutil;
99#define DEBUG_TYPE "sample-profile"
100#define CSINLINE_DEBUG DEBUG_TYPE "-inline"
101
102STATISTIC(NumCSInlined,
103 "Number of functions inlined with context sensitive profile");
104STATISTIC(NumCSNotInlined,
105 "Number of functions not inlined with context sensitive profile");
106STATISTIC(NumMismatchedProfile,
107 "Number of functions with CFG mismatched profile");
108STATISTIC(NumMatchedProfile, "Number of functions with CFG matched profile");
109STATISTIC(NumDuplicatedInlinesite,
110 "Number of inlined callsites with a partial distribution factor");
111
112STATISTIC(NumCSInlinedHitMinLimit,
113 "Number of functions with FDO inline stopped due to min size limit");
114STATISTIC(NumCSInlinedHitMaxLimit,
115 "Number of functions with FDO inline stopped due to max size limit");
117 NumCSInlinedHitGrowthLimit,
118 "Number of functions with FDO inline stopped due to growth size limit");
119
120// Command line option to specify the file to read samples from. This is
121// mainly used for debugging.
123 "sample-profile-file", cl::init(""), cl::value_desc("filename"),
124 cl::desc("Profile file loaded by -sample-profile"), cl::Hidden);
125
126// The named file contains a set of transformations that may have been applied
127// to the symbol names between the program from which the sample data was
128// collected and the current program's symbols.
130 "sample-profile-remapping-file", cl::init(""), cl::value_desc("filename"),
131 cl::desc("Profile remapping file loaded by -sample-profile"), cl::Hidden);
132
134 "report-profile-staleness", cl::Hidden, cl::init(false),
135 cl::desc("Compute and report stale profile statistical metrics."));
136
138 "persist-profile-staleness", cl::Hidden, cl::init(false),
139 cl::desc("Compute stale profile statistical metrics and write it into the "
140 "native object file(.llvm_stats section)."));
141
143 "profile-sample-accurate", cl::Hidden, cl::init(false),
144 cl::desc("If the sample profile is accurate, we will mark all un-sampled "
145 "callsite and function as having 0 samples. Otherwise, treat "
146 "un-sampled callsites and functions conservatively as unknown. "));
147
149 "profile-sample-block-accurate", cl::Hidden, cl::init(false),
150 cl::desc("If the sample profile is accurate, we will mark all un-sampled "
151 "branches and calls as having 0 samples. Otherwise, treat "
152 "them conservatively as unknown. "));
153
155 "profile-accurate-for-symsinlist", cl::Hidden, cl::init(true),
156 cl::desc("For symbols in profile symbol list, regard their profiles to "
157 "be accurate. It may be overriden by profile-sample-accurate. "));
158
160 "sample-profile-merge-inlinee", cl::Hidden, cl::init(true),
161 cl::desc("Merge past inlinee's profile to outline version if sample "
162 "profile loader decided not to inline a call site. It will "
163 "only be enabled when top-down order of profile loading is "
164 "enabled. "));
165
167 "sample-profile-top-down-load", cl::Hidden, cl::init(true),
168 cl::desc("Do profile annotation and inlining for functions in top-down "
169 "order of call graph during sample profile loading. It only "
170 "works for new pass manager. "));
171
// Hidden boolean command-line flag `use-profiled-call-graph`, enabled by
// default. Per its description text it makes the top-down processing order
// come from the profiled call graph, and only matters when
// -sample-profile-top-down-load is on.
172static cl::opt<bool>
173 UseProfiledCallGraph("use-profiled-call-graph", cl::init(true), cl::Hidden,
174 cl::desc("Process functions in a top-down order "
175 "defined by the profiled call graph when "
176 "-sample-profile-top-down-load is on."));
177
179 "sample-profile-inline-size", cl::Hidden, cl::init(false),
180 cl::desc("Inline cold call sites in profile loader if it's beneficial "
181 "for code size."));
182
183// Since profiles are consumed by many passes, turning on this option has
184// side effects. For instance, pre-link SCC inliner would see merged profiles
185// and inline the hot functions (that are skipped in this pass).
187 "disable-sample-loader-inlining", cl::Hidden, cl::init(false),
188 cl::desc("If true, artifically skip inline transformation in sample-loader "
189 "pass, and merge (or scale) profiles (as configured by "
190 "--sample-profile-merge-inlinee)."));
191
192namespace llvm {
194 SortProfiledSCC("sort-profiled-scc-member", cl::init(true), cl::Hidden,
195 cl::desc("Sort profiled recursion by edge weights."));
196
198 "sample-profile-inline-growth-limit", cl::Hidden, cl::init(12),
199 cl::desc("The size growth ratio limit for proirity-based sample profile "
200 "loader inlining."));
201
203 "sample-profile-inline-limit-min", cl::Hidden, cl::init(100),
204 cl::desc("The lower bound of size growth limit for "
205 "proirity-based sample profile loader inlining."));
206
208 "sample-profile-inline-limit-max", cl::Hidden, cl::init(10000),
209 cl::desc("The upper bound of size growth limit for "
210 "proirity-based sample profile loader inlining."));
211
213 "sample-profile-hot-inline-threshold", cl::Hidden, cl::init(3000),
214 cl::desc("Hot callsite threshold for proirity-based sample profile loader "
215 "inlining."));
216
218 "sample-profile-cold-inline-threshold", cl::Hidden, cl::init(45),
219 cl::desc("Threshold for inlining cold callsites"));
220} // namespace llvm
221
223 "sample-profile-icp-relative-hotness", cl::Hidden, cl::init(25),
224 cl::desc(
225 "Relative hotness percentage threshold for indirect "
226 "call promotion in proirity-based sample profile loader inlining."));
227
229 "sample-profile-icp-relative-hotness-skip", cl::Hidden, cl::init(1),
230 cl::desc(
231 "Skip relative hotness check for ICP up to given number of targets."));
232
234 "sample-profile-prioritized-inline", cl::Hidden,
235
236 cl::desc("Use call site prioritized inlining for sample profile loader."
237 "Currently only CSSPGO is supported."));
238
240 "sample-profile-use-preinliner", cl::Hidden,
241
242 cl::desc("Use the preinliner decisions stored in profile context."));
243
245 "sample-profile-recursive-inline", cl::Hidden,
246
247 cl::desc("Allow sample loader inliner to inline recursive calls."));
248
250 "sample-profile-inline-replay", cl::init(""), cl::value_desc("filename"),
251 cl::desc(
252 "Optimization remarks file containing inline remarks to be replayed "
253 "by inlining from sample profile loader."),
254 cl::Hidden);
255
257 "sample-profile-inline-replay-scope",
258 cl::init(ReplayInlinerSettings::Scope::Function),
259 cl::values(clEnumValN(ReplayInlinerSettings::Scope::Function, "Function",
260 "Replay on functions that have remarks associated "
261 "with them (default)"),
262 clEnumValN(ReplayInlinerSettings::Scope::Module, "Module",
263 "Replay on the entire module")),
264 cl::desc("Whether inline replay should be applied to the entire "
265 "Module or just the Functions (default) that are present as "
266 "callers in remarks during sample profile inlining."),
267 cl::Hidden);
268
270 "sample-profile-inline-replay-fallback",
271 cl::init(ReplayInlinerSettings::Fallback::Original),
274 ReplayInlinerSettings::Fallback::Original, "Original",
275 "All decisions not in replay send to original advisor (default)"),
276 clEnumValN(ReplayInlinerSettings::Fallback::AlwaysInline,
277 "AlwaysInline", "All decisions not in replay are inlined"),
278 clEnumValN(ReplayInlinerSettings::Fallback::NeverInline, "NeverInline",
279 "All decisions not in replay are not inlined")),
280 cl::desc("How sample profile inline replay treats sites that don't come "
281 "from the replay. Original: defers to original advisor, "
282 "AlwaysInline: inline all sites not in replay, NeverInline: "
283 "inline no sites not in replay"),
284 cl::Hidden);
285
287 "sample-profile-inline-replay-format",
288 cl::init(CallSiteFormat::Format::LineColumnDiscriminator),
290 clEnumValN(CallSiteFormat::Format::Line, "Line", "<Line Number>"),
291 clEnumValN(CallSiteFormat::Format::LineColumn, "LineColumn",
292 "<Line Number>:<Column Number>"),
293 clEnumValN(CallSiteFormat::Format::LineDiscriminator,
294 "LineDiscriminator", "<Line Number>.<Discriminator>"),
295 clEnumValN(CallSiteFormat::Format::LineColumnDiscriminator,
296 "LineColumnDiscriminator",
297 "<Line Number>:<Column Number>.<Discriminator> (default)")),
298 cl::desc("How sample profile inline replay file is formatted"), cl::Hidden);
299
301 MaxNumPromotions("sample-profile-icp-max-prom", cl::init(3), cl::Hidden,
302 cl::desc("Max number of promotions for a single indirect "
303 "call callsite in sample profile loader"));
304
306 "overwrite-existing-weights", cl::Hidden, cl::init(false),
307 cl::desc("Ignore existing branch weights on IR and always overwrite."));
308
310 "annotate-sample-profile-inline-phase", cl::Hidden, cl::init(false),
311 cl::desc("Annotate LTO phase (prelink / postlink), or main (no LTO) for "
312 "sample-profile inline pass name."));
313
314namespace llvm {
316}
317
318namespace {
319
320using BlockWeightMap = DenseMap<const BasicBlock *, uint64_t>;
321using EquivalenceClassMap = DenseMap<const BasicBlock *, const BasicBlock *>;
322using Edge = std::pair<const BasicBlock *, const BasicBlock *>;
323using EdgeWeightMap = DenseMap<Edge, uint64_t>;
324using BlockEdgeMap =
326
// Scoped helper that builds a GUID -> function-name map for a module and
// attaches it to every FunctionSamples reachable from the reader's profiles.
//
// Both the constructor and the destructor do nothing unless the reader
// reports useMD5(). The constructor fills the caller-owned map and publishes
// a pointer to it on every FunctionSamples; the destructor clears the map and
// resets those pointers to nullptr.
327class GUIDToFuncNameMapper {
328public:
 // \param M                 Module whose functions are entered into the map.
 // \param Reader            Profile reader; queried for useMD5() and for the
 //                          profiles to update.
 // \param GUIDToFuncNameMap Caller-owned map that is filled here and cleared
 //                          again by the destructor.
329 GUIDToFuncNameMapper(Module &M, SampleProfileReader &Reader,
330 DenseMap<uint64_t, StringRef> &GUIDToFuncNameMap)
331 : CurrentReader(Reader), CurrentModule(M),
332 CurrentGUIDToFuncNameMap(GUIDToFuncNameMap) {
333 if (!CurrentReader.useMD5())
334 return;
335
336 for (const auto &F : CurrentModule) {
337 StringRef OrigName = F.getName();
338 CurrentGUIDToFuncNameMap.insert(
339 {Function::getGUID(OrigName), OrigName});
340
341 // Local to global var promotion used by optimization like thinlto
342 // will rename the var and add suffix like ".llvm.xxx" to the
343 // original local name. In sample profile, the suffixes of function
344 // names are all stripped. Since it is possible that the mapper is
345 // built in post-thin-link phase and var promotion has been done,
346 // we need to add the substring of function name without the suffix
347 // into the GUIDToFuncNameMap.
 // NOTE(review): the statement that declares CanonName is not visible in
 // this listing; presumably it is the suffix-stripped form of OrigName --
 // confirm against the full file.
349 if (CanonName != OrigName)
350 CurrentGUIDToFuncNameMap.insert(
351 {Function::getGUID(CanonName), CanonName});
352 }
353
354 // Update GUIDToFuncNameMap for each function including inlinees.
355 SetGUIDToFuncNameMapForAll(&CurrentGUIDToFuncNameMap);
356 }
357
358 ~GUIDToFuncNameMapper() {
359 if (!CurrentReader.useMD5())
360 return;
361
362 CurrentGUIDToFuncNameMap.clear();
363
364 // Reset GUIDToFuncNameMap of each function as they're no
365 // longer valid at this point.
366 SetGUIDToFuncNameMapForAll(nullptr);
367 }
368
369private:
 // Stores \p Map into the GUIDToFuncNameMap field of every top-level profile
 // returned by the reader and of every FunctionSamples nested in their
 // callsite samples, at any depth. A FIFO work queue is used instead of
 // recursion.
370 void SetGUIDToFuncNameMapForAll(DenseMap<uint64_t, StringRef> *Map) {
371 std::queue<FunctionSamples *> FSToUpdate;
372 for (auto &IFS : CurrentReader.getProfiles()) {
373 FSToUpdate.push(&IFS.second);
374 }
375
376 while (!FSToUpdate.empty()) {
377 FunctionSamples *FS = FSToUpdate.front();
378 FSToUpdate.pop();
379 FS->GUIDToFuncNameMap = Map;
380 for (const auto &ICS : FS->getCallsiteSamples()) {
381 const FunctionSamplesMap &FSMap = ICS.second;
382 for (const auto &IFS : FSMap) {
 // The nested samples are reached through const references here, so
 // constness is cast away to queue them for the same update. This inner
 // FS intentionally shadows the outer pointer of the same name.
383 FunctionSamples &FS = const_cast<FunctionSamples &>(IFS.second);
384 FSToUpdate.push(&FS);
385 }
386 }
387 }
388 }
389
 // NOTE(review): the declaration of the CurrentReader member used above is
 // not visible in this listing.
391 Module &CurrentModule;
392 DenseMap<uint64_t, StringRef> &CurrentGUIDToFuncNameMap;
393};
394
395// Inline candidate used by iterative callsite prioritized inliner
396struct InlineCandidate {
397 CallBase *CallInstr;
398 const FunctionSamples *CalleeSamples;
399 // Prorated callsite count, which will be used to guide inlining. For example,
400 // if a callsite is duplicated in LTO prelink, then in LTO postlink the two
401 // copies will get their own distribution factors and their prorated counts
402 // will be used to decide if they should be inlined independently.
403 uint64_t CallsiteCount;
404 // Call site distribution factor to prorate the profile samples for a
405 // duplicated callsite. Default value is 1.0.
406 float CallsiteDistribution;
407};
408
409// Inline candidate comparer using call site weight
410struct CandidateComparer {
411 bool operator()(const InlineCandidate &LHS, const InlineCandidate &RHS) {
412 if (LHS.CallsiteCount != RHS.CallsiteCount)
413 return LHS.CallsiteCount < RHS.CallsiteCount;
414
415 const FunctionSamples *LCS = LHS.CalleeSamples;
416 const FunctionSamples *RCS = RHS.CalleeSamples;
417 assert(LCS && RCS && "Expect non-null FunctionSamples");
418
419 // Tie breaker using number of samples try to favor smaller functions first
420 if (LCS->getBodySamples().size() != RCS->getBodySamples().size())
421 return LCS->getBodySamples().size() > RCS->getBodySamples().size();
422
423 // Tie breaker using GUID so we have stable/deterministic inlining order
424 return LCS->getGUID(LCS->getName()) < RCS->getGUID(RCS->getName());
425 }
426};
427
428using CandidateQueue =
430 CandidateComparer>;
431
432// Sample profile matching - fuzzy match.
433class SampleProfileMatcher {
434 Module &M;
435 SampleProfileReader &Reader;
436 const PseudoProbeManager *ProbeManager;
437
438 // Profile mismatching statstics.
439 uint64_t TotalProfiledCallsites = 0;
440 uint64_t NumMismatchedCallsites = 0;
441 uint64_t MismatchedCallsiteSamples = 0;
442 uint64_t TotalCallsiteSamples = 0;
443 uint64_t TotalProfiledFunc = 0;
444 uint64_t NumMismatchedFuncHash = 0;
445 uint64_t MismatchedFuncHashSamples = 0;
446 uint64_t TotalFuncHashSamples = 0;
447
448public:
449 SampleProfileMatcher(Module &M, SampleProfileReader &Reader,
450 const PseudoProbeManager *ProbeManager)
451 : M(M), Reader(Reader), ProbeManager(ProbeManager) {}
452 void detectProfileMismatch();
453 void detectProfileMismatch(const Function &F, const FunctionSamples &FS);
454};
455
456/// Sample profile pass.
457///
458/// This pass reads profile data from the file specified by
459/// -sample-profile-file and annotates every affected function with the
460/// profile information found in that file.
///
/// NOTE(review): several declaration lines of this class are missing from
/// this listing (gaps in the embedded line numbers); the comments added below
/// describe only what is visible.
461class SampleProfileLoader final
462 : public SampleProfileLoaderBaseImpl<BasicBlock> {
463public:
 // Stores the three analysis getter callbacks, the LTO phase and the pass
 // name used for remarks. NOTE(review): part of the parameter list and of the
 // initializer list is not visible here (for example the declaration of the
 // FS argument that is moved below).
464 SampleProfileLoader(
465 StringRef Name, StringRef RemapName, ThinOrFullLTOPhase LTOPhase,
467 std::function<AssumptionCache &(Function &)> GetAssumptionCache,
468 std::function<TargetTransformInfo &(Function &)> GetTargetTransformInfo,
469 std::function<const TargetLibraryInfo &(Function &)> GetTLI)
471 std::move(FS)),
472 GetAC(std::move(GetAssumptionCache)),
473 GetTTI(std::move(GetTargetTransformInfo)), GetTLI(std::move(GetTLI)),
474 LTOPhase(LTOPhase),
475 AnnotatedPassName(AnnotateSampleProfileInlinePhase
478 : CSINLINE_DEBUG) {}
479
480 bool doInitialization(Module &M, FunctionAnalysisManager *FAM = nullptr);
 // NOTE(review): the remaining parameters of runOnModule are not visible in
 // this listing.
481 bool runOnModule(Module &M, ModuleAnalysisManager *AM,
483
484protected:
486 bool emitAnnotations(Function &F);
 // Weight lookup for pseudo-probe instructions; defined later in this file.
488 ErrorOr<uint64_t> getProbeWeight(const Instruction &I);
489 const FunctionSamples *findCalleeFunctionSamples(const CallBase &I) const;
490 const FunctionSamples *
491 findFunctionSamples(const Instruction &I) const override;
 // Returns the indirect call target samples for \p I and stores the total
 // call count in \p Sum (see the definition later in this file).
492 std::vector<const FunctionSamples *>
493 findIndirectCallFunctionSamples(const Instruction &I, uint64_t &Sum) const;
494 void findExternalInlineCandidate(CallBase *CB, const FunctionSamples *Samples,
495 DenseSet<GlobalValue::GUID> &InlinedGUIDs,
496 const StringMap<Function *> &SymbolMap,
497 uint64_t Threshold);
498 // Attempt to promote indirect call and also inline the promoted call
499 bool tryPromoteAndInlineCandidate(
500 Function &F, InlineCandidate &Candidate, uint64_t SumOrigin,
501 uint64_t &Sum, SmallVector<CallBase *, 8> *InlinedCallSites = nullptr);
502
503 bool inlineHotFunctions(Function &F,
504 DenseSet<GlobalValue::GUID> &InlinedGUIDs);
505 std::optional<InlineCost> getExternalInlineAdvisorCost(CallBase &CB);
506 bool getExternalInlineAdvisorShouldInline(CallBase &CB);
507 InlineCost shouldInlineCandidate(InlineCandidate &Candidate);
508 bool getInlineCandidate(InlineCandidate *NewCandidate, CallBase *CB);
509 bool
510 tryInlineCandidate(InlineCandidate &Candidate,
511 SmallVector<CallBase *, 8> *InlinedCallSites = nullptr);
512 bool
513 inlineHotFunctionsWithPriority(Function &F,
514 DenseSet<GlobalValue::GUID> &InlinedGUIDs);
515 // Inline cold/small functions in addition to hot ones
516 bool shouldInlineColdCallee(CallBase &CallInst);
517 void emitOptimizationRemarksForInlineCandidates(
518 const SmallVectorImpl<CallBase *> &Candidates, const Function &F,
519 bool Hot);
 // NOTE(review): the first parameter line of this declaration is not visible
 // in this listing.
520 void promoteMergeNotInlinedContextSamples(
522 const Function &F);
523 std::vector<Function *> buildFunctionOrder(Module &M, LazyCallGraph &CG);
524 std::unique_ptr<ProfiledCallGraph> buildProfiledCallGraph(Module &M);
525 void generateMDProfMetadata(Function &F);
526
527 /// Map from function name to Function *. Used to find the function from
528 /// the function name. If the function name contains suffix, additional
529 /// entry is added to map from the stripped name to the function if there
530 /// is one-to-one mapping.
 // NOTE(review): the member declaration this comment documents is not
 // visible in this listing.
532
 // Analysis getter callbacks supplied to the constructor.
533 std::function<AssumptionCache &(Function &)> GetAC;
534 std::function<TargetTransformInfo &(Function &)> GetTTI;
535 std::function<const TargetLibraryInfo &(Function &)> GetTLI;
536
537 /// Profile tracker for different context.
538 std::unique_ptr<SampleContextTracker> ContextTracker;
539
540 /// Flag indicating which LTO/ThinLTO phase the pass is invoked in.
541 ///
542 /// We need to know the LTO phase because for example in ThinLTOPrelink
543 /// phase, in annotation, we should not promote indirect calls. Instead,
544 /// we will mark GUIDs that needs to be annotated to the function.
545 const ThinOrFullLTOPhase LTOPhase;
 // Pass name returned by getAnnotatedRemarkPassName(); set once in the
 // constructor.
546 const std::string AnnotatedPassName;
547
548 /// Profile Symbol list tells whether a function name appears in the binary
549 /// used to generate the current profile.
550 std::unique_ptr<ProfileSymbolList> PSL;
551
552 /// Total number of samples collected in this profile.
553 ///
554 /// This is the sum of all the samples collected in all the functions executed
555 /// at runtime.
556 uint64_t TotalCollectedSamples = 0;
557
558 // Information recorded when we declined to inline a call site
559 // because we have determined it is too cold is accumulated for
560 // each callee function. Initially this is just the entry count.
561 struct NotInlinedProfileInfo {
562 uint64_t entryCount;
563 };
 // NOTE(review): the member that stores NotInlinedProfileInfo records is not
 // visible in this listing.
565
566 // GUIDToFuncNameMap saves the mapping from GUID to the symbol name, for
567 // all the function symbols defined or declared in current module.
568 DenseMap<uint64_t, StringRef> GUIDToFuncNameMap;
569
570 // All the Names used in FunctionSamples including outline function
571 // names, inline instance names and call target names.
572 StringSet<> NamesInProfile;
573
574 // For symbol in profile symbol list, whether to regard their profiles
575 // to be accurate. It is mainly decided by existence of profile symbol
576 // list and -profile-accurate-for-symsinlist flag, but it can be
577 // overridden by -profile-sample-accurate or profile-sample-accurate
578 // attribute.
 // NOTE(review): no in-class initializer; presumably assigned during
 // initialization before it is read -- confirm.
579 bool ProfAccForSymsInList;
580
581 // External inline advisor used to replay inline decision from remarks.
582 std::unique_ptr<InlineAdvisor> ExternalInlineAdvisor;
583
584 // A pseudo probe helper to correlate the imported sample counts.
585 std::unique_ptr<PseudoProbeManager> ProbeManager;
586
587 // A helper to implement the sample profile matching algorithm.
588 std::unique_ptr<SampleProfileMatcher> MatchingManager;
589
590private:
 // Returns the remark pass name as a C string. The pointer refers to the
 // storage of AnnotatedPassName and is valid only while this object lives.
591 const char *getAnnotatedRemarkPassName() const {
592 return AnnotatedPassName.c_str();
593 }
594};
595} // end anonymous namespace
596
/// Get the sample weight of \p Inst.
///
/// \returns an error code when \p Inst has no debug location or is a branch,
/// intrinsic or PHI instruction; 0 for a direct call for which
/// findCalleeFunctionSamples() finds a profile; otherwise whatever
/// getInstWeightImpl() returns.
597ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) {
 // NOTE(review): the condition guarding this early return is not visible in
 // this listing; presumably it selects the pseudo-probe based path -- confirm.
599 return getProbeWeight(Inst);
600
601 const DebugLoc &DLoc = Inst.getDebugLoc();
602 if (!DLoc)
603 return std::error_code();
604
605 // Ignore all intrinsics, phinodes and branch instructions.
606 // Branch and phinodes instruction usually contains debug info from sources
607 // outside of the residing basic block, thus we ignore them during annotation.
608 if (isa<BranchInst>(Inst) || isa<IntrinsicInst>(Inst) || isa<PHINode>(Inst))
609 return std::error_code();
610
611 // For non-CS profile, if a direct call/invoke instruction is inlined in
612 // profile (findCalleeFunctionSamples returns non-empty result), but not
613 // inlined here, it means that the inlined callsite has no sample, thus the
614 // call instruction should have 0 count.
615 // For CS profile, the callsite count of previously inlined callees is
616 // populated with the entry count of the callees.
 // NOTE(review): the line between this comment and the check below is not
 // visible in this listing; per the comment above it presumably restricts the
 // check to non-CS profiles -- confirm.
618 if (const auto *CB = dyn_cast<CallBase>(&Inst))
619 if (!CB->isIndirectCall() && findCalleeFunctionSamples(*CB))
620 return 0;
621
622 return getInstWeightImpl(Inst);
623}
624
// Get the sample weight of \p Inst from its pseudo probe.
//
// An error_code result is used to represent: 1) a dangling probe, and 2) a
// non-probe instruction whose weight is ignored. So if all instructions of
// the BB give error_code, the inference algorithm is told to infer the BB
// weight.
//
// Returns 0 when no FunctionSamples is found for the instruction, or when it
// is a direct call for which findCalleeFunctionSamples() finds a profile;
// otherwise the sample count recorded for the probe id, scaled by the probe
// factor (or the lookup error if no such record exists).
628ErrorOr<uint64_t> SampleProfileLoader::getProbeWeight(const Instruction &Inst) {
 // NOTE(review): the opening line of this statement is not visible in this
 // listing; judging by the message it is an assertion that the profile is
 // pseudo-probe based -- confirm.
630 "Profile is not pseudo probe based");
631 std::optional<PseudoProbe> Probe = extractProbe(Inst);
632 // Ignore the non-probe instruction. If none of the instruction in the BB is
633 // probe, we choose to infer the BB's weight.
634 if (!Probe)
635 return std::error_code();
636
637 const FunctionSamples *FS = findFunctionSamples(Inst);
638 // If none of the instruction has FunctionSample, we choose to return zero
639 // value sample to indicate the BB is cold. This could happen when the
640 // instruction is from inlinee and no profile data is found.
641 // FIXME: This should not be affected by the source drift issue as 1) if the
642 // newly added function is top-level inliner, it won't match the CFG checksum
643 // in the function profile or 2) if it's the inlinee, the inlinee should have
644 // a profile, otherwise it wouldn't be inlined. For non-probe based profile,
645 // we can improve it by adding a switch for profile-sample-block-accurate for
646 // block level counts in the future.
647 if (!FS)
648 return 0;
649
650 // For non-CS profile, If a direct call/invoke instruction is inlined in
651 // profile (findCalleeFunctionSamples returns non-empty result), but not
652 // inlined here, it means that the inlined callsite has no sample, thus the
653 // call instruction should have 0 count.
654 // For CS profile, the callsite count of previously inlined callees is
655 // populated with the entry count of the callees.
 // NOTE(review): the line between this comment and the check below is not
 // visible in this listing; per the comment above it presumably restricts the
 // check to non-CS profiles -- confirm.
657 if (const auto *CB = dyn_cast<CallBase>(&Inst))
658 if (!CB->isIndirectCall() && findCalleeFunctionSamples(*CB))
659 return 0;
660
 // Look up the samples recorded for this probe id (second argument fixed to
 // 0 here).
661 const ErrorOr<uint64_t> &R = FS->findSamplesAt(Probe->Id, 0);
662 if (R) {
 // Scale the raw count by the probe factor. The factor is printed with
 // "%0.2f" below, so it is presumably floating point and the product is
 // converted back to uint64_t here -- confirm.
663 uint64_t Samples = R.get() * Probe->Factor;
 // Only emit the analysis remark the first time markSamplesUsed() reports
 // this (FS, probe id) record as newly marked.
664 bool FirstMark = CoverageTracker.markSamplesUsed(FS, Probe->Id, 0, Samples);
665 if (FirstMark) {
666 ORE->emit([&]() {
667 OptimizationRemarkAnalysis Remark(DEBUG_TYPE, "AppliedSamples", &Inst);
668 Remark << "Applied " << ore::NV("NumSamples", Samples);
669 Remark << " samples from profile (ProbeId=";
670 Remark << ore::NV("ProbeId", Probe->Id);
671 Remark << ", Factor=";
672 Remark << ore::NV("Factor", Probe->Factor);
673 Remark << ", OriginalSamples=";
674 Remark << ore::NV("OriginalSamples", R.get());
675 Remark << ")";
676 return Remark;
677 });
678 }
 // Debug trace prints the unscaled weight (R.get()) next to the factor.
679 LLVM_DEBUG(dbgs() << " " << Probe->Id << ":" << Inst
680 << " - weight: " << R.get() << " - factor: "
681 << format("%0.2f", Probe->Factor) << ")\n");
682 return Samples;
683 }
 // No sample record for this probe: propagate the lookup error.
684 return R;
685}
686
687/// Get the FunctionSamples for a call instruction.
688///
689/// The FunctionSamples of a call/invoke instruction \p Inst is the inlined
690/// instance in which that call instruction is calling to. It contains
691/// all samples that reside in the inlined instance. We first find the
692/// inlined instance in which the call instruction is from, then we
693/// traverse its children to find the callsite with the matching
694/// location.
695///
696/// \param Inst Call/Invoke instruction to query.
697///
698/// \returns The FunctionSamples pointer to the inlined instance, or nullptr
/// when \p Inst has no debug location or no FunctionSamples is found for it.
699const FunctionSamples *
700SampleProfileLoader::findCalleeFunctionSamples(const CallBase &Inst) const {
701 const DILocation *DIL = Inst.getDebugLoc();
702 if (!DIL) {
703 return nullptr;
704 }
705
 // CalleeName stays empty when the call has no statically known callee.
706 StringRef CalleeName;
707 if (Function *Callee = Inst.getCalledFunction())
708 CalleeName = Callee->getName();
709
 // NOTE(review): the condition guarding this return is not visible in this
 // listing; presumably it selects the context-sensitive profile path --
 // confirm.
711 return ContextTracker->getCalleeContextSamplesFor(Inst, CalleeName);
712
713 const FunctionSamples *FS = findFunctionSamples(Inst);
714 if (FS == nullptr)
715 return nullptr;
716
 // Look the callee up among the samples of the enclosing instance, keyed by
 // the call site identifier derived from the debug location.
717 return FS->findFunctionSamplesAt(FunctionSamples::getCallSiteIdentifier(DIL),
718 CalleeName, Reader->getRemapper());
719}
720
721/// Returns a vector of FunctionSamples that are the indirect call targets
722/// of \p Inst. The vector is sorted by head sample estimate in descending
/// order, with ties broken by ascending GUID of the function name. Stores
723/// the total call count of the indirect call in \p Sum.
///
/// \p Sum is left untouched on the early returns taken before it is reset
/// (no debug location, no context callee samples, or no FunctionSamples for
/// \p Inst).
724std::vector<const FunctionSamples *>
725SampleProfileLoader::findIndirectCallFunctionSamples(
726 const Instruction &Inst, uint64_t &Sum) const {
727 const DILocation *DIL = Inst.getDebugLoc();
728 std::vector<const FunctionSamples *> R;
729
730 if (!DIL) {
731 return R;
732 }
733
 // Ordering used for the result. The lambda captures nothing, so its
 // parameter R does not refer to the result vector R declared above.
734 auto FSCompare = [](const FunctionSamples *L, const FunctionSamples *R) {
735 assert(L && R && "Expect non-null FunctionSamples");
736 if (L->getHeadSamplesEstimate() != R->getHeadSamplesEstimate())
737 return L->getHeadSamplesEstimate() > R->getHeadSamplesEstimate();
738 return FunctionSamples::getGUID(L->getName()) <
739 FunctionSamples::getGUID(R->getName());
740 };
741
 // NOTE(review): the opening line of the block closed further down is not
 // visible in this listing; presumably it tests for a context-sensitive
 // profile -- confirm.
743 auto CalleeSamples =
744 ContextTracker->getIndirectCalleeContextSamplesFor(DIL);
745 if (CalleeSamples.empty())
746 return R;
747
748 // For CSSPGO, we only use target context profile's entry count
749 // as that already includes both inlined callee and non-inlined ones..
750 Sum = 0;
751 for (const auto *const FS : CalleeSamples) {
752 Sum += FS->getHeadSamplesEstimate();
753 R.push_back(FS);
754 }
755 llvm::sort(R, FSCompare);
756 return R;
757 }
758
759 const FunctionSamples *FS = findFunctionSamples(Inst);
760 if (FS == nullptr)
761 return R;
762
 // NOTE(review): the declaration of CallSite is not visible in this listing.
764 auto T = FS->findCallTargetMapAt(CallSite);
765 Sum = 0;
 // First add the counts of the call targets recorded at this call site ...
766 if (T)
767 for (const auto &T_C : T.get())
768 Sum += T_C.second;
 // ... then add the head sample estimate of every FunctionSamples recorded at
 // the call site; only these entries are returned in R.
769 if (const FunctionSamplesMap *M = FS->findFunctionSamplesMapAt(CallSite)) {
770 if (M->empty())
771 return R;
772 for (const auto &NameFS : *M) {
773 Sum += NameFS.second.getHeadSamplesEstimate();
774 R.push_back(&NameFS.second);
775 }
776 llvm::sort(R, FSCompare);
777 }
778 return R;
779}
780
/// Get the FunctionSamples that \p Inst belongs to.
///
/// \returns the member Samples when \p Inst has no debug location; otherwise
/// the entry cached in DILocation2SampleMap for the instruction's DILocation,
/// which is computed and stored on first lookup. nullptr is returned from the
/// probe check at the top when no pseudo probe can be extracted from \p Inst.
781const FunctionSamples *
782SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {
 // NOTE(review): the opening line of the block closed below is not visible
 // in this listing; presumably it tests for a probe-based profile -- confirm.
784 std::optional<PseudoProbe> Probe = extractProbe(Inst);
785 if (!Probe)
786 return nullptr;
787 }
788
789 const DILocation *DIL = Inst.getDebugLoc();
790 if (!DIL)
791 return Samples;
792
 // try_emplace inserts a nullptr placeholder only when DIL is not cached yet;
 // it.second tells whether the insertion happened.
793 auto it = DILocation2SampleMap.try_emplace(DIL,nullptr);
794 if (it.second) {
 // NOTE(review): the condition choosing between the two lookups below is not
 // visible in this listing; presumably it tests for a context-sensitive
 // profile -- confirm.
796 it.first->second = ContextTracker->getContextSamplesFor(DIL);
797 else
798 it.first->second =
799 Samples->findFunctionSamples(DIL, Reader->getRemapper());
800 }
801 return it.first->second;
802}
803
804/// Check whether the indirect call promotion history of \p Inst allows
805/// the promotion for \p Candidate.
806/// If the profile count for the promotion candidate \p Candidate is
807/// NOMORE_ICP_MAGICNUM, it means \p Candidate has already been promoted
808/// for \p Inst. If we already have at least MaxNumPromotions
809/// NOMORE_ICP_MAGICNUM count values in the value profile of \p Inst, we
810/// cannot promote for \p Inst anymore.
811static bool doesHistoryAllowICP(const Instruction &Inst, StringRef Candidate) {
812 uint32_t NumVals = 0;
813 uint64_t TotalCount = 0;
814 std::unique_ptr<InstrProfValueData[]> ValueData =
815 std::make_unique<InstrProfValueData[]>(MaxNumPromotions);
816 bool Valid =
817 getValueProfDataFromInst(Inst, IPVK_IndirectCallTarget, MaxNumPromotions,
818 ValueData.get(), NumVals, TotalCount, true);
819 // No valid value profile so no promoted targets have been recorded
820 // before. Ok to do ICP.
821 if (!Valid)
822 return true;
823
824 unsigned NumPromoted = 0;
825 for (uint32_t I = 0; I < NumVals; I++) {
826 if (ValueData[I].Count != NOMORE_ICP_MAGICNUM)
827 continue;
828
829 // If the promotion candidate has NOMORE_ICP_MAGICNUM count in the
830 // metadata, it means the candidate has been promoted for this
831 // indirect call.
832 if (ValueData[I].Value == Function::getGUID(Candidate))
833 return false;
834 NumPromoted++;
835 // If already have MaxNumPromotions promotion, don't do it anymore.
836 if (NumPromoted == MaxNumPromotions)
837 return false;
838 }
839 return true;
840}
841
842 /// Update indirect call target profile metadata for \p Inst.
843 /// Usually \p Sum is the sum of counts of all the targets for \p Inst.
844 /// If it is 0, it means updateIDTMetaData is used to mark a
845 /// certain target to be promoted already. If it is not zero,
846 /// we expect to use it to update the total count in the value profile.
// NOTE(review): listing lines 848, 909 and 924 are absent from this extract
// (presumably the function name with its first parameter, the declaration of
// NewCallTargets, and the head of the final call). The comments below only
// describe what is visible here -- confirm against the real source.
847 static void
849 const SmallVectorImpl<InstrProfValueData> &CallTargets,
850 uint64_t Sum) {
851 // Bail out early if MaxNumPromotions is zero.
852 // This prevents allocating an array of zero length below.
853 //
854 // Note `updateIDTMetaData` is called in two places so check
855 // `MaxNumPromotions` inside it.
856 if (MaxNumPromotions == 0)
857 return;
858 uint32_t NumVals = 0;
859 // OldSum is the existing total count in the value profile data.
860 uint64_t OldSum = 0;
861 std::unique_ptr<InstrProfValueData[]> ValueData =
862 std::make_unique<InstrProfValueData[]>(MaxNumPromotions);
// Read up to MaxNumPromotions existing indirect-call-target records from
// Inst; Valid tells whether any usable value profile was found.
863 bool Valid =
864 getValueProfDataFromInst(Inst, IPVK_IndirectCallTarget, MaxNumPromotions,
865 ValueData.get(), NumVals, OldSum, true);
866
// Maps a target value to the count that will be written back.
867 DenseMap<uint64_t, uint64_t> ValueCountMap;
868 if (Sum == 0) {
// "Mark as promoted" mode: exactly one target carrying the magic count.
869 assert((CallTargets.size() == 1 &&
870 CallTargets[0].Count == NOMORE_ICP_MAGICNUM) &&
871 "If sum is 0, assume only one element in CallTargets "
872 "with count being NOMORE_ICP_MAGICNUM");
873 // Initialize ValueCountMap with existing value profile data.
874 if (Valid) {
875 for (uint32_t I = 0; I < NumVals; I++)
876 ValueCountMap[ValueData[I].Value] = ValueData[I].Count;
877 }
878 auto Pair =
879 ValueCountMap.try_emplace(CallTargets[0].Value, CallTargets[0].Count);
880 // If the target already exists in value profile, decrease the total
881 // count OldSum and reset the target's count to NOMORE_ICP_MAGICNUM.
882 if (!Pair.second) {
883 OldSum -= Pair.first->second;
884 Pair.first->second = NOMORE_ICP_MAGICNUM;
885 }
// The total written back is the (possibly reduced) pre-existing total.
886 Sum = OldSum;
887 } else {
888 // Initialize ValueCountMap with existing NOMORE_ICP_MAGICNUM
889 // counts in the value profile.
890 if (Valid) {
891 for (uint32_t I = 0; I < NumVals; I++) {
892 if (ValueData[I].Count == NOMORE_ICP_MAGICNUM)
893 ValueCountMap[ValueData[I].Value] = ValueData[I].Count;
894 }
895 }
896
897 for (const auto &Data : CallTargets) {
// try_emplace only inserts new targets; an existing entry here can only be
// one of the NOMORE_ICP_MAGICNUM records seeded above.
898 auto Pair = ValueCountMap.try_emplace(Data.Value, Data.Count);
899 if (Pair.second)
900 continue;
901 // The target represented by Data.Value has already been promoted.
902 // Keep the count as NOMORE_ICP_MAGICNUM in the profile and decrease
903 // Sum by Data.Count.
904 assert(Sum >= Data.Count && "Sum should never be less than Data.Count");
905 Sum -= Data.Count;
906 }
907 }
908
// Flatten the map into NewCallTargets (declared on the missing listing line
// 909 -- TODO confirm its type).
910 for (const auto &ValueCount : ValueCountMap) {
911 NewCallTargets.emplace_back(
912 InstrProfValueData{ValueCount.first, ValueCount.second});
913 }
914
// Order by descending Count, breaking ties by descending Value, so the
// result does not depend on the map's iteration order.
915 llvm::sort(NewCallTargets,
916 [](const InstrProfValueData &L, const InstrProfValueData &R) {
917 if (L.Count != R.Count)
918 return L.Count > R.Count;
919 return L.Value > R.Value;
920 });
921
// Never write more than MaxNumPromotions records.
922 uint32_t MaxMDCount =
923 std::min(NewCallTargets.size(), static_cast<size_t>(MaxNumPromotions));
// NOTE(review): the callee name and leading arguments of this call are on the
// missing listing line 924.
925 NewCallTargets, Sum, IPVK_IndirectCallTarget, MaxMDCount);
926}
927
928 /// Attempt to promote indirect call and also inline the promoted call.
929 ///
930 /// \param F Caller function.
931 /// \param Candidate ICP and inline candidate.
932 /// \param SumOrigin Original sum of target counts for indirect call before
933 /// promoting given candidate.
934 /// \param Sum Prorated sum of remaining target counts for indirect call
935 /// after promoting given candidate.
936 /// \param InlinedCallSite Output vector for new call sites exposed after
937 /// inlining.
/// \returns the result of tryInlineCandidate on the promoted call when the
/// promotion produced a call or invoke; false on every other path.
// NOTE(review): listing lines 942 and 996 are absent from this extract
// (presumably the condition guarding the first early return, and the head of
// the call that prorates the direct callsite) -- confirm against the source.
938 bool SampleProfileLoader::tryPromoteAndInlineCandidate(
939 Function &F, InlineCandidate &Candidate, uint64_t SumOrigin, uint64_t &Sum,
940 SmallVector<CallBase *, 8> *InlinedCallSite) {
941 // Bail out early if sample-loader inliner is disabled.
943 return false;
944
945 // Bail out early if MaxNumPromotions is zero.
946 // This prevents allocating an array of zero length in callees below.
947 if (MaxNumPromotions == 0)
948 return false;
// Resolve the profiled callee name to a function known to SymbolMap; give up
// if there is no entry or the entry is null.
949 auto CalleeFunctionName = Candidate.CalleeSamples->getFuncName();
950 auto R = SymbolMap.find(CalleeFunctionName);
951 if (R == SymbolMap.end() || !R->getValue())
952 return false;
953
954 auto &CI = *Candidate.CallInstr;
955 if (!doesHistoryAllowICP(CI, R->getValue()->getName()))
956 return false;
957
958 const char *Reason = "Callee function not available";
959 // R->getValue() != &F is to prevent promoting a recursive call.
960 // If it is a recursive call, we do not inline it as it could bloat
961 // the code exponentially. There is a way to better handle this, e.g.
962 // clone the caller first, and inline the cloned caller if it is
963 // recursive. As llvm does not inline recursive calls, we will
964 // simply ignore it instead of handling it explicitly.
965 if (!R->getValue()->isDeclaration() && R->getValue()->getSubprogram() &&
966 R->getValue()->hasFnAttribute("use-sample-profile") &&
967 R->getValue() != &F && isLegalToPromote(CI, R->getValue(), &Reason)) {
968 // For promoted target, set its value with NOMORE_ICP_MAGICNUM count
969 // in the value profile metadata so the target won't be promoted again.
970 SmallVector<InstrProfValueData, 1> SortedCallTargets = {InstrProfValueData{
971 Function::getGUID(R->getValue()->getName()), NOMORE_ICP_MAGICNUM}};
// Sum == 0 selects the "mark as promoted" mode of updateIDTMetaData.
972 updateIDTMetaData(CI, SortedCallTargets, 0);
973
974 auto *DI = &pgo::promoteIndirectCall(
975 CI, R->getValue(), Candidate.CallsiteCount, Sum, false, ORE);
976 if (DI) {
// The promoted target's count no longer belongs to the remaining indirect
// call.
977 Sum -= Candidate.CallsiteCount;
978 // Do not prorate the indirect callsite distribution since the original
979 // distribution will be used to scale down non-promoted profile target
980 // counts later. By doing this we lose track of the real callsite count
981 // for the leftover indirect callsite as a trade off for accurate call
982 // target counts.
983 // TODO: Ideally we would have two separate factors, one for call site
984 // counts and one is used to prorate call target counts.
985 // Do not update the promoted direct callsite distribution at this
986 // point since the original distribution combined with the callee profile
987 // will be used to prorate callsites from the callee if inlined. Once not
988 // inlined, the direct callsite distribution should be prorated so that
989 // it will reflect the real callsite counts.
990 Candidate.CallInstr = DI;
991 if (isa<CallInst>(DI) || isa<InvokeInst>(DI)) {
992 bool Inlined = tryInlineCandidate(Candidate, InlinedCallSite);
993 if (!Inlined) {
994 // Prorate the direct callsite distribution so that it reflects real
995 // callsite counts.
// NOTE(review): the callee of this call is on the missing listing line 996;
// the visible factor is CallsiteCount / SumOrigin computed in float.
997 *DI, static_cast<float>(Candidate.CallsiteCount) / SumOrigin);
998 }
999 return Inlined;
1000 }
1001 }
1002 } else {
1003 LLVM_DEBUG(dbgs() << "\nFailed to promote indirect call to "
1004 << Candidate.CalleeSamples->getFuncName() << " because "
1005 << Reason << "\n");
1006 }
1007 return false;
1008}
1009
// Decide whether the cold call site CallInst should be inlined. Returns false
// when ProfileSizeInline is off, when Callee is null, or when the computed
// cost is "never"; returns true when the cost is "always"; otherwise compares
// the cost against SampleColdCallSiteThreshold.
// NOTE(review): listing lines 1014 and 1018 are absent from this extract
// (presumably the declaration of Callee and the head of the statement that
// produces Cost) -- confirm against the real source.
1010bool SampleProfileLoader::shouldInlineColdCallee(CallBase &CallInst) {
1011 if (!ProfileSizeInline)
1012 return false;
1013
1015 if (Callee == nullptr)
1016 return false;
1017
1019 GetAC, GetTLI);
1020
1021 if (Cost.isNever())
1022 return false;
1023
1024 if (Cost.isAlways())
1025 return true;
1026
// Inline only when the cost does not exceed the cold call-site threshold.
1027 return Cost.getCost() <= SampleColdCallSiteThreshold;
1028}
1029
1030void SampleProfileLoader::emitOptimizationRemarksForInlineCandidates(
1031 const SmallVectorImpl<CallBase *> &Candidates, const Function &F,
1032 bool Hot) {
1033 for (auto *I : Candidates) {
1034 Function *CalledFunction = I->getCalledFunction();
1035 if (CalledFunction) {
1036 ORE->emit(OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(),
1037 "InlineAttempt", I->getDebugLoc(),
1038 I->getParent())
1039 << "previous inlining reattempted for "
1040 << (Hot ? "hotness: '" : "size: '")
1041 << ore::NV("Callee", CalledFunction) << "' into '"
1042 << ore::NV("Caller", &F) << "'");
1043 }
1044 }
1045}
1046
// Collect into InlinedGUIDs the GUIDs of functions that the profile rooted at
// Samples suggests should be available for inlining at call site CB. Entries
// whose count is below Threshold are skipped; when the external inline advisor
// asks to inline CB, Threshold is dropped to 0.
// NOTE(review): listing lines 1059, 1070, 1090, 1091 and 1096 are absent from
// this extract (presumably the argument of the first insert, the condition
// opening the AutoFDO branch, the initializer of PreInline, and the
// declaration of Func) -- confirm against the real source.
1047void SampleProfileLoader::findExternalInlineCandidate(
1048 CallBase *CB, const FunctionSamples *Samples,
1049 DenseSet<GlobalValue::GUID> &InlinedGUIDs,
1050 const StringMap<Function *> &SymbolMap, uint64_t Threshold) {
1051
1052 // If ExternalInlineAdvisor wants to inline an external function
1053 // make sure it's imported
1054 if (CB && getExternalInlineAdvisorShouldInline(*CB)) {
1055 // Samples may not exist for replayed function, if so
1056 // just add the direct GUID and move on
1057 if (!Samples) {
1058 InlinedGUIDs.insert(
1060 return;
1061 }
1062 // Otherwise, drop the threshold to import everything that we can
1063 Threshold = 0;
1064 }
1065
1066 assert(Samples && "expect non-null caller profile");
1067
1068 // For AutoFDO profile, retrieve candidate profiles by walking over
1069 // the nested inlinee profiles.
1071 Samples->findInlinedFunctions(InlinedGUIDs, SymbolMap, Threshold);
1072 return;
1073 }
1074
// Breadth-first walk over the context trie, starting at the node that holds
// the caller's profile.
1075 ContextTrieNode *Caller = ContextTracker->getContextNodeForProfile(Samples);
1076 std::queue<ContextTrieNode *> CalleeList;
1077 CalleeList.push(Caller);
1078 while (!CalleeList.empty()) {
1079 ContextTrieNode *Node = CalleeList.front();
1080 CalleeList.pop();
1081 FunctionSamples *CalleeSample = Node->getFunctionSamples();
1082 // For CSSPGO profile, retrieve candidate profile by walking over the
1083 // trie built for context profile. Note that also take call targets
1084 // even if callee doesn't have a corresponding context profile.
1085 if (!CalleeSample)
1086 continue;
1087
1088 // If pre-inliner decision is used, honor that for importing as well.
1089 bool PreInline =
// Skipping a node here also skips its children, since they are only queued at
// the bottom of the loop body.
1092 if (!PreInline && CalleeSample->getHeadSamplesEstimate() < Threshold)
1093 continue;
1094
1095 StringRef Name = CalleeSample->getFuncName();
1097 // Add to the import list only when it's defined out of module.
1098 if (!Func || Func->isDeclaration())
1099 InlinedGUIDs.insert(FunctionSamples::getGUID(CalleeSample->getName()));
1100
1101 // Import hot CallTargets, which may not be available in IR because full
1102 // profile annotation cannot be done until backend compilation in ThinLTO.
1103 for (const auto &BS : CalleeSample->getBodySamples())
1104 for (const auto &TS : BS.second.getCallTargets())
1105 if (TS.getValue() > Threshold) {
1106 StringRef CalleeName = CalleeSample->getFuncName(TS.getKey());
1107 const Function *Callee = SymbolMap.lookup(CalleeName);
1108 if (!Callee || Callee->isDeclaration())
1109 InlinedGUIDs.insert(FunctionSamples::getGUID(TS.getKey()));
1110 }
1111
1112 // Import hot child context profile associated with callees. Note that this
1113 // may have some overlap with the call target loop above, but doing this
1114 // based child context profile again effectively allow us to use the max of
1115 // entry count and call target count to determine importing.
1116 for (auto &Child : Node->getAllChildContext()) {
1117 ContextTrieNode *CalleeNode = &Child.second;
1118 CalleeList.push(CalleeNode);
1119 }
1120 }
1121}
1122
1123 /// Iteratively inline hot callsites of a function.
1124 ///
1125 /// Iteratively traverse all callsites of the function \p F, so as to
1126 /// find out callsites with corresponding inline instances.
1127 ///
1128 /// For such callsites,
1129 /// - If it is hot enough, inline the callsites and adds callsites of the callee
1130 /// into the caller. If the call is an indirect call, first promote
1131 /// it to direct call. Each indirect call is limited with a single target.
1132 ///
1133 /// - If a callsite is not inlined, merge its profile to the outline
1134 /// version (if --sample-profile-merge-inlinee is true), or scale the
1135 /// counters of standalone function based on the profile of inlined
1136 /// instances (if --sample-profile-merge-inlinee is false).
1137 ///
1138 /// Later passes may consume the updated profiles.
1139 ///
1140 /// \param F function to perform iterative inlining.
1141 /// \param InlinedGUIDs a set to be updated to include all GUIDs that are
1142 /// inlined in the profiled binary.
1143 ///
1144 /// \returns True if any inlining happened.
// NOTE(review): listing lines 1150, 1160, 1174 and 1237 are absent from this
// extract (presumably part of the assertion, the declaration of CIS, the
// second half of the condition before the insert into
// LocalNotInlinedCallSites, and the condition guarding the final merge call)
// -- confirm against the real source.
1145bool SampleProfileLoader::inlineHotFunctions(
1146 Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
1147 // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure
1148 // Profile symbol list is ignored when profile-sample-accurate is on.
1149 assert((!ProfAccForSymsInList ||
1151 !F.hasFnAttribute("profile-sample-accurate"))) &&
1152 "ProfAccForSymsInList should be false when profile-sample-accurate "
1153 "is enabled");
1154
// Call sites with a callee profile that have not been inlined (so far); they
// are handed to promoteMergeNotInlinedContextSamples at the end.
1155 MapVector<CallBase *, const FunctionSamples *> LocalNotInlinedCallSites;
1156 bool Changed = false;
1157 bool LocalChanged = true;
// Repeat until a full sweep over F inlines nothing.
1158 while (LocalChanged) {
1159 LocalChanged = false;
// Phase 1: per basic block, gather candidate call sites into CIS.
1161 for (auto &BB : F) {
1162 bool Hot = false;
1163 SmallVector<CallBase *, 10> AllCandidates;
1164 SmallVector<CallBase *, 10> ColdCandidates;
1165 for (auto &I : BB) {
1166 const FunctionSamples *FS = nullptr;
1167 if (auto *CB = dyn_cast<CallBase>(&I)) {
1168 if (!isa<IntrinsicInst>(I)) {
1169 if ((FS = findCalleeFunctionSamples(*CB))) {
1170 assert((!FunctionSamples::UseMD5 || FS->GUIDToFuncNameMap) &&
1171 "GUIDToFuncNameMap has to be populated");
1172 AllCandidates.push_back(CB);
1173 if (FS->getHeadSamplesEstimate() > 0 ||
1175 LocalNotInlinedCallSites.insert({CB, FS});
1176 if (callsiteIsHot(FS, PSI, ProfAccForSymsInList))
1177 Hot = true;
1178 else if (shouldInlineColdCallee(*CB))
1179 ColdCandidates.push_back(CB);
1180 } else if (getExternalInlineAdvisorShouldInline(*CB)) {
1181 AllCandidates.push_back(CB);
1182 }
1183 }
1184 }
1185 }
// One hot call site in the block (or the presence of an external advisor)
// makes every profiled call site of the block a candidate; otherwise only the
// cold ones accepted by shouldInlineColdCallee are.
1186 if (Hot || ExternalInlineAdvisor) {
1187 CIS.insert(CIS.begin(), AllCandidates.begin(), AllCandidates.end());
1188 emitOptimizationRemarksForInlineCandidates(AllCandidates, F, true);
1189 } else {
1190 CIS.insert(CIS.begin(), ColdCandidates.begin(), ColdCandidates.end());
1191 emitOptimizationRemarksForInlineCandidates(ColdCandidates, F, false);
1192 }
1193 }
// Phase 2: try to inline (or, in ThinLTO pre-link, record for import) each
// gathered candidate.
1194 for (CallBase *I : CIS) {
1195 Function *CalledFunction = I->getCalledFunction();
1196 InlineCandidate Candidate = {I, LocalNotInlinedCallSites.lookup(I),
1197 0 /* dummy count */,
1198 1.0 /* dummy distribution factor */};
1199 // Do not inline recursive calls.
1200 if (CalledFunction == &F)
1201 continue;
1202 if (I->isIndirectCall()) {
1203 uint64_t Sum;
1204 for (const auto *FS : findIndirectCallFunctionSamples(*I, Sum)) {
1205 uint64_t SumOrigin = Sum;
1206 if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
1207 findExternalInlineCandidate(I, FS, InlinedGUIDs, SymbolMap,
1208 PSI->getOrCompHotCountThreshold());
1209 continue;
1210 }
1211 if (!callsiteIsHot(FS, PSI, ProfAccForSymsInList))
1212 continue;
1213
1214 Candidate = {I, FS, FS->getHeadSamplesEstimate(), 1.0};
1215 if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum)) {
1216 LocalNotInlinedCallSites.erase(I);
1217 LocalChanged = true;
1218 }
1219 }
1220 } else if (CalledFunction && CalledFunction->getSubprogram() &&
1221 !CalledFunction->isDeclaration()) {
1222 if (tryInlineCandidate(Candidate)) {
1223 LocalNotInlinedCallSites.erase(I);
1224 LocalChanged = true;
1225 }
1226 } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
1227 findExternalInlineCandidate(I, findCalleeFunctionSamples(*I),
1228 InlinedGUIDs, SymbolMap,
1229 PSI->getOrCompHotCountThreshold());
1230 }
1231 }
1232 Changed |= LocalChanged;
1233 }
1234
1235 // For CS profile, profile for not inlined context will be merged when
1236 // base profile is being retrieved.
1238 promoteMergeNotInlinedContextSamples(LocalNotInlinedCallSites, F);
1239 return Changed;
1240}
1241
// Try to inline the direct call held by Candidate. Returns true only when
// InlineFunction succeeds; on success, InlinedCallSites (if non-null) is
// overwritten with the call sites exposed by the inlining.
// NOTE(review): listing lines 1246 and 1284 are absent from this extract
// (presumably the condition guarding the first early return and the condition
// guarding the ContextTracker update) -- confirm against the real source.
1242bool SampleProfileLoader::tryInlineCandidate(
1243 InlineCandidate &Candidate, SmallVector<CallBase *, 8> *InlinedCallSites) {
1244 // Do not attempt to inline a candidate if
1245 // --disable-sample-loader-inlining is true.
1247 return false;
1248
// Capture everything needed for remarks before the call may be erased by
// InlineFunction.
1249 CallBase &CB = *Candidate.CallInstr;
1250 Function *CalledFunction = CB.getCalledFunction();
1251 assert(CalledFunction && "Expect a callee with definition");
1252 DebugLoc DLoc = CB.getDebugLoc();
1253 BasicBlock *BB = CB.getParent();
1254
1255 InlineCost Cost = shouldInlineCandidate(Candidate);
1256 if (Cost.isNever()) {
1257 ORE->emit(OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(),
1258 "InlineFail", DLoc, BB)
1259 << "incompatible inlining");
1260 return false;
1261 }
1262
// A cost that converts to false rejects the candidate without a remark.
1263 if (!Cost)
1264 return false;
1265
1266 InlineFunctionInfo IFI(nullptr, GetAC);
1267 IFI.UpdateProfile = false;
1268 InlineResult IR = InlineFunction(CB, IFI,
1269 /*MergeAttributes=*/true);
1270 if (!IR.isSuccess())
1271 return false;
1272
1273 // The call to InlineFunction erases I, so we can't pass it here.
1274 emitInlinedIntoBasedOnCost(*ORE, DLoc, BB, *CalledFunction, *BB->getParent(),
1275 Cost, true, getAnnotatedRemarkPassName());
1276
1277 // Now populate the list of newly exposed call sites.
1278 if (InlinedCallSites) {
1279 InlinedCallSites->clear();
1280 for (auto &I : IFI.InlinedCallSites)
1281 InlinedCallSites->push_back(I);
1282 }
1283
1285 ContextTracker->markContextSamplesInlined(Candidate.CalleeSamples);
1286 ++NumCSInlined;
1287
1288 // Prorate inlined probes for a duplicated inlining callsite which probably
1289 // has a distribution less than 100%. Samples for an inlinee should be
1290 // distributed among the copies of the original callsite based on each
1291 // callsite's distribution factor for counts accuracy. Note that an inlined
1292 // probe may come with its own distribution factor if it has been duplicated
1293 // in the inlinee body. The two factor are multiplied to reflect the
1294 // aggregation of duplication.
1295 if (Candidate.CallsiteDistribution < 1) {
1296 for (auto &I : IFI.InlinedCallSites) {
1297 if (std::optional<PseudoProbe> Probe = extractProbe(*I))
1298 setProbeDistributionFactor(*I, Probe->Factor *
1299 Candidate.CallsiteDistribution);
1300 }
1301 NumDuplicatedInlinesite++;
1302 }
1303
1304 return true;
1305}
1306
1307bool SampleProfileLoader::getInlineCandidate(InlineCandidate *NewCandidate,
1308 CallBase *CB) {
1309 assert(CB && "Expect non-null call instruction");
1310
1311 if (isa<IntrinsicInst>(CB))
1312 return false;
1313
1314 // Find the callee's profile. For indirect call, find hottest target profile.
1315 const FunctionSamples *CalleeSamples = findCalleeFunctionSamples(*CB);
1316 // If ExternalInlineAdvisor wants to inline this site, do so even
1317 // if Samples are not present.
1318 if (!CalleeSamples && !getExternalInlineAdvisorShouldInline(*CB))
1319 return false;
1320
1321 float Factor = 1.0;
1322 if (std::optional<PseudoProbe> Probe = extractProbe(*CB))
1323 Factor = Probe->Factor;
1324
1325 uint64_t CallsiteCount =
1326 CalleeSamples ? CalleeSamples->getHeadSamplesEstimate() * Factor : 0;
1327 *NewCandidate = {CB, CalleeSamples, CallsiteCount, Factor};
1328 return true;
1329}
1330
1331std::optional<InlineCost>
1332SampleProfileLoader::getExternalInlineAdvisorCost(CallBase &CB) {
1333 std::unique_ptr<InlineAdvice> Advice = nullptr;
1334 if (ExternalInlineAdvisor) {
1335 Advice = ExternalInlineAdvisor->getAdvice(CB);
1336 if (Advice) {
1337 if (!Advice->isInliningRecommended()) {
1338 Advice->recordUnattemptedInlining();
1339 return InlineCost::getNever("not previously inlined");
1340 }
1341 Advice->recordInlining();
1342 return InlineCost::getAlways("previously inlined");
1343 }
1344 }
1345
1346 return {};
1347}
1348
1349bool SampleProfileLoader::getExternalInlineAdvisorShouldInline(CallBase &CB) {
1350 std::optional<InlineCost> Cost = getExternalInlineAdvisorCost(CB);
1351 return Cost ? !!*Cost : false;
1352}
1353
// Compute the InlineCost used to decide whether Candidate is inlined. An
// external advisor's cost, when present, is returned as is. Otherwise the call
// analyzer's cost is used: "never" and "always" results are returned
// unchanged, a preinliner decision may force "always", and in the remaining
// cases the analyzer's cost is paired with a sample-profile threshold.
// NOTE(review): listing lines 1354, 1362, 1375 and 1410 are absent from this
// extract (presumably the return type, the condition opening the hotness
// check, a statement between Params setup and the getInlineCost call, and the
// condition opening the "old FDO inliner" branch) -- confirm against the real
// source.
1355SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) {
1356 if (std::optional<InlineCost> ReplayCost =
1357 getExternalInlineAdvisorCost(*Candidate.CallInstr))
1358 return *ReplayCost;
1359 // Adjust threshold based on call site hotness, only do this for callsite
1360 // prioritized inliner because otherwise cost-benefit check is done earlier.
1361 int SampleThreshold = SampleColdCallSiteThreshold;
1363 if (Candidate.CallsiteCount > PSI->getHotCountThreshold())
1364 SampleThreshold = SampleHotCallSiteThreshold;
1365 else if (!ProfileSizeInline)
1366 return InlineCost::getNever("cold callsite");
1367 }
1368
1369 Function *Callee = Candidate.CallInstr->getCalledFunction();
1370 assert(Callee && "Expect a definition for inline candidate of direct call");
1371
1372 InlineParams Params = getInlineParams();
1373 // We will ignore the threshold from inline cost, so always get full cost.
1374 Params.ComputeFullInlineCost = true;
1376 // Checks if there is anything in the reachable portion of the callee at
1377 // this callsite that makes this inlining potentially illegal. Need to
1378 // set ComputeFullInlineCost, otherwise getInlineCost may return early
1379 // when cost exceeds threshold without checking all IRs in the callee.
1380 // The actual cost does not matter because we only check isNever() to
1381 // see if it is legal to inline the callsite.
1382 InlineCost Cost = getInlineCost(*Candidate.CallInstr, Callee, Params,
1383 GetTTI(*Callee), GetAC, GetTLI);
1384
1385 // Honor always inline and never inline from call analyzer
1386 if (Cost.isNever() || Cost.isAlways())
1387 return Cost;
1388
1389 // With CSSPGO, the preinliner in llvm-profgen can estimate global inline
1390 // decisions based on hotness as well as accurate function byte sizes for
1391 // given context using function/inlinee sizes from previous build. It
1392 // stores the decision in profile, and also adjust/merge context profile
1393 // aiming at better context-sensitive post-inline profile quality, assuming
1394 // all inline decision estimates are going to be honored by compiler. Here
1395 // we replay that inline decision under `sample-profile-use-preinliner`.
1396 // Note that we don't need to handle negative decision from preinliner as
1397 // context profile for not inlined calls are merged by preinliner already.
1398 if (UsePreInlinerDecision && Candidate.CalleeSamples) {
1399 // Once two node are merged due to promotion, we're losing some context
1400 // so the original context-sensitive preinliner decision should be ignored
1401 // for SyntheticContext.
1402 SampleContext &Context = Candidate.CalleeSamples->getContext();
1403 if (!Context.hasState(SyntheticContext) &&
1404 Context.hasAttribute(ContextShouldBeInlined))
1405 return InlineCost::getAlways("preinliner");
1406 }
1407
1408 // For old FDO inliner, we inline the call site as long as cost is not
1409 // "Never". The cost-benefit check is done earlier.
// INT_MAX as the threshold means the cost itself never blocks inlining here.
1411 return InlineCost::get(Cost.getCost(), INT_MAX);
1412 }
1413
1414 // Otherwise only use the cost from call analyzer, but overwrite threshold with
1415 // Sample PGO threshold.
1416 return InlineCost::get(Cost.getCost(), SampleThreshold);
1417}
1418
// Priority-queue driven variant of hot call-site inlining for F. Candidates
// are popped from CQueue and inlined until the queue is empty or F reaches
// SizeLimit instructions; call sites exposed by each inlining are pushed back
// onto the queue. Returns true if anything was inlined.
// NOTE(review): listing lines 1424, 1447 and 1546 are absent from this extract
// (presumably part of the first assertion, the head of the size-limit
// assertion, and the condition guarding the final merge call) -- confirm
// against the real source.
1419bool SampleProfileLoader::inlineHotFunctionsWithPriority(
1420 Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
1421 // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure
1422 // Profile symbol list is ignored when profile-sample-accurate is on.
1423 assert((!ProfAccForSymsInList ||
1425 !F.hasFnAttribute("profile-sample-accurate"))) &&
1426 "ProfAccForSymsInList should be false when profile-sample-accurate "
1427 "is enabled");
1428
1429 // Populating worklist with initial call sites from root inliner, along
1430 // with call site weights.
1431 CandidateQueue CQueue;
1432 InlineCandidate NewCandidate;
1433 for (auto &BB : F) {
1434 for (auto &I : BB) {
1435 auto *CB = dyn_cast<CallBase>(&I);
1436 if (!CB)
1437 continue;
1438 if (getInlineCandidate(&NewCandidate, CB))
1439 CQueue.push(NewCandidate);
1440 }
1441 }
1442
1443 // Cap the size growth from profile guided inlining. This is needed even
1444 // though cost of each inline candidate already accounts for callee size,
1445 // because with top-down inlining, we can grow inliner size significantly
1446 // with large number of smaller inlinees each pass the cost check.
1448 "Max inline size limit should not be smaller than min inline size "
1449 "limit.");
// SizeLimit is the growth-scaled instruction count of F, clamped to
// [ProfileInlineLimitMin, ProfileInlineLimitMax]; an external advisor lifts
// the cap entirely.
1450 unsigned SizeLimit = F.getInstructionCount() * ProfileInlineGrowthLimit;
1451 SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax);
1452 SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin);
1453 if (ExternalInlineAdvisor)
1454 SizeLimit = std::numeric_limits<unsigned>::max();
1455
1456 MapVector<CallBase *, const FunctionSamples *> LocalNotInlinedCallSites;
1457
1458 // Perform iterative BFS call site prioritized inlining
1459 bool Changed = false;
1460 while (!CQueue.empty() && F.getInstructionCount() < SizeLimit) {
1461 InlineCandidate Candidate = CQueue.top();
1462 CQueue.pop();
1463 CallBase *I = Candidate.CallInstr;
1464 Function *CalledFunction = I->getCalledFunction();
1465
// Directly recursive calls are never inlined.
1466 if (CalledFunction == &F)
1467 continue;
1468 if (I->isIndirectCall()) {
1469 uint64_t Sum = 0;
1470 auto CalleeSamples = findIndirectCallFunctionSamples(*I, Sum);
// Keep the unscaled total for the relative-hotness check and for
// tryPromoteAndInlineCandidate; Sum itself is prorated by the call site's
// distribution factor.
1471 uint64_t SumOrigin = Sum;
1472 Sum *= Candidate.CallsiteDistribution;
1473 unsigned ICPCount = 0;
1474 for (const auto *FS : CalleeSamples) {
1475 // TODO: Consider disable pre-lTO ICP for MonoLTO as well
1476 if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
1477 findExternalInlineCandidate(I, FS, InlinedGUIDs, SymbolMap,
1478 PSI->getOrCompHotCountThreshold());
1479 continue;
1480 }
1481 uint64_t EntryCountDistributed =
1482 FS->getHeadSamplesEstimate() * Candidate.CallsiteDistribution;
1483 // In addition to regular inline cost check, we also need to make sure
1484 // ICP isn't introducing excessive speculative checks even if individual
1485 // target looks beneficial to promote and inline. That means we should
1486 // only do ICP when there's a small number dominant targets.
1487 if (ICPCount >= ProfileICPRelativeHotnessSkip &&
1488 EntryCountDistributed * 100 < SumOrigin * ProfileICPRelativeHotness)
1489 break;
1490 // TODO: Fix CallAnalyzer to handle all indirect calls.
1491 // For indirect call, we don't run CallAnalyzer to get InlineCost
1492 // before actual inlining. This is because we could see two different
1493 // types from the same definition, which makes CallAnalyzer choke as
1494 // it's expecting matching parameter type on both caller and callee
1495 // side. See example from PR18962 for the triggering cases (the bug was
1496 // fixed, but we generate different types).
1497 if (!PSI->isHotCount(EntryCountDistributed))
1498 break;
1499 SmallVector<CallBase *, 8> InlinedCallSites;
1500 // Attach function profile for promoted indirect callee, and update
1501 // call site count for the promoted inline candidate too.
1502 Candidate = {I, FS, EntryCountDistributed,
1503 Candidate.CallsiteDistribution};
1504 if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum,
1505 &InlinedCallSites)) {
// Newly exposed call sites become candidates themselves.
1506 for (auto *CB : InlinedCallSites) {
1507 if (getInlineCandidate(&NewCandidate, CB))
1508 CQueue.emplace(NewCandidate);
1509 }
1510 ICPCount++;
1511 Changed = true;
1512 } else if (!ContextTracker) {
1513 LocalNotInlinedCallSites.insert({I, FS});
1514 }
1515 }
1516 } else if (CalledFunction && CalledFunction->getSubprogram() &&
1517 !CalledFunction->isDeclaration()) {
1518 SmallVector<CallBase *, 8> InlinedCallSites;
1519 if (tryInlineCandidate(Candidate, &InlinedCallSites)) {
1520 for (auto *CB : InlinedCallSites) {
1521 if (getInlineCandidate(&NewCandidate, CB))
1522 CQueue.emplace(NewCandidate);
1523 }
1524 Changed = true;
1525 } else if (!ContextTracker) {
1526 LocalNotInlinedCallSites.insert({I, Candidate.CalleeSamples});
1527 }
1528 } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
1529 findExternalInlineCandidate(I, findCalleeFunctionSamples(*I),
1530 InlinedGUIDs, SymbolMap,
1531 PSI->getOrCompHotCountThreshold());
1532 }
1533 }
1534
// Candidates left in the queue mean the loop stopped on the size limit;
// record which of the three limits SizeLimit was equal to.
1535 if (!CQueue.empty()) {
1536 if (SizeLimit == (unsigned)ProfileInlineLimitMax)
1537 ++NumCSInlinedHitMaxLimit;
1538 else if (SizeLimit == (unsigned)ProfileInlineLimitMin)
1539 ++NumCSInlinedHitMinLimit;
1540 else
1541 ++NumCSInlinedHitGrowthLimit;
1542 }
1543
1544 // For CS profile, profile for not inlined context will be merged when
1545 // base profile is being retrieved.
1547 promoteMergeNotInlinedContextSamples(LocalNotInlinedCallSites, F);
1548 return Changed;
1549}
1550
// For every recorded call site that was not inlined and whose callee is
// defined in this module: emit a "NotInline" remark, then either merge the
// inlinee profile into the callee's outlined profile (ProfileMergeInlinee) or
// accumulate its entry count into notInlinedCallInfo.
// NOTE(review): listing line 1552 (presumably the NonInlinedCallSites
// parameter declaration) is absent from this extract -- confirm against the
// real source.
1551void SampleProfileLoader::promoteMergeNotInlinedContextSamples(
1553 const Function &F) {
1554 // Accumulate not inlined callsite information into notInlinedSamples
1555 for (const auto &Pair : NonInlinedCallSites) {
1556 CallBase *I = Pair.first;
1557 Function *Callee = I->getCalledFunction();
// Skip indirect calls and callees without a body in this module.
1558 if (!Callee || Callee->isDeclaration())
1559 continue;
1560
1561 ORE->emit(
1562 OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(), "NotInline",
1563 I->getDebugLoc(), I->getParent())
1564 << "previous inlining not repeated: '" << ore::NV("Callee", Callee)
1565 << "' into '" << ore::NV("Caller", &F) << "'");
1566
1567 ++NumCSNotInlined;
1568 const FunctionSamples *FS = Pair.second;
// An inlinee profile with no samples at all contributes nothing.
1569 if (FS->getTotalSamples() == 0 && FS->getHeadSamplesEstimate() == 0) {
1570 continue;
1571 }
1572
1573 // Do not merge a context that is already duplicated into the base profile.
1574 if (FS->getContext().hasAttribute(sampleprof::ContextDuplicatedIntoBase))
1575 continue;
1576
1577 if (ProfileMergeInlinee) {
1578 // A function call can be replicated by optimizations like callsite
1579 // splitting or jump threading and the replicates end up sharing the
1580 // sample nested callee profile instead of slicing the original
1581 // inlinee's profile. We want to do merge exactly once by filtering out
1582 // callee profiles with a non-zero head sample count.
1583 if (FS->getHeadSamples() == 0) {
1584 // Use entry samples as head samples during the merge, as inlinees
1585 // don't have head samples.
// The const_cast mutates the shared profile on purpose: the now non-zero
// head sample count is what makes the check above skip it next time.
1586 const_cast<FunctionSamples *>(FS)->addHeadSamples(
1587 FS->getHeadSamplesEstimate());
1588
1589 // Note that we have to do the merge right after processing function.
1590 // This allows OutlineFS's profile to be used for annotation during
1591 // top-down processing of functions' annotation.
1592 FunctionSamples *OutlineFS = Reader->getOrCreateSamplesFor(*Callee);
1593 OutlineFS->merge(*FS, 1);
1594 // Set outlined profile to be synthetic to not bias the inliner.
1595 OutlineFS->SetContextSynthetic();
1596 }
1597 } else {
// Without merging, only accumulate the entry count per callee.
1598 auto pair =
1599 notInlinedCallInfo.try_emplace(Callee, NotInlinedProfileInfo{0});
1600 pair.first->second.entryCount += FS->getHeadSamplesEstimate();
1601 }
1602 }
1603}
1604
1605 /// Returns the sorted CallTargetMap \p M by count in descending order.
// NOTE(review): listing lines 1606-1608 (presumably the function signature
// and the declaration of the result container R) are absent from this extract
// -- confirm against the real source.
// Each entry produced by SampleRecord::SortCallTargets(M) is converted to an
// InstrProfValueData holding the GUID of the target (I.first) and its count
// (I.second).
1609 for (const auto &I : SampleRecord::SortCallTargets(M)) {
1610 R.emplace_back(
1611 InstrProfValueData{FunctionSamples::getGUID(I.first), I.second});
1612 }
1613 return R;
1614}
1615
1616// Generate MD_prof metadata for every branch instruction using the
1617// edge weights computed during propagation.
1618void SampleProfileLoader::generateMDProfMetadata(Function &F) {
1619 // Generate MD_prof metadata for every branch instruction using the
1620 // edge weights computed during propagation.
1621 LLVM_DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n");
1622 LLVMContext &Ctx = F.getContext();
1623 MDBuilder MDB(Ctx);
1624 for (auto &BI : F) {
1625 BasicBlock *BB = &BI;
1626
1627 if (BlockWeights[BB]) {
1628 for (auto &I : *BB) {
1629 if (!isa<CallInst>(I) && !isa<InvokeInst>(I))
1630 continue;
1631 if (!cast<CallBase>(I).getCalledFunction()) {
1632 const DebugLoc &DLoc = I.getDebugLoc();
1633 if (!DLoc)
1634 continue;
1635 const DILocation *DIL = DLoc;
1636 const FunctionSamples *FS = findFunctionSamples(I);
1637 if (!FS)
1638 continue;
1640 auto T = FS->findCallTargetMapAt(CallSite);
1641 if (!T || T.get().empty())
1642 continue;
1644 // Prorate the callsite counts based on the pre-ICP distribution
1645 // factor to reflect what is already done to the callsite before
1646 // ICP, such as calliste cloning.
1647 if (std::optional<PseudoProbe> Probe = extractProbe(I)) {
1648 if (Probe->Factor < 1)
1649 T = SampleRecord::adjustCallTargets(T.get(), Probe->Factor);
1650 }
1651 }
1652 SmallVector<InstrProfValueData, 2> SortedCallTargets =
1654 uint64_t Sum = 0;
1655 for (const auto &C : T.get())
1656 Sum += C.second;
1657 // With CSSPGO all indirect call targets are counted torwards the
1658 // original indirect call site in the profile, including both
1659 // inlined and non-inlined targets.
1661 if (const FunctionSamplesMap *M =
1662 FS->findFunctionSamplesMapAt(CallSite)) {
1663 for (const auto &NameFS : *M)
1664 Sum += NameFS.second.getHeadSamplesEstimate();
1665 }
1666 }
1667 if (Sum)
1668 updateIDTMetaData(I, SortedCallTargets, Sum);
1669 else if (OverwriteExistingWeights)
1670 I.setMetadata(LLVMContext::MD_prof, nullptr);
1671 } else if (!isa<IntrinsicInst>(&I)) {
1672 I.setMetadata(LLVMContext::MD_prof,
1673 MDB.createBranchWeights(
1674 {static_cast<uint32_t>(BlockWeights[BB])}));
1675 }
1676 }
1678 // Set profile metadata (possibly annotated by LTO prelink) to zero or
1679 // clear it for cold code.
1680 for (auto &I : *BB) {
1681 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
1682 if (cast<CallBase>(I).isIndirectCall())
1683 I.setMetadata(LLVMContext::MD_prof, nullptr);
1684 else
1685 I.setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(0));
1686 }
1687 }
1688 }
1689
1690 Instruction *TI = BB->getTerminator();
1691 if (TI->getNumSuccessors() == 1)
1692 continue;
1693 if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI) &&
1694 !isa<IndirectBrInst>(TI))
1695 continue;
1696
1697 DebugLoc BranchLoc = TI->getDebugLoc();
1698 LLVM_DEBUG(dbgs() << "\nGetting weights for branch at line "
1699 << ((BranchLoc) ? Twine(BranchLoc.getLine())
1700 : Twine("<UNKNOWN LOCATION>"))
1701 << ".\n");
1703 uint32_t MaxWeight = 0;
1704 Instruction *MaxDestInst;
1705 // Since profi treats multiple edges (multiway branches) as a single edge,
1706 // we need to distribute the computed weight among the branches. We do
1707 // this by evenly splitting the edge weight among destinations.
1709 std::vector<uint64_t> EdgeIndex;
1711 EdgeIndex.resize(TI->getNumSuccessors());
1712 for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {
1713 const BasicBlock *Succ = TI->getSuccessor(I);
1714 EdgeIndex[I] = EdgeMultiplicity[Succ];
1715 EdgeMultiplicity[Succ]++;
1716 }
1717 }
1718 for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {
1719 BasicBlock *Succ = TI->getSuccessor(I);
1720 Edge E = std::make_pair(BB, Succ);
1721 uint64_t Weight = EdgeWeights[E];
1722 LLVM_DEBUG(dbgs() << "\t"; printEdgeWeight(dbgs(), E));
1723 // Use uint32_t saturated arithmetic to adjust the incoming weights,
1724 // if needed. Sample counts in profiles are 64-bit unsigned values,
1725 // but internally branch weights are expressed as 32-bit values.
1726 if (Weight > std::numeric_limits<uint32_t>::max()) {
1727 LLVM_DEBUG(dbgs() << " (saturated due to uint32_t overflow)");
1728 Weight = std::numeric_limits<uint32_t>::max();
1729 }
1730 if (!SampleProfileUseProfi) {
1731 // Weight is added by one to avoid propagation errors introduced by
1732 // 0 weights.
1733 Weights.push_back(static_cast<uint32_t>(Weight + 1));
1734 } else {
1735 // Profi creates proper weights that do not require "+1" adjustments but
1736 // we evenly split the weight among branches with the same destination.
1737 uint64_t W = Weight / EdgeMultiplicity[Succ];
1738 // Rounding up, if needed, so that first branches are hotter.
1739 if (EdgeIndex[I] < Weight % EdgeMultiplicity[Succ])
1740 W++;
1741 Weights.push_back(static_cast<uint32_t>(W));
1742 }
1743 if (Weight != 0) {
1744 if (Weight > MaxWeight) {
1745 MaxWeight = Weight;
1746 MaxDestInst = Succ->getFirstNonPHIOrDbgOrLifetime();
1747 }
1748 }
1749 }
1750
1751 misexpect::checkExpectAnnotations(*TI, Weights, /*IsFrontend=*/false);
1752
1753 uint64_t TempWeight;
1754 // Only set weights if there is at least one non-zero weight.
1755 // In any other case, let the analyzer set weights.
1756 // Do not set weights if the weights are present unless under
1757 // OverwriteExistingWeights. In ThinLTO, the profile annotation is done
1758 // twice. If the first annotation already set the weights, the second pass
1759 // does not need to set it. With OverwriteExistingWeights, Blocks with zero
1760 // weight should have their existing metadata (possibly annotated by LTO
1761 // prelink) cleared.
1762 if (MaxWeight > 0 &&
1763 (!TI->extractProfTotalWeight(TempWeight) || OverwriteExistingWeights)) {
1764 LLVM_DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n");
1765 TI->setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
1766 ORE->emit([&]() {
1767 return OptimizationRemark(DEBUG_TYPE, "PopularDest", MaxDestInst)
1768 << "most popular destination for conditional branches at "
1769 << ore::NV("CondBranchesLoc", BranchLoc);
1770 });
1771 } else {
1773 TI->setMetadata(LLVMContext::MD_prof, nullptr);
1774 LLVM_DEBUG(dbgs() << "CLEARED. All branch weights are zero.\n");
1775 } else {
1776 LLVM_DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n");
1777 }
1778 }
1779 }
1780}
1781
1782/// Once all the branch weights are computed, we emit the MD_prof
1783/// metadata on BB using the computed values for each of its branches.
1784///
/// Steps visible in this body: bail out when the profile cannot be applied
/// (ProbeManager reports it invalid for \p F, or getFunctionLoc(F) is 0), run
/// one of the two sample-profile inliners, call computeAndPropagateWeights(),
/// and call generateMDProfMetadata() only when one of those steps reported a
/// change. emitCoverageRemarks() runs on every path that is not an early
/// return.
///
1785/// \param F The function to query.
1786///
1787/// \returns true if \p F was modified. Returns false, otherwise.
/// Both early-return paths (invalid profile, function location of 0) return
/// false.
1788bool SampleProfileLoader::emitAnnotations(Function &F) {
1789 bool Changed = false;
1790
     // NOTE(review): the condition that opens this if/else (listing line 1791)
     // is not visible here; presumably it selects probe-based profiles, since
     // this arm validates the profile through ProbeManager -- confirm.
1792 if (!ProbeManager->profileIsValid(F, *Samples)) {
1793 LLVM_DEBUG(
1794 dbgs() << "Profile is invalid due to CFG mismatch for Function "
1795 << F.getName());
     // Record the mismatch and leave F untouched.
1796 ++NumMismatchedProfile;
1797 return false;
1798 }
1799 ++NumMatchedProfile;
1800 } else {
     // Skip the function when getFunctionLoc() reports 0 (presumably no usable
     // debug location -- confirm against getFunctionLoc()).
1801 if (getFunctionLoc(F) == 0)
1802 return false;
1803
1804 LLVM_DEBUG(dbgs() << "Line number for the first instruction in "
1805 << F.getName() << ": " << getFunctionLoc(F) << "\n");
1806 }
1807
     // Shared between the inliner call and computeAndPropagateWeights() below.
1808 DenseSet<GlobalValue::GUID> InlinedGUIDs;
     // NOTE(review): the condition choosing between the two inliners (listing
     // line 1809) is not visible here.
1810 Changed |= inlineHotFunctionsWithPriority(F, InlinedGUIDs);
1811 else
1812 Changed |= inlineHotFunctions(F, InlinedGUIDs);
1813
1814 Changed |= computeAndPropagateWeights(F, InlinedGUIDs);
1815
     // Metadata is only generated when an earlier step reported a change.
1816 if (Changed)
1817 generateMDProfMetadata(F);
1818
1819 emitCoverageRemarks(F);
1820 return Changed;
1821}
1822
/// Build a ProfiledCallGraph for \p M.
///
/// The graph is constructed either from the ContextTracker or from the
/// profiles held by the Reader. Every function defined in \p M that carries
/// the "use-sample-profile" attribute is then registered in the graph under
/// its canonical name.
///
/// \param M The module whose functions are added to the graph.
///
/// \returns the newly built graph; ownership passes to the caller.
1823std::unique_ptr<ProfiledCallGraph>
1824SampleProfileLoader::buildProfiledCallGraph(Module &M) {
1825 std::unique_ptr<ProfiledCallGraph> ProfiledCG;
     // NOTE(review): the condition selecting the ContextTracker-based
     // constructor (listing line 1826) is not visible here.
1827 ProfiledCG = std::make_unique<ProfiledCallGraph>(*ContextTracker);
1828 else
1829 ProfiledCG = std::make_unique<ProfiledCallGraph>(Reader->getProfiles());
1830
1831 // Add all functions into the profiled call graph even if they are not in
1832 // the profile. This makes sure functions missing from the profile still
1833 // get a chance to be processed.
1834 for (Function &F : M) {
     // Declarations and functions without the attribute are not processed.
1835 if (F.isDeclaration() || !F.hasFnAttribute("use-sample-profile"))
1836 continue;
1837 ProfiledCG->addProfiledFunction(FunctionSamples::getCanonicalFnName(F));
1838 }
1839
1840 return ProfiledCG;
1841}
1842
/// Compute the order in which the functions of \p M are processed.
///
/// Only defined functions carrying the "use-sample-profile" attribute are ever
/// added to the result. Three strategies are visible in this body:
///  - When ProfileTopDownLoad is off, functions are returned in module
///    iteration order and ProfileMergeInlinee is switched off.
///  - When the profiled call graph is used, the SCCs of the graph returned by
///    buildProfiledCallGraph() are walked and each node name is resolved to a
///    Function through SymbolMap.
///  - Otherwise the ref-SCCs of \p CG are walked in post-order.
/// In the latter two cases the collected list is reversed before it is
/// returned.
///
/// \param M  The module whose functions are ordered.
/// \param CG The static lazy call graph; only used by the last strategy.
///
/// \returns the list of functions to process, in processing order.
1843std::vector<Function *>
1844SampleProfileLoader::buildFunctionOrder(Module &M, LazyCallGraph &CG) {
1845 std::vector<Function *> FunctionOrderList;
1846 FunctionOrderList.reserve(M.size());
1847
     // NOTE(review): the condition guarding this warning (listing line 1848)
     // is not visible here.
1849 errs() << "WARNING: -use-profiled-call-graph ignored, should be used "
1850 "together with -sample-profile-top-down-load.\n";
1851
1852 if (!ProfileTopDownLoad) {
1853 if (ProfileMergeInlinee) {
1854 // Disable ProfileMergeInlinee if profile is not loaded in top down order,
1855 // because the profile for a function may be used for the profile
1856 // annotation of its outline copy before the profile merging of its
1857 // non-inlined inline instances, and that is not the way how
1858 // ProfileMergeInlinee is supposed to work.
1859 ProfileMergeInlinee = false;
1860 }
1861
     // Module order; this path returns before the reversal further down.
1862 for (Function &F : M)
1863 if (!F.isDeclaration() && F.hasFnAttribute("use-sample-profile"))
1864 FunctionOrderList.push_back(&F);
1865 return FunctionOrderList;
1866 }
1867
     // NOTE(review): the first line of this condition (listing line 1868) is
     // not visible here.
1869 !UseProfiledCallGraph.getNumOccurrences())) {
1870 // Use profiled call edges to augment the top-down order. There are cases
1871 // that the top-down order computed based on the static call graph doesn't
1872 // reflect real execution order. For example
1873 //
1874 // 1. Incomplete static call graph due to unknown indirect call targets.
1875 // Adjusting the order by considering indirect call edges from the
1876 // profile can enable the inlining of indirect call targets by allowing
1877 // the caller processed before them.
1878 // 2. Mutual call edges in an SCC. The static processing order computed for
1879 // an SCC may not reflect the call contexts in the context-sensitive
1880 // profile, thus may cause potential inlining to be overlooked. The
1881 // function order in one SCC is being adjusted to a top-down order based
1882 // on the profile to favor more inlining. This is only a problem with CS
1883 // profile.
1884 // 3. Transitive indirect call edges due to inlining. When a callee function
1885 // (say B) is inlined into a caller function (say A) in LTO prelink,
1886 // every call edge originated from the callee B will be transferred to
1887 // the caller A. If any transferred edge (say A->C) is indirect, the
1888 // original profiled indirect edge B->C, even if considered, would not
1889 // enforce a top-down order from the caller A to the potential indirect
1890 // call target C in LTO postlink since the inlined callee B is gone from
1891 // the static call graph.
1892 // 4. #3 can happen even for direct call targets, due to functions defined
1893 // in header files. A header function (say A), when included into source
1894 // files, is defined multiple times but only one definition survives due
1895 // to ODR. Therefore, the LTO prelink inlining done on those dropped
1896 // definitions can be useless based on a local file scope. More
1897 // importantly, the inlinee (say B), once fully inlined to a
1898 // to-be-dropped A, will have no profile to consume when its outlined
1899 // version is compiled. This can lead to a profile-less prelink
1900 // compilation for the outlined version of B which may be called from
1901 // external modules. While this isn't easy to fix, we rely on the
1902 // postlink AutoFDO pipeline to optimize B. Since the survived copy of
1903 // the A can be inlined in its local scope in prelink, it may not exist
1904 // in the merged IR in postlink, and we'll need the profiled call edges
1905 // to enforce a top-down order for the rest of the functions.
1906 //
1907 // Considering those cases, a profiled call graph completely independent of
1908 // the static call graph is constructed based on profile data, where
1909 // function objects are not even needed to handle case #3 and case #4.
1910 //
1911 // Note that static callgraph edges are completely ignored since they
1912 // can be conflicting with profiled edges for cyclic SCCs and may result in
1913 // an SCC order incompatible with profile-defined one. Using strictly
1914 // profile order ensures a maximum inlining experience. On the other hand,
1915 // static call edges are not so important when they don't correspond to a
1916 // context in the profile.
1917
1918 std::unique_ptr<ProfiledCallGraph> ProfiledCG = buildProfiledCallGraph(M);
1919 scc_iterator<ProfiledCallGraph *> CGI = scc_begin(ProfiledCG.get());
1920 while (!CGI.isAtEnd()) {
1921 auto Range = *CGI;
1922 if (SortProfiledSCC) {
1923 // Sort nodes in one SCC based on callsite hotness.
     // NOTE(review): the declaration of SI (listing line 1924) is not
     // visible here.
1925 Range = *SI;
1926 }
1927 for (auto *Node : Range) {
     // Graph nodes are names; names with no matching defined, attributed
     // function in SymbolMap are dropped.
1928 Function *F = SymbolMap.lookup(Node->Name);
1929 if (F && !F->isDeclaration() && F->hasFnAttribute("use-sample-profile"))
1930 FunctionOrderList.push_back(F);
1931 }
1932 ++CGI;
1933 }
1934 } else {
     // Fall back to the static call graph.
1935 CG.buildRefSCCs();
1936 for (LazyCallGraph::RefSCC &RC : CG.postorder_ref_sccs()) {
1937 for (LazyCallGraph::SCC &C : RC) {
1938 for (LazyCallGraph::Node &N : C) {
1939 Function &F = N.getFunction();
1940 if (!F.isDeclaration() && F.hasFnAttribute("use-sample-profile"))
1941 FunctionOrderList.push_back(&F);
1942 }
1943 }
1944 }
1945 }
1946
     // Both SCC walks above collect functions in the opposite of the desired
     // processing order, so flip the list.
1947 std::reverse(FunctionOrderList.begin(), FunctionOrderList.end());
1948
1949 LLVM_DEBUG({
1950 dbgs() << "Function processing order:\n";
1951 for (auto F : FunctionOrderList) {
1952 dbgs() << F->getName() << "\n";
1953 }
1954 });
1955
1956 return FunctionOrderList;
1957}
1958
/// Open and read the sample profile for \p M and set up per-module state.
///
/// Visible steps: create the profile reader for Filename (diagnosing and
/// returning false on failure), read the profile (same failure handling),
/// fetch the profile symbol list, optionally populate NamesInProfile from the
/// reader's name table, optionally create the replay inline advisor, adjust
/// the defaults of several command-line options for context-sensitive,
/// pre-inlined or probe-based profiles (only when the option was not given
/// explicitly), and create the ContextTracker, ProbeManager and
/// MatchingManager helpers.
///
/// NOTE(review): several lines of this function, including the end of its
/// parameter list, are absent from this listing (gaps in the listing line
/// numbers); the comments below describe only what is visible.
///
/// \param M The module being compiled.
///
/// \returns false when the profile cannot be opened or read, or when a
/// probe-based profile is applied to a module that is not probed; true
/// otherwise.
1959bool SampleProfileLoader::doInitialization(Module &M,
1961 auto &Ctx = M.getContext();
1962
1963 auto ReaderOrErr = SampleProfileReader::create(
1964 Filename, Ctx, *FS, FSDiscriminatorPass::Base, RemappingFilename);
1965 if (std::error_code EC = ReaderOrErr.getError()) {
1966 std::string Msg = "Could not open profile: " + EC.message();
1967 Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
1968 return false;
1969 }
1970 Reader = std::move(ReaderOrErr.get());
1971 Reader->setSkipFlatProf(LTOPhase == ThinOrFullLTOPhase::ThinLTOPostLink);
1972 // set module before reading the profile so reader may be able to only
1973 // read the function profiles which are used by the current module.
1974 Reader->setModule(&M);
1975 if (std::error_code EC = Reader->read()) {
1976 std::string Msg = "profile reading failed: " + EC.message();
1977 Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
1978 return false;
1979 }
1980
1981 PSL = Reader->getProfileSymbolList();
1982
1983 // While profile-sample-accurate is on, ignore symbol list.
1984 ProfAccForSymsInList =
1986 if (ProfAccForSymsInList) {
     // Rebuild the set of names known to the profile from the reader's name
     // table, if it has one.
1987 NamesInProfile.clear();
1988 if (auto NameTable = Reader->getNameTable())
1989 NamesInProfile.insert(NameTable->begin(), NameTable->end());
1990 CoverageTracker.setProfAccForSymsInList(true);
1991 }
1992
     // Inline replay needs a function analysis manager and a replay file.
1993 if (FAM && !ProfileInlineReplayFile.empty()) {
1994 ExternalInlineAdvisor = getReplayInlineAdvisor(
1995 M, *FAM, Ctx, /*OriginalAdvisor=*/nullptr,
2000 /*EmitRemarks=*/false, InlineContext{LTOPhase, InlinePass::ReplaySampleProfileInliner});
2001 }
2002
2003 // Apply tweaks if context-sensitive or probe-based profile is available.
     // Each tweak is guarded by getNumOccurrences() so that an explicit
     // command-line setting is never overridden.
2004 if (Reader->profileIsCS() || Reader->profileIsPreInlined() ||
2005 Reader->profileIsProbeBased()) {
2006 if (!UseIterativeBFIInference.getNumOccurrences())
2008 if (!SampleProfileUseProfi.getNumOccurrences())
2009 SampleProfileUseProfi = true;
2010 if (!EnableExtTspBlockPlacement.getNumOccurrences())
2012 // Enable priority-based inliner and size inline by default for CSSPGO.
2013 if (!ProfileSizeInline.getNumOccurrences())
2014 ProfileSizeInline = true;
2015 if (!CallsitePrioritizedInline.getNumOccurrences())
2017 // For CSSPGO, we also allow recursive inline to best use context profile.
2018 if (!AllowRecursiveInline.getNumOccurrences())
2019 AllowRecursiveInline = true;
2020
2021 if (Reader->profileIsPreInlined()) {
2022 if (!UsePreInlinerDecision.getNumOccurrences())
2023 UsePreInlinerDecision = true;
2024 }
2025
2026 if (!Reader->profileIsCS()) {
2027 // Non-CS profile should be fine without a function size budget for the
2028 // inliner since the contexts in the profile are either all from inlining
2029 // in the previous build or pre-computed by the preinliner with a size
2030 // cap, thus they are bounded.
2031 if (!ProfileInlineLimitMin.getNumOccurrences())
2032 ProfileInlineLimitMin = std::numeric_limits<unsigned>::max();
2033 if (!ProfileInlineLimitMax.getNumOccurrences())
2034 ProfileInlineLimitMax = std::numeric_limits<unsigned>::max();
2035 }
2036 }
2037
2038 if (Reader->profileIsCS()) {
2039 // Tracker for profiles under different context
2040 ContextTracker = std::make_unique<SampleContextTracker>(
2041 Reader->getProfiles(), &GUIDToFuncNameMap);
2042 }
2043
2044 // Load pseudo probe descriptors for probe-based function samples.
2045 if (Reader->profileIsProbeBased()) {
2046 ProbeManager = std::make_unique<PseudoProbeManager>(M);
     // A probe-based profile cannot be applied to an unprobed module: warn
     // and give up on this module.
2047 if (!ProbeManager->moduleIsProbed(M)) {
2048 const char *Msg =
2049 "Pseudo-probe-based profile requires SampleProfileProbePass";
2050 Ctx.diagnose(DiagnosticInfoSampleProfile(M.getModuleIdentifier(), Msg,
2051 DS_Warning));
2052 return false;
2053 }
2054 }
2055
     // NOTE(review): the condition guarding the creation of MatchingManager
     // (listing line 2056) is not visible here.
2057 MatchingManager =
2058 std::make_unique<SampleProfileMatcher>(M, *Reader, ProbeManager.get());
2059 }
2060
2061 return true;
2062}
2063
/// Accumulate profile-staleness statistics for one function.
///
/// The function first (under a condition not visible in this listing) counts
/// \p FS's total samples and checks the profile through ProbeManager; an
/// invalid profile is counted as a function-hash mismatch and ends the check.
/// Otherwise the non-intrinsic call instructions of \p F that have a debug
/// location are matched against the call targets and inlined callee samples
/// recorded in \p FS, and every profiled callsite location with no matching
/// IR callsite is counted as mismatched.
///
/// Only the matcher's counters (Total*/Mismatched*/NumMismatched*) are
/// updated; \p F and \p FS are not modified.
///
/// NOTE(review): listing lines 2066 and 2088 (the latter defines IRCallsite)
/// are not visible here.
///
/// \param F  The IR function to check.
/// \param FS The samples recorded for \p F in the profile.
2064void SampleProfileMatcher::detectProfileMismatch(const Function &F,
2065 const FunctionSamples &FS) {
2067 uint64_t Count = FS.getTotalSamples();
2068 TotalFuncHashSamples += Count;
2069 TotalProfiledFunc++;
2070 if (!ProbeManager->profileIsValid(F, FS)) {
     // The whole function profile is unusable; skip the callsite check.
2071 MismatchedFuncHashSamples += Count;
2072 NumMismatchedFuncHash++;
2073 return;
2074 }
2075 }
2076
     // Profile locations for which a matching IR callsite was found.
2077 std::unordered_set<LineLocation, LineLocationHash> MatchedCallsiteLocs;
2078
2079 // Go through all the callsites on the IR and flag the callsite if the target
2080 // name is the same as the one in the profile.
2081 for (auto &BB : F) {
2082 for (auto &I : BB) {
2083 if (!isa<CallBase>(&I) || isa<IntrinsicInst>(&I))
2084 continue;
2085
2086 const auto *CB = dyn_cast<CallBase>(&I);
     // Calls without a debug location cannot be mapped to a profile location.
2087 if (auto &DLoc = I.getDebugLoc()) {
2089
     // CalleeName stays empty when the call has no statically known callee.
2090 StringRef CalleeName;
2091 if (Function *Callee = CB->getCalledFunction())
2092 CalleeName = FunctionSamples::getCanonicalFnName(Callee->getName());
2093
2094 const auto CTM = FS.findCallTargetMapAt(IRCallsite);
2095 const auto CallsiteFS = FS.findFunctionSamplesMapAt(IRCallsite);
2096
2097 // Indirect call case.
2098 if (CalleeName.empty()) {
2099 // Since indirect call does not have the CalleeName, check
2100 // conservatively if callsite in the profile is a callsite location.
2101 // This is to avoid nums of false positive since otherwise all the
2102 // indirect call samples will be reported as mismatching.
2103 if ((CTM && !CTM->empty()) || (CallsiteFS && !CallsiteFS->empty()))
2104 MatchedCallsiteLocs.insert(IRCallsite);
2105 } else {
2106 // Check if the call target name is matched for direct call case.
2107 if ((CTM && CTM->count(CalleeName)) ||
2108 (CallsiteFS && CallsiteFS->count(CalleeName)))
2109 MatchedCallsiteLocs.insert(IRCallsite);
2110 }
2111 }
2112 }
2113 }
2114
     // Line offsets with bit 15 (0x8000) set are treated as invalid and are
     // excluded from the statistics below.
2115 auto isInvalidLineOffset = [](uint32_t LineOffset) {
2116 return LineOffset & 0x8000;
2117 };
2118
2119 // Check if there are any callsites in the profile that does not match to any
2120 // IR callsites, those callsite samples will be discarded.
2121 for (auto &I : FS.getBodySamples()) {
2122 const LineLocation &Loc = I.first;
2123 if (isInvalidLineOffset(Loc.LineOffset))
2124 continue;
2125
2126 uint64_t Count = I.second.getSamples();
     // Only body samples that recorded call targets count as callsites.
2127 if (!I.second.getCallTargets().empty()) {
2128 TotalCallsiteSamples += Count;
2129 TotalProfiledCallsites++;
2130 if (!MatchedCallsiteLocs.count(Loc)) {
2131 MismatchedCallsiteSamples += Count;
2132 NumMismatchedCallsites++;
2133 }
2134 }
2135 }
2136
     // Same accounting for callsite samples; the sample count of a location is
     // the sum of the head-sample estimates of the entries recorded there.
2137 for (auto &I : FS.getCallsiteSamples()) {
2138 const LineLocation &Loc = I.first;
2139 if (isInvalidLineOffset(Loc.LineOffset))
2140 continue;
2141
2142 uint64_t Count = 0;
2143 for (auto &FM : I.second) {
2144 Count += FM.second.getHeadSamplesEstimate();
2145 }
2146 TotalCallsiteSamples += Count;
2147 TotalProfiledCallsites++;
2148 if (!MatchedCallsiteLocs.count(Loc)) {
2149 MismatchedCallsiteSamples += Count;
2150 NumMismatchedCallsites++;
2151 }
2152 }
2153}
2154
/// Run the per-function mismatch detection over the whole module and publish
/// the accumulated statistics.
///
/// Every defined function of M that carries the "use-sample-profile"
/// attribute and has samples in the Reader is checked. The counters are then
/// printed to errs() and/or recorded as an operand of the "llvm.stats" named
/// metadata of M.
///
/// NOTE(review): the conditions guarding the report, the metadata emission and
/// the function-hash statistics (listing lines 2165-2166, 2181 and 2185-2186,
/// the last of which also declares ProfStatsVec) are not visible here.
2155void SampleProfileMatcher::detectProfileMismatch() {
2156 for (auto &F : M) {
2157 if (F.isDeclaration() || !F.hasFnAttribute("use-sample-profile"))
2158 continue;
2159 FunctionSamples *FS = Reader.getSamplesFor(F);
     // Functions without samples contribute nothing to the statistics.
2160 if (!FS)
2161 continue;
2162 detectProfileMismatch(F, *FS);
2163 }
2164
     // Human-readable report: "(mismatched/total)" pairs for functions and
     // samples, then for callsites and samples.
2167 errs() << "(" << NumMismatchedFuncHash << "/" << TotalProfiledFunc << ")"
2168 << " of functions' profile are invalid and "
2169 << " (" << MismatchedFuncHashSamples << "/" << TotalFuncHashSamples
2170 << ")"
2171 << " of samples are discarded due to function hash mismatch.\n";
2172 }
2173 errs() << "(" << NumMismatchedCallsites << "/" << TotalProfiledCallsites
2174 << ")"
2175 << " of callsites' profile are invalid and "
2176 << "(" << MismatchedCallsiteSamples << "/" << TotalCallsiteSamples
2177 << ")"
2178 << " of samples are discarded due to callsite location mismatch.\n";
2179 }
2180
2182 LLVMContext &Ctx = M.getContext();
2183 MDBuilder MDB(Ctx);
2184
     // Each statistic is stored as a (name, value) entry.
2187 ProfStatsVec.emplace_back("NumMismatchedFuncHash", NumMismatchedFuncHash);
2188 ProfStatsVec.emplace_back("TotalProfiledFunc", TotalProfiledFunc);
2189 ProfStatsVec.emplace_back("MismatchedFuncHashSamples",
2190 MismatchedFuncHashSamples);
2191 ProfStatsVec.emplace_back("TotalFuncHashSamples", TotalFuncHashSamples);
2192 }
2193
2194 ProfStatsVec.emplace_back("NumMismatchedCallsites", NumMismatchedCallsites);
2195 ProfStatsVec.emplace_back("TotalProfiledCallsites", TotalProfiledCallsites);
2196 ProfStatsVec.emplace_back("MismatchedCallsiteSamples",
2197 MismatchedCallsiteSamples);
2198 ProfStatsVec.emplace_back("TotalCallsiteSamples", TotalCallsiteSamples);
2199
     // Attach the statistics to the module as an operand of "llvm.stats".
2200 auto *MD = MDB.createLLVMStats(ProfStatsVec);
2201 auto *NMD = M.getOrInsertNamedMetadata("llvm.stats");
2202 NMD->addOperand(MD);
2203 }
2204}
2205
/// Apply the loaded sample profile to every eligible function of \p M.
///
/// Visible steps: install the profile summary on the module if it has none,
/// sum the total samples of all profiles into TotalCollectedSamples, populate
/// SymbolMap (original, stripped and remapped names), run the profile
/// mismatch detection, process each function returned by buildFunctionOrder()
/// through runOnFunction(), and finally update the callee profile for the
/// entries of notInlinedCallInfo.
///
/// NOTE(review): listing lines 2214, 2229 (defines NewName), 2251 and 2262
/// are not visible here.
///
/// \param M    The module to annotate.
/// \param AM   Module analysis manager, forwarded to runOnFunction().
/// \param _PSI Profile summary info; stored in the PSI member.
/// \param CG   Static call graph, forwarded to buildFunctionOrder().
///
/// \returns true if runOnFunction() reported a change for any function.
2206bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM,
2207 ProfileSummaryInfo *_PSI,
2208 LazyCallGraph &CG) {
2209 GUIDToFuncNameMapper Mapper(M, *Reader, GUIDToFuncNameMap);
2210
2211 PSI = _PSI;
     // Only install the reader's summary when the module has no (non-CS)
     // profile summary yet, then refresh PSI.
2212 if (M.getProfileSummary(/* IsCS */ false) == nullptr) {
2213 M.setProfileSummary(Reader->getSummary().getMD(M.getContext()),
2215 PSI->refresh();
2216 }
2217 // Compute the total number of samples collected in this profile.
2218 for (const auto &I : Reader->getProfiles())
2219 TotalCollectedSamples += I.second.getTotalSamples();
2220
2221 auto Remapper = Reader->getRemapper();
2222 // Populate the symbol map.
2223 for (const auto &N_F : M.getValueSymbolTable()) {
2224 StringRef OrigName = N_F.getKey();
2225 Function *F = dyn_cast<Function>(N_F.getValue());
     // Skip symbols that are not functions, and unnamed ones.
2226 if (F == nullptr || OrigName.empty())
2227 continue;
2228 SymbolMap[OrigName] = F;
2230 if (OrigName != NewName && !NewName.empty()) {
2231 auto r = SymbolMap.insert(std::make_pair(NewName, F));
2232 // Failing to insert means there is already an entry in SymbolMap,
2233 // thus there are multiple functions that are mapped to the same
2234 // stripped name. In this case of name conflicting, set the value
2235 // to nullptr to avoid confusion.
2236 if (!r.second)
2237 r.first->second = nullptr;
     // The remapper lookup below uses the new name.
2238 OrigName = NewName;
2239 }
2240 // Insert the remapped names into SymbolMap.
2241 if (Remapper) {
2242 if (auto MapName = Remapper->lookUpNameInProfile(OrigName)) {
2243 if (*MapName != OrigName && !MapName->empty())
2244 SymbolMap.insert(std::make_pair(*MapName, F));
2245 }
2246 }
2247 }
2248 assert(SymbolMap.count(StringRef()) == 0 &&
2249 "No empty StringRef should be added in SymbolMap");
2250
2252 MatchingManager->detectProfileMismatch();
2253
2254 bool retval = false;
2255 for (auto *F : buildFunctionOrder(M, CG)) {
2256 assert(!F->isDeclaration());
     // Per-function state is reset before each function is processed.
2257 clearFunctionData();
2258 retval |= runOnFunction(*F, AM);
2259 }
2260
2261 // Account for cold calls not inlined....
2263 for (const std::pair<Function *, NotInlinedProfileInfo> &pair :
2264 notInlinedCallInfo)
2265 updateProfileCallee(pair.first, pair.second.entryCount);
2266
2267 return retval;
2268}
2269
/// Apply the sample profile to a single function.
///
/// Visible steps: choose an initial entry count for \p F (-1 by default, 0
/// when the profile is declared accurate for it), set it only if \p F has no
/// entry count yet, select the OptimizationRemarkEmitter, look up the samples
/// for \p F, and call emitAnnotations() when samples exist and are not empty.
///
/// NOTE(review): listing lines 2309 (defines CanonName), 2321, 2323 and 2329
/// are not visible here.
///
/// \param F  The function to process.
/// \param AM Module analysis manager; when null, a locally owned
///           OptimizationRemarkEmitter is created for \p F instead.
///
/// \returns the result of emitAnnotations(), or false when there are no
/// samples for \p F.
2270bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) {
2271 LLVM_DEBUG(dbgs() << "\n\nProcessing Function " << F.getName() << "\n");
2272 DILocation2SampleMap.clear();
2273 // By default the entry count is initialized to -1, which will be treated
2274 // conservatively by getEntryCount as the same as unknown (None). This is
2275 // to avoid newly added code to be treated as cold. If we have samples
2276 // this will be overwritten in emitAnnotations.
2277 uint64_t initialEntryCount = -1;
2278
2279 ProfAccForSymsInList = ProfileAccurateForSymsInList && PSL;
2280 if (ProfileSampleAccurate || F.hasFnAttribute("profile-sample-accurate")) {
2281 // initialize all the function entry counts to 0. It means all the
2282 // functions without profile will be regarded as cold.
2283 initialEntryCount = 0;
2284 // profile-sample-accurate is a user assertion which has a higher precedence
2285 // than symbol list. When profile-sample-accurate is on, ignore symbol list.
2286 ProfAccForSymsInList = false;
2287 }
2288 CoverageTracker.setProfAccForSymsInList(ProfAccForSymsInList);
2289
2290 // PSL -- profile symbol list include all the symbols in sampled binary.
2291 // If ProfileAccurateForSymsInList is enabled, PSL is used to treat
2292 // old functions without samples being cold, without having to worry
2293 // about new and hot functions being mistakenly treated as cold.
2294 if (ProfAccForSymsInList) {
2295 // Initialize the entry count to 0 for functions in the list.
2296 if (PSL->contains(F.getName()))
2297 initialEntryCount = 0;
2298
2299 // Function in the symbol list but without sample will be regarded as
2300 // cold. To minimize the potential negative performance impact it could
2301 // have, we want to be a little conservative here saying if a function
2302 // shows up in the profile, no matter as outline function, inline instance
2303 // or call targets, treat the function as not being cold. This will handle
2304 // the cases such as most callsites of a function are inlined in sampled
2305 // binary but not inlined in current build (because of source code drift,
2306 // imprecise debug information, or the callsites are all cold individually
2307 // but not cold accumulatively...), so the outline function showing up as
2308 // cold in sampled binary will actually not be cold after current build.
2310 if (NamesInProfile.count(CanonName))
2311 initialEntryCount = -1;
2312 }
2313
2314 // Initialize entry count when the function has no existing entry
2315 // count value.
2316 if (!F.getEntryCount())
2317 F.setEntryCount(ProfileCount(initialEntryCount, Function::PCT_Real));
     // OwnedORE keeps a locally created emitter alive until this function
     // returns; it stays empty when an analysis manager is available.
2318 std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;
2319 if (AM) {
2320 auto &FAM =
2322 .getManager();
2324 } else {
2325 OwnedORE = std::make_unique<OptimizationRemarkEmitter>(&F);
2326 ORE = OwnedORE.get();
2327 }
2328
     // Samples come either from the ContextTracker or directly from the
     // Reader.
2330 Samples = ContextTracker->getBaseSamplesFor(F);
2331 else
2332 Samples = Reader->getSamplesFor(F);
2333
2334 if (Samples && !Samples->empty())
2335 return emitAnnotations(F);
2336 return false;
2337}
// NOTE(review): the first line of this constructor definition (listing line
// 2338) and part of its parameter list (line 2340) are not visible here. The
// visible initializers copy File and RemappingFile into ProfileFileName and
// ProfileRemappingFileName, store LTOPhase, and move FS into the FS member.
2339 std::string File, std::string RemappingFile, ThinOrFullLTOPhase LTOPhase,
2341 : ProfileFileName(File), ProfileRemappingFileName(RemappingFile),
2342 LTOPhase(LTOPhase), FS(std::move(FS)) {}
2343
2348
     // NOTE(review): the signature of this function (listing lines 2344-2347),
     // the bodies of the three lambdas below, and lines 2360 and 2371-2372 are
     // not visible here; presumably this is the pass's run() entry point --
     // confirm.
     //
     // Per-function analysis getters handed to the loader.
2349 auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
2351 };
2352 auto GetTTI = [&](Function &F) -> TargetTransformInfo & {
2354 };
2355 auto GetTLI = [&](Function &F) -> const TargetLibraryInfo & {
2357 };
2358
2359 if (!FS)
2361
     // File names given to the pass take precedence; the command-line options
     // are used only when the corresponding name is empty.
2362 SampleProfileLoader SampleLoader(
2363 ProfileFileName.empty() ? SampleProfileFile : ProfileFileName,
2364 ProfileRemappingFileName.empty() ? SampleProfileRemappingFile
2365 : ProfileRemappingFileName,
2366 LTOPhase, FS, GetAssumptionCache, GetTTI, GetTLI);
2367
     // If initialization fails, report all analyses as preserved.
2368 if (!SampleLoader.doInitialization(M, &FAM))
2369 return PreservedAnalyses::all();
2370
     // Likewise when the loader reports that it changed nothing.
2373 if (!SampleLoader.runOnModule(M, &AM, PSI, CG))
2374 return PreservedAnalyses::all();
2375
2376 return PreservedAnalyses::none();
2377}
This file defines the StringMap class.
amdgpu Simplify well known AMD library false FunctionCallee Callee
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:678
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
std::string Name
static bool runOnFunction(Function &F, bool PostInlining)
Provides ErrorOr<T> smart pointer.
static cl::opt< unsigned > SizeLimit("eif-limit", cl::init(6), cl::Hidden, cl::desc("Size limit in Hexagon early if-conversion"))
LVReader * CurrentReader
Definition: LVReader.cpp:153
Implements a lazy call graph analysis and related passes for the new pass manager.
Statically lint checks LLVM IR
Definition: Lint.cpp:746
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
This file implements a map that provides insertion order iteration.
static const Function * getCalledFunction(const Value *V, bool &IsNoBuiltin)
Module.h This file contains the declarations for the Module class.
LLVMContext & Context
FunctionAnalysisManager FAM
This header defines various interfaces for pass management in LLVM.
This file defines the PriorityQueue class.
This builds on the llvm/ADT/GraphTraits.h file to find the strongly connected components (SCCs) of a ...
@ SI
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file provides the interface for context-sensitive profile tracker used by CSSPGO.
This file provides the interface for the sampled PGO profile loader base implementation.
This file provides the utility functions for the sampled PGO loader base implementation.
This file provides the interface for the pseudo probe implementation for AutoFDO.
static cl::opt< std::string > SampleProfileFile("sample-profile-file", cl::init(""), cl::value_desc("filename"), cl::desc("Profile file loaded by -sample-profile"), cl::Hidden)
static cl::opt< bool > ProfileSampleBlockAccurate("profile-sample-block-accurate", cl::Hidden, cl::init(false), cl::desc("If the sample profile is accurate, we will mark all un-sampled " "branches and calls as having 0 samples. Otherwise, treat " "them conservatively as unknown. "))
static cl::opt< unsigned > MaxNumPromotions("sample-profile-icp-max-prom", cl::init(3), cl::Hidden, cl::desc("Max number of promotions for a single indirect " "call callsite in sample profile loader"))
static cl::opt< ReplayInlinerSettings::Fallback > ProfileInlineReplayFallback("sample-profile-inline-replay-fallback", cl::init(ReplayInlinerSettings::Fallback::Original), cl::values(clEnumValN(ReplayInlinerSettings::Fallback::Original, "Original", "All decisions not in replay send to original advisor (default)"), clEnumValN(ReplayInlinerSettings::Fallback::AlwaysInline, "AlwaysInline", "All decisions not in replay are inlined"), clEnumValN(ReplayInlinerSettings::Fallback::NeverInline, "NeverInline", "All decisions not in replay are not inlined")), cl::desc("How sample profile inline replay treats sites that don't come " "from the replay. Original: defers to original advisor, " "AlwaysInline: inline all sites not in replay, NeverInline: " "inline no sites not in replay"), cl::Hidden)
static cl::opt< bool > OverwriteExistingWeights("overwrite-existing-weights", cl::Hidden, cl::init(false), cl::desc("Ignore existing branch weights on IR and always overwrite."))
static void updateIDTMetaData(Instruction &Inst, const SmallVectorImpl< InstrProfValueData > &CallTargets, uint64_t Sum)
Update indirect call target profile metadata for Inst.
static cl::opt< bool > AnnotateSampleProfileInlinePhase("annotate-sample-profile-inline-phase", cl::Hidden, cl::init(false), cl::desc("Annotate LTO phase (prelink / postlink), or main (no LTO) for " "sample-profile inline pass name."))
static cl::opt< std::string > ProfileInlineReplayFile("sample-profile-inline-replay", cl::init(""), cl::value_desc("filename"), cl::desc("Optimization remarks file containing inline remarks to be replayed " "by inlining from sample profile loader."), cl::Hidden)
static cl::opt< bool > ProfileMergeInlinee("sample-profile-merge-inlinee", cl::Hidden, cl::init(true), cl::desc("Merge past inlinee's profile to outline version if sample " "profile loader decided not to inline a call site. It will " "only be enabled when top-down order of profile loading is " "enabled. "))
static cl::opt< bool > PersistProfileStaleness("persist-profile-staleness", cl::Hidden, cl::init(false), cl::desc("Compute stale profile statistical metrics and write it into the " "native object file(.llvm_stats section)."))
static bool doesHistoryAllowICP(const Instruction &Inst, StringRef Candidate)
Check whether the indirect call promotion history of Inst allows the promotion for Candidate.
static SmallVector< InstrProfValueData, 2 > GetSortedValueDataFromCallTargets(const SampleRecord::CallTargetMap &M)
Returns the sorted CallTargetMap M by count in descending order.
#define CSINLINE_DEBUG
static cl::opt< bool > UseProfiledCallGraph("use-profiled-call-graph", cl::init(true), cl::Hidden, cl::desc("Process functions in a top-down order " "defined by the profiled call graph when " "-sample-profile-top-down-load is on."))
static cl::opt< ReplayInlinerSettings::Scope > ProfileInlineReplayScope("sample-profile-inline-replay-scope", cl::init(ReplayInlinerSettings::Scope::Function), cl::values(clEnumValN(ReplayInlinerSettings::Scope::Function, "Function", "Replay on functions that have remarks associated " "with them (default)"), clEnumValN(ReplayInlinerSettings::Scope::Module, "Module", "Replay on the entire module")), cl::desc("Whether inline replay should be applied to the entire " "Module or just the Functions (default) that are present as " "callers in remarks during sample profile inlining."), cl::Hidden)
static cl::opt< unsigned > ProfileICPRelativeHotness("sample-profile-icp-relative-hotness", cl::Hidden, cl::init(25), cl::desc("Relative hotness percentage threshold for indirect " "call promotion in proirity-based sample profile loader inlining."))
Function::ProfileCount ProfileCount
static cl::opt< unsigned > ProfileICPRelativeHotnessSkip("sample-profile-icp-relative-hotness-skip", cl::Hidden, cl::init(1), cl::desc("Skip relative hotness check for ICP up to given number of targets."))
static cl::opt< bool > ReportProfileStaleness("report-profile-staleness", cl::Hidden, cl::init(false), cl::desc("Compute and report stale profile statistical metrics."))
static cl::opt< bool > UsePreInlinerDecision("sample-profile-use-preinliner", cl::Hidden, cl::desc("Use the preinliner decisions stored in profile context."))
static cl::opt< bool > ProfileAccurateForSymsInList("profile-accurate-for-symsinlist", cl::Hidden, cl::init(true), cl::desc("For symbols in profile symbol list, regard their profiles to " "be accurate. It may be overriden by profile-sample-accurate. "))
#define DEBUG_TYPE
static cl::opt< bool > DisableSampleLoaderInlining("disable-sample-loader-inlining", cl::Hidden, cl::init(false), cl::desc("If true, artifically skip inline transformation in sample-loader " "pass, and merge (or scale) profiles (as configured by " "--sample-profile-merge-inlinee)."))
static cl::opt< bool > ProfileSizeInline("sample-profile-inline-size", cl::Hidden, cl::init(false), cl::desc("Inline cold call sites in profile loader if it's beneficial " "for code size."))
static cl::opt< bool > ProfileTopDownLoad("sample-profile-top-down-load", cl::Hidden, cl::init(true), cl::desc("Do profile annotation and inlining for functions in top-down " "order of call graph during sample profile loading. It only " "works for new pass manager. "))
static cl::opt< bool > ProfileSampleAccurate("profile-sample-accurate", cl::Hidden, cl::init(false), cl::desc("If the sample profile is accurate, we will mark all un-sampled " "callsite and function as having 0 samples. Otherwise, treat " "un-sampled callsites and functions conservatively as unknown. "))
static cl::opt< bool > AllowRecursiveInline("sample-profile-recursive-inline", cl::Hidden, cl::desc("Allow sample loader inliner to inline recursive calls."))
static cl::opt< CallSiteFormat::Format > ProfileInlineReplayFormat("sample-profile-inline-replay-format", cl::init(CallSiteFormat::Format::LineColumnDiscriminator), cl::values(clEnumValN(CallSiteFormat::Format::Line, "Line", "<Line Number>"), clEnumValN(CallSiteFormat::Format::LineColumn, "LineColumn", "<Line Number>:<Column Number>"), clEnumValN(CallSiteFormat::Format::LineDiscriminator, "LineDiscriminator", "<Line Number>.<Discriminator>"), clEnumValN(CallSiteFormat::Format::LineColumnDiscriminator, "LineColumnDiscriminator", "<Line Number>:<Column Number>.<Discriminator> (default)")), cl::desc("How sample profile inline replay file is formatted"), cl::Hidden)
static cl::opt< std::string > SampleProfileRemappingFile("sample-profile-remapping-file", cl::init(""), cl::value_desc("filename"), cl::desc("Profile remapping file loaded by -sample-profile"), cl::Hidden)
static cl::opt< bool > CallsitePrioritizedInline("sample-profile-prioritized-inline", cl::Hidden, cl::desc("Use call site prioritized inlining for sample profile loader." "Currently only CSSPGO is supported."))
This file provides the interface for the sampled PGO loader pass.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
This pass exposes codegen information to IR-level passes.
Defines the virtual file system interface vfs::FileSystem.
Value * RHS
Value * LHS
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:620
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:774
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition: BasicBlock.h:56
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:112
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1186
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1408
This class represents a function call, abstracting a target machine's calling convention.
Debug location.
A debug info location.
Definition: DebugLoc.h:33
unsigned getLine() const
Definition: DebugLoc.cpp:24
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&... Args)
Definition: DenseMap.h:235
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:151
iterator end()
Definition: DenseMap.h:84
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
Diagnostic information for the sample profiler.
Represents either an error or a value T.
Definition: ErrorOr.h:56
Class to represent profile counts.
Definition: Function.h:252
DISubprogram * getSubprogram() const
Get the attached subprogram.
Definition: Metadata.cpp:1625
bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition: Globals.cpp:275
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:652
Represents the cost of inlining a function.
Definition: InlineCost.h:89
static InlineCost getNever(const char *Reason, std::optional< CostBenefitPair > CostBenefit=std::nullopt)
Definition: InlineCost.h:130
static InlineCost getAlways(const char *Reason, std::optional< CostBenefitPair > CostBenefit=std::nullopt)
Definition: InlineCost.h:125
static InlineCost get(int Cost, int Threshold, int StaticBonus=0)
Definition: InlineCost.h:119
This class captures the data input to the InlineFunction call, and records the auxiliary results prod...
Definition: Cloning.h:203
InlineResult is basically true or false.
Definition: InlineCost.h:179
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
Definition: PassManager.h:933
unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:358
bool extractProfTotalWeight(uint64_t &TotalVal) const
Retrieve total raw weight values of a branch.
Definition: Metadata.cpp:1539
const BasicBlock * getParent() const
Definition: Instruction.h:90
BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1455
A smart pointer to a reference-counted object that inherits from RefCountedBase or ThreadSafeRefCount...
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
An analysis pass which computes the call graph for a module.
A node in the call graph.
A RefSCC of the call graph.
An SCC of the call graph.
A lazily constructed view of the call graph of a module.
iterator_range< postorder_ref_scc_iterator > postorder_ref_sccs()
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:37
VectorType::iterator erase(typename VectorType::iterator Iterator)
Remove the element given by Iterator.
Definition: MapVector.h:173
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: MapVector.h:118
ValueT lookup(const KeyT &Key) const
Definition: MapVector.h:111
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
Diagnostic information for optimization analysis remarks.
Diagnostic information for applied optimization remarks.
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:152
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition: PassManager.h:155
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:158
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Analysis providing profile information.
virtual ErrorOr< uint64_t > getInstWeight(const InstructionT &Inst)
Get the weight for an instruction.
virtual const FunctionSamples * findFunctionSamples(const InstructionT &I) const
Get the FunctionSamples for an instruction.
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
SampleProfileLoaderPass(std::string File="", std::string RemappingFile="", ThinOrFullLTOPhase LTOPhase=ThinOrFullLTOPhase::None, IntrusiveRefCntPtr< vfs::FileSystem > FS=nullptr)
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:577
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:941
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:809
void push_back(const T &Elt)
Definition: SmallVector.h:416
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1200
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition: StringMap.h:111
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
constexpr bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:134
StringSet - A wrapper for StringMap that provides set-like functionality.
Definition: StringSet.h:23
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
LLVM Value Representation.
Definition: Value.h:74
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:308
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
Representation of the samples collected for a function.
Definition: SampleProf.h:726
static uint64_t getGUID(StringRef Name)
Definition: SampleProf.h:1152
void findInlinedFunctions(DenseSet< GlobalValue::GUID > &S, const StringMap< Function * > &SymbolMap, uint64_t Threshold) const
Recursively traverses all children, if the total sample count of the corresponding function is no les...
Definition: SampleProf.h:995
static StringRef getCanonicalFnName(const Function &F)
Return the canonical name for a function, taking into account suffix elision policy attributes.
Definition: SampleProf.h:1036
StringRef getFuncName() const
Return the original function name.
Definition: SampleProf.h:1028
SampleContext & getContext() const
Definition: SampleProf.h:1132
sampleprof_error merge(const FunctionSamples &Other, uint64_t Weight=1)
Merge the samples in Other into this one.
Definition: SampleProf.h:955
static LineLocation getCallSiteIdentifier(const DILocation *DIL, bool ProfileIsFS=false)
Returns a unique call site identifier for a given debug location of a call instruction.
Definition: SampleProf.cpp:221
uint64_t getHeadSamplesEstimate() const
Return an estimate of the sample count of the function entry basic block.
Definition: SampleProf.h:906
StringRef getName() const
Return the function name.
Definition: SampleProf.h:1025
const BodySampleMap & getBodySamples() const
Return all the samples collected in the body of the function.
Definition: SampleProf.h:930
static bool UseMD5
Whether the profile uses MD5 to represent string.
Definition: SampleProf.h:1137
bool hasAttribute(ContextAttributeMask A)
Definition: SampleProf.h:596
Sample-based profile reader.
static ErrorOr< std::unique_ptr< SampleProfileReader > > create(const std::string Filename, LLVMContext &C, vfs::FileSystem &FS, FSDiscriminatorPass P=FSDiscriminatorPass::Base, const std::string RemapFilename="")
Create a sample profile reader appropriate to the file format.
static const SortedCallTargetSet SortCallTargets(const CallTargetMap &Targets)
Sort call targets in descending order of call frequency.
Definition: SampleProf.h:406
static const CallTargetMap adjustCallTargets(const CallTargetMap &Targets, float DistributionFactor)
Prorate call targets by a distribution factor.
Definition: SampleProf.h:415
Enumerate the SCCs of a directed graph in reverse topological order of the SCC DAG.
Definition: SCCIterator.h:48
bool isAtEnd() const
Direct loop termination test which is more efficient than comparison with end().
Definition: SCCIterator.h:112
Sort the nodes of a directed SCC in the decreasing order of the edge weights.
Definition: SCCIterator.h:252
const CustomOperand< const MCSubtargetInfo & > Msg[]
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ FS
Definition: X86.h:208
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:703
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:445
void checkExpectAnnotations(Instruction &I, const ArrayRef< uint32_t > ExistingWeights, bool IsFrontend)
checkExpectAnnotations - compares PGO counters to the thresholds used for llvm.expect and warns if th...
Definition: MisExpect.cpp:202
DenseMap< SymbolStringPtr, JITEvaluatedSymbol > SymbolMap
A map from symbol names (as SymbolStringPtrs) to JITSymbols (address/flags pairs).
Definition: Core.h:118
DiagnosticInfoOptimizationBase::Argument NV
CallBase & promoteIndirectCall(CallBase &CB, Function *F, uint64_t Count, uint64_t TotalCount, bool AttachProfToDirectCall, OptimizationRemarkEmitter *ORE)
std::map< std::string, FunctionSamples, std::less<> > FunctionSamplesMap
Definition: SampleProf.h:718
bool callsiteIsHot(const FunctionSamples *CallsiteFS, ProfileSummaryInfo *PSI, bool ProfAccForSymsInList)
Return true if the given callsite is hot wrt to hot cutoff threshold.
IntrusiveRefCntPtr< FileSystem > getRealFileSystem()
Gets a vfs::FileSystem for the 'real' file system, as seen by the operating system.
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
static bool isIndirectCall(const MachineInstr &MI)
bool getValueProfDataFromInst(const Instruction &Inst, InstrProfValueKind ValueKind, uint32_t MaxNumValueData, InstrProfValueData ValueData[], uint32_t &ActualNumValueData, uint64_t &TotalC, bool GetNoICPValue=false)
Extract the value profile data from Inst which is annotated with value profile meta data.
Definition: InstrProf.cpp:1063
bool isLegalToPromote(const CallBase &CB, Function *Callee, const char **FailureReason=nullptr)
Return true if the given indirect call site can be made to call Callee.
cl::opt< int > ProfileInlineLimitMin("sample-profile-inline-limit-min", cl::Hidden, cl::init(100), cl::desc("The lower bound of size growth limit for " "proirity-based sample profile loader inlining."))
cl::opt< int > ProfileInlineGrowthLimit("sample-profile-inline-growth-limit", cl::Hidden, cl::init(12), cl::desc("The size growth ratio limit for proirity-based sample profile " "loader inlining."))
scc_iterator< T > scc_begin(const T &G)
Construct the begin iterator for a deduced graph type T.
Definition: SCCIterator.h:232
void setProbeDistributionFactor(Instruction &Inst, float Factor)
Definition: PseudoProbe.cpp:66
std::string AnnotateInlinePassName(InlineContext IC)
ThinOrFullLTOPhase
This enumerates the LLVM full LTO or ThinLTO optimization phases.
Definition: Pass.h:73
InlineCost getInlineCost(CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI, function_ref< AssumptionCache &(Function &)> GetAssumptionCache, function_ref< const TargetLibraryInfo &(Function &)> GetTLI, function_ref< BlockFrequencyInfo &(Function &)> GetBFI=nullptr, ProfileSummaryInfo *PSI=nullptr, OptimizationRemarkEmitter *ORE=nullptr)
Get an InlineCost object representing the cost of inlining this callsite.
cl::opt< bool > SampleProfileUseProfi
void annotateValueSite(Module &M, Instruction &Inst, const InstrProfRecord &InstrProfR, InstrProfValueKind ValueKind, uint32_t SiteIndx, uint32_t MaxMDCount=3)
Get the value profile data for value site SiteIdx from InstrProfR and annotate the instruction Inst w...
Definition: InstrProf.cpp:1018
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1730
llvm::cl::opt< bool > UseIterativeBFIInference
std::optional< PseudoProbe > extractProbe(const Instruction &Inst)
Definition: PseudoProbe.cpp:49
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void emitInlinedIntoBasedOnCost(OptimizationRemarkEmitter &ORE, DebugLoc DLoc, const BasicBlock *Block, const Function &Callee, const Function &Caller, const InlineCost &IC, bool ForProfileContext=false, const char *PassName=nullptr)
Emit ORE message based on cost (default heuristic).
cl::opt< bool > SortProfiledSCC("sort-profiled-scc-member", cl::init(true), cl::Hidden, cl::desc("Sort profiled recursion by edge weights."))
std::unique_ptr< InlineAdvisor > getReplayInlineAdvisor(Module &M, FunctionAnalysisManager &FAM, LLVMContext &Context, std::unique_ptr< InlineAdvisor > OriginalAdvisor, const ReplayInlinerSettings &ReplaySettings, bool EmitRemarks, InlineContext IC)
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:124
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
cl::opt< int > ProfileInlineLimitMax("sample-profile-inline-limit-max", cl::Hidden, cl::init(10000), cl::desc("The upper bound of size growth limit for " "proirity-based sample profile loader inlining."))
cl::opt< int > SampleHotCallSiteThreshold("sample-profile-hot-inline-threshold", cl::Hidden, cl::init(3000), cl::desc("Hot callsite threshold for proirity-based sample profile loader " "inlining."))
void updateProfileCallee(Function *Callee, int64_t EntryDelta, const ValueMap< const Value *, WeakTrackingVH > *VMap=nullptr)
Updates profile information by adjusting the entry count by adding EntryDelta then scaling callsite i...
InlineResult InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, bool MergeAttributes=false, AAResults *CalleeAAR=nullptr, bool InsertLifetime=true, Function *ForwardVarArgsTo=nullptr)
This function inlines the called function into the basic block of the caller.
InlineParams getInlineParams()
Generate the parameters to tune the inline cost analysis based only on the commandline options.
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1909
@ DS_Warning
cl::opt< bool > EnableExtTspBlockPlacement
const uint64_t NOMORE_ICP_MAGICNUM
Magic number in the value profile metadata showing a target has been promoted for the instruction and...
Definition: Metadata.h:56
cl::opt< int > SampleColdCallSiteThreshold("sample-profile-cold-inline-threshold", cl::Hidden, cl::init(45), cl::desc("Threshold for inlining cold callsites"))
Definition: BitVector.h:851
#define N
Used in the streaming interface as the general argument type.
Provides context on when an inline advisor is constructed in the pipeline (e.g., link phase,...
Definition: InlineAdvisor.h:60
Thresholds to tune inline cost analysis.
Definition: InlineCost.h:205
std::optional< bool > AllowRecursiveCall
Indicate whether we allow inlining for recursive call.
Definition: InlineCost.h:238
std::optional< bool > ComputeFullInlineCost
Compute inline cost even when the cost has exceeded the threshold.
Definition: InlineCost.h:232
Represents the relative location of an instruction.
Definition: SampleProf.h:283