LLVM 19.0.0git
InlineCost.cpp
Go to the documentation of this file.
1//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements inline cost analysis.
10//
11//===----------------------------------------------------------------------===//
12
14#include "llvm/ADT/STLExtras.h"
15#include "llvm/ADT/SetVector.h"
18#include "llvm/ADT/Statistic.h"
31#include "llvm/Config/llvm-config.h"
33#include "llvm/IR/CallingConv.h"
34#include "llvm/IR/DataLayout.h"
35#include "llvm/IR/Dominators.h"
37#include "llvm/IR/GlobalAlias.h"
38#include "llvm/IR/InstVisitor.h"
40#include "llvm/IR/Operator.h"
43#include "llvm/Support/Debug.h"
46#include <climits>
47#include <limits>
48#include <optional>
49
50using namespace llvm;
51
52#define DEBUG_TYPE "inline-cost"
53
54STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");
55
// Baseline inlining threshold used when no more specific threshold applies.
// NOTE(review): the flag is spelled "inlinedefault-threshold" (no separator
// before "default"), unlike the other "inline-*" flags in this file.
static cl::opt<int>
    DefaultThreshold("inlinedefault-threshold", cl::Hidden, cl::init(225),
                     cl::desc("Default amount of inlining to perform"));
59
60// We introduce this option since there is a minor compile-time win by avoiding
61// addition of TTI attributes (target-features in particular) to inline
62// candidates when they are guaranteed to be the same as top level methods in
63// some use cases. If we avoid adding the attribute, we need an option to avoid
64// checking these attributes.
66 "ignore-tti-inline-compatible", cl::Hidden, cl::init(false),
67 cl::desc("Ignore TTI attributes compatibility check between callee/caller "
68 "during inline cost calculation"));
69
71 "print-instruction-comments", cl::Hidden, cl::init(false),
72 cl::desc("Prints comments for instruction based on inline cost analysis"));
73
75 "inline-threshold", cl::Hidden, cl::init(225),
76 cl::desc("Control the amount of inlining to perform (default = 225)"));
77
79 "inlinehint-threshold", cl::Hidden, cl::init(325),
80 cl::desc("Threshold for inlining functions with inline hint"));
81
// Threshold applied to call sites classified as cold (see isColdCallSite
// below); much lower than the 225 default, so cold sites are rarely inlined.
static cl::opt<int>
    ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
                          cl::init(45),
                          cl::desc("Threshold for inlining cold callsites"));
86
88 "inline-enable-cost-benefit-analysis", cl::Hidden, cl::init(false),
89 cl::desc("Enable the cost-benefit analysis for the inliner"));
90
91// InlineSavingsMultiplier overrides per TTI multipliers iff it is
92// specified explicitly in command line options. This option is exposed
93// for tuning and testing.
95 "inline-savings-multiplier", cl::Hidden, cl::init(8),
96 cl::desc("Multiplier to multiply cycle savings by during inlining"));
97
98// InlineSavingsProfitableMultiplier overrides per TTI multipliers iff it is
99// specified explicitly in command line options. This option is exposed
100// for tuning and testing.
102 "inline-savings-profitable-multiplier", cl::Hidden, cl::init(4),
103 cl::desc("A multiplier on top of cycle savings to decide whether the "
104 "savings won't justify the cost"));
105
106static cl::opt<int>
107 InlineSizeAllowance("inline-size-allowance", cl::Hidden, cl::init(100),
108 cl::desc("The maximum size of a callee that get's "
109 "inlined without sufficient cycle savings"));
110
111// We introduce this threshold to help performance of instrumentation based
112// PGO before we actually hook up inliner with analysis passes such as BPI and
113// BFI.
115 "inlinecold-threshold", cl::Hidden, cl::init(45),
116 cl::desc("Threshold for inlining functions with cold attribute"));
117
// Threshold applied when profile data marks the call site as hot; the large
// value strongly favors inlining such sites.
static cl::opt<int>
    HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
                         cl::desc("Threshold for hot callsites "));
121
123 "locally-hot-callsite-threshold", cl::Hidden, cl::init(525),
124 cl::desc("Threshold for locally hot callsites "));
125
127 "cold-callsite-rel-freq", cl::Hidden, cl::init(2),
128 cl::desc("Maximum block frequency, expressed as a percentage of caller's "
129 "entry frequency, for a callsite to be cold in the absence of "
130 "profile information."));
131
133 "hot-callsite-rel-freq", cl::Hidden, cl::init(60),
134 cl::desc("Minimum block frequency, expressed as a multiple of caller's "
135 "entry frequency, for a callsite to be hot in the absence of "
136 "profile information."));
137
// The abstract cost unit charged per IR instruction. Most other costs in this
// file (call setup, switch lowering, etc.) are expressed as multiples of it.
static cl::opt<int>
    InstrCost("inline-instr-cost", cl::Hidden, cl::init(5),
              cl::desc("Cost of a single instruction when inlining"));
141
// Additional cost charged per load/store (see onMemAccess). Defaults to 0,
// i.e. memory accesses cost the same as any other instruction unless tuned.
static cl::opt<int>
    MemAccessCost("inline-memaccess-cost", cl::Hidden, cl::init(0),
                  cl::desc("Cost of load/store instruction when inlining"));
145
147 "inline-call-penalty", cl::Hidden, cl::init(25),
148 cl::desc("Call penalty that is applied per callsite when inlining"));
149
// Hard cap on the callee's statically allocated stack bytes; callees above it
// are rejected. Defaults to SIZE_MAX, i.e. no limit unless set explicitly.
static cl::opt<size_t>
    StackSizeThreshold("inline-max-stacksize", cl::Hidden,
                       cl::init(std::numeric_limits<size_t>::max()),
                       cl::desc("Do not inline functions with a stack size "
                                "that exceeds the specified limit"));
155
157 "recursive-inline-max-stacksize", cl::Hidden,
159 cl::desc("Do not inline recursive functions with a stack "
160 "size that exceeds the specified limit"));
161
163 "inline-cost-full", cl::Hidden,
164 cl::desc("Compute the full inline cost of a call site even when the cost "
165 "exceeds the threshold."));
166
168 "inline-caller-superset-nobuiltin", cl::Hidden, cl::init(true),
169 cl::desc("Allow inlining when caller has a superset of callee's nobuiltin "
170 "attributes."));
171
173 "disable-gep-const-evaluation", cl::Hidden, cl::init(false),
174 cl::desc("Disables evaluation of GetElementPtr with constant operands"));
175
176namespace llvm {
177std::optional<int> getStringFnAttrAsInt(const Attribute &Attr) {
178 if (Attr.isValid()) {
179 int AttrValue = 0;
180 if (!Attr.getValueAsString().getAsInteger(10, AttrValue))
181 return AttrValue;
182 }
183 return std::nullopt;
184}
185
186std::optional<int> getStringFnAttrAsInt(CallBase &CB, StringRef AttrKind) {
187 return getStringFnAttrAsInt(CB.getFnAttr(AttrKind));
188}
189
190std::optional<int> getStringFnAttrAsInt(Function *F, StringRef AttrKind) {
191 return getStringFnAttrAsInt(F->getFnAttribute(AttrKind));
192}
193
namespace InlineConstants {
// Expose the current -inline-instr-cost setting to code outside this file.
int getInstrCost() { return InstrCost; }

} // namespace InlineConstants
198
199} // namespace llvm
200
201namespace {
202class InlineCostCallAnalyzer;
203
// Snapshot of the analyzer's running cost and threshold taken immediately
// before and after a single instruction is assessed, so the per-instruction
// deltas can be reported (filled only when instruction comments are printed).
struct InstructionCostDetail {
  int CostBefore = 0;
  int CostAfter = 0;
  int ThresholdBefore = 0;
  int ThresholdAfter = 0;

  // How much the running cost grew while analyzing this instruction.
  int getCostDelta() const { return CostAfter - CostBefore; }

  // How much the threshold moved while analyzing this instruction.
  int getThresholdDelta() const { return ThresholdAfter - ThresholdBefore; }

  // Whether assessing this instruction adjusted the threshold at all.
  bool hasThresholdChanged() const { return ThresholdAfter != ThresholdBefore; }
};
218
// Prints the analyzer's per-instruction cost data as assembly annotations —
// presumably driven by the -print-instruction-comments flag above; confirm at
// the emitInstructionAnnot definition.
// NOTE(review): the first line of the emitInstructionAnnot declaration was
// dropped by extraction; only its trailing parameter remnant survives below.
// Restore from upstream before compiling.
class InlineCostAnnotationWriter : public AssemblyAnnotationWriter {
private:
  // The analyzer whose results are being annotated; not owned.
  InlineCostCallAnalyzer *const ICCA;

public:
  InlineCostAnnotationWriter(InlineCostCallAnalyzer *ICCA) : ICCA(ICCA) {}
    formatted_raw_ostream &OS) override;
};
228
229/// Carry out call site analysis, in order to evaluate inlinability.
230/// NOTE: the type is currently used as implementation detail of functions such
231/// as llvm::getInlineCost. Note the function_ref constructor parameters - the
232/// expectation is that they come from the outer scope, from the wrapper
233/// functions. If we want to support constructing CallAnalyzer objects where
234/// lambdas are provided inline at construction, or where the object needs to
235/// otherwise survive past the scope of the provided functions, we need to
236/// revisit the argument types.
class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  friend class InstVisitor<CallAnalyzer, bool>;

protected:
  virtual ~CallAnalyzer() = default;
  /// The TargetTransformInfo available for this compilation.
  // NOTE(review): member declaration dropped by extraction (the constructor
  // below initializes TTI); restore from upstream.

  /// Getter for the cache of @llvm.assume intrinsics.
  function_ref<AssumptionCache &(Function &)> GetAssumptionCache;

  /// Getter for BlockFrequencyInfo
  // NOTE(review): GetBFI member declaration dropped by extraction (the
  // constructor initializes GetBFI).

  /// Profile summary information.
  // NOTE(review): PSI member declaration dropped by extraction.

  /// The called function.
  Function &F;

  // Cache the DataLayout since we use it a lot.
  const DataLayout &DL;

  /// The OptimizationRemarkEmitter available for this compilation.
  // NOTE(review): ORE member declaration dropped by extraction.

  /// The candidate callsite being analyzed. Please do not use this to do
  /// analysis in the caller function; we want the inline cost query to be
  /// easily cacheable. Instead, use the cover function paramHasAttr.
  CallBase &CandidateCall;

  /// Extension points for handling callsite features.
  // Called before a basic block was analyzed.
  virtual void onBlockStart(const BasicBlock *BB) {}

  /// Called after a basic block was analyzed.
  virtual void onBlockAnalyzed(const BasicBlock *BB) {}

  /// Called before an instruction was analyzed
  virtual void onInstructionAnalysisStart(const Instruction *I) {}

  /// Called after an instruction was analyzed
  virtual void onInstructionAnalysisFinish(const Instruction *I) {}

  /// Called at the end of the analysis of the callsite. Return the outcome of
  /// the analysis, i.e. 'InlineResult(true)' if the inlining may happen, or
  /// the reason it can't.
  virtual InlineResult finalizeAnalysis() { return InlineResult::success(); }
  /// Called when we're about to start processing a basic block, and every time
  /// we are done processing an instruction. Return true if there is no point in
  /// continuing the analysis (e.g. we've determined already the call site is
  /// too expensive to inline)
  virtual bool shouldStop() { return false; }

  /// Called before the analysis of the callee body starts (with callsite
  /// contexts propagated). It checks callsite-specific information. Return a
  /// reason analysis can't continue if that's the case, or 'true' if it may
  /// continue.
  virtual InlineResult onAnalysisStart() { return InlineResult::success(); }
  /// Called if the analysis engine decides SROA cannot be done for the given
  /// alloca.
  virtual void onDisableSROA(AllocaInst *Arg) {}

  /// Called the analysis engine determines load elimination won't happen.
  virtual void onDisableLoadElimination() {}

  /// Called when we visit a CallBase, before the analysis starts. Return false
  /// to stop further processing of the instruction.
  virtual bool onCallBaseVisitStart(CallBase &Call) { return true; }

  /// Called to account for a call.
  virtual void onCallPenalty() {}

  /// Called to account for a load or store.
  virtual void onMemAccess(){};

  /// Called to account for the expectation the inlining would result in a load
  /// elimination.
  virtual void onLoadEliminationOpportunity() {}

  /// Called to account for the cost of argument setup for the Call in the
  /// callee's body (not the callsite currently under analysis).
  virtual void onCallArgumentSetup(const CallBase &Call) {}

  /// Called to account for a load relative intrinsic.
  virtual void onLoadRelativeIntrinsic() {}

  /// Called to account for a lowered call.
  virtual void onLoweredCall(Function *F, CallBase &Call, bool IsIndirectCall) {
  }

  /// Account for a jump table of given size. Return false to stop further
  /// processing the switch instruction
  virtual bool onJumpTable(unsigned JumpTableSize) { return true; }

  /// Account for a case cluster of given size. Return false to stop further
  /// processing of the instruction.
  virtual bool onCaseCluster(unsigned NumCaseCluster) { return true; }

  /// Called at the end of processing a switch instruction, with the given
  /// number of case clusters.
  virtual void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
                                bool DefaultDestUndefined) {}

  /// Called to account for any other instruction not specifically accounted
  /// for.
  virtual void onMissedSimplification() {}

  /// Start accounting potential benefits due to SROA for the given alloca.
  virtual void onInitializeSROAArg(AllocaInst *Arg) {}

  /// Account SROA savings for the AllocaInst value.
  virtual void onAggregateSROAUse(AllocaInst *V) {}

  // If \p V maps to a still-enabled SROA candidate alloca: either credit the
  // use (when \p DoNotDisable) and report true, or disable SROA for that
  // alloca. Returns false when no (enabled) candidate backs \p V.
  bool handleSROA(Value *V, bool DoNotDisable) {
    // Check for SROA candidates in comparisons.
    if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
      if (DoNotDisable) {
        onAggregateSROAUse(SROAArg);
        return true;
      }
      disableSROAForArg(SROAArg);
    }
    return false;
  }

  // Facts about the callee body discovered while walking it; several of these
  // make the callee outright uninlinable (checked in the analysis routines).
  bool IsCallerRecursive = false;
  bool IsRecursiveCall = false;
  bool ExposesReturnsTwice = false;
  bool HasDynamicAlloca = false;
  bool ContainsNoDuplicateCall = false;
  bool HasReturn = false;
  bool HasIndirectBr = false;
  bool HasUninlineableIntrinsic = false;
  bool InitsVargArgs = false;

  /// Number of bytes allocated statically by the callee.
  uint64_t AllocatedSize = 0;
  // Running instruction counts; the vector count presumably feeds the vector
  // bonus computed by the derived cost analyzer — confirm in updateThreshold.
  unsigned NumInstructions = 0;
  unsigned NumVectorInstructions = 0;

  /// While we walk the potentially-inlined instructions, we build up and
  /// maintain a mapping of simplified values specific to this callsite. The
  /// idea is to propagate any special information we have about arguments to
  /// this call through the inlinable section of the function, and account for
  /// likely simplifications post-inlining. The most important aspect we track
  /// is CFG altering simplifications -- when we prove a basic block dead, that
  /// can cause dramatic shifts in the cost of inlining a function.
  DenseMap<Value *, Constant *> SimplifiedValues;

  /// Keep track of the values which map back (through function arguments) to
  /// allocas on the caller stack which could be simplified through SROA.
  // NOTE(review): the SROAArgValues declaration was dropped by extraction
  // (it is used in getSROAArgForValueOrNull below); restore from upstream.

  /// Keep track of Allocas for which we believe we may get SROA optimization.
  DenseSet<AllocaInst *> EnabledSROAAllocas;

  /// Keep track of values which map to a pointer base and constant offset.
  // NOTE(review): declaration dropped by extraction.

  /// Keep track of dead blocks due to the constant arguments.
  // NOTE(review): declaration dropped by extraction.

  /// The mapping of the blocks to their known unique successors due to the
  /// constant arguments.
  // NOTE(review): declaration dropped by extraction.

  /// Model the elimination of repeated loads that is expected to happen
  /// whenever we simplify away the stores that would otherwise cause them to be
  /// loads.
  bool EnableLoadElimination = true;

  /// Whether we allow inlining for recursive call.
  bool AllowRecursiveCall = false;

  // Addresses of loads seen so far — presumably used to spot repeated loads
  // for the load-elimination modeling above; confirm in visitLoad.
  SmallPtrSet<Value *, 16> LoadAddrSet;

  // Return the SROA candidate alloca behind \p V, or null if there is none or
  // SROA has since been disabled for it.
  AllocaInst *getSROAArgForValueOrNull(Value *V) const {
    auto It = SROAArgValues.find(V);
    if (It == SROAArgValues.end() || EnabledSROAAllocas.count(It->second) == 0)
      return nullptr;
    return It->second;
  }

  // Custom simplification helper routines.
  bool isAllocaDerivedArg(Value *V);
  void disableSROAForArg(AllocaInst *SROAArg);
  void disableSROA(Value *V);
  void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
  void disableLoadElimination();
  bool isGEPFree(GetElementPtrInst &GEP);
  bool canFoldInboundsGEP(GetElementPtrInst &I);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  bool simplifyCallSite(Function *F, CallBase &Call);
  bool simplifyIntrinsicCallIsConstant(CallBase &CB);
  bool simplifyIntrinsicCallObjectSize(CallBase &CB);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);

  /// Return true if the given argument to the function being considered for
  /// inlining has the given attribute set either at the call site or the
  /// function declaration. Primarily used to inspect call site specific
  /// attributes since these can be more precise than the ones on the callee
  /// itself.
  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);

  /// Return true if the given value is known non null within the callee if
  /// inlined through this particular callsite.
  bool isKnownNonNullInCallee(Value *V);

  /// Return true if size growth is allowed when inlining the callee at \p Call.
  bool allowSizeGrowth(CallBase &Call);

  // Custom analysis routines.
  // NOTE(review): the second parameter line of analyzeBlock was dropped by
  // extraction — the declaration below is truncated; restore from upstream.
  InlineResult analyzeBlock(BasicBlock *BB,

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *);
  void visit(Module &);
  void visit(Function *);
  void visit(Function &);
  void visit(BasicBlock *);
  void visit(BasicBlock &);

  // Provide base case for our instruction visit.
  // NOTE(review): the base-case visitInstruction declaration was dropped by
  // extraction here.

  // Our visit overrides.
  bool visitAlloca(AllocaInst &I);
  bool visitPHI(PHINode &I);
  bool visitGetElementPtr(GetElementPtrInst &I);
  bool visitBitCast(BitCastInst &I);
  bool visitPtrToInt(PtrToIntInst &I);
  bool visitIntToPtr(IntToPtrInst &I);
  bool visitCastInst(CastInst &I);
  bool visitCmpInst(CmpInst &I);
  bool visitSub(BinaryOperator &I);
  // NOTE(review): one visit-override declaration was dropped by extraction
  // here (between visitSub and visitFNeg).
  bool visitFNeg(UnaryOperator &I);
  bool visitLoad(LoadInst &I);
  bool visitStore(StoreInst &I);
  bool visitExtractValue(ExtractValueInst &I);
  bool visitInsertValue(InsertValueInst &I);
  bool visitCallBase(CallBase &Call);
  bool visitReturnInst(ReturnInst &RI);
  bool visitBranchInst(BranchInst &BI);
  bool visitSelectInst(SelectInst &SI);
  bool visitSwitchInst(SwitchInst &SI);
  // NOTE(review): a declaration was dropped by extraction here.
  bool visitResumeInst(ResumeInst &RI);
  // NOTE(review): the remaining visit-override declarations (terminator
  // instructions) were dropped by extraction here.

public:
  // Construct an analyzer for inlining \p Call into its caller; the
  // function_ref getters must outlive this object (see class comment above
  // the declaration).
  CallAnalyzer(Function &Callee, CallBase &Call, const TargetTransformInfo &TTI,
               function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
               function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
               ProfileSummaryInfo *PSI = nullptr,
               OptimizationRemarkEmitter *ORE = nullptr)
      : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
        PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
        CandidateCall(Call) {}

  InlineResult analyze();

  // Return the constant \p I was simplified to during analysis, if any.
  std::optional<Constant *> getSimplifiedValue(Instruction *I) {
    if (SimplifiedValues.contains(I))
      return SimplifiedValues[I];
    return std::nullopt;
  }

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs = 0;
  unsigned NumConstantOffsetPtrArgs = 0;
  unsigned NumAllocaArgs = 0;
  unsigned NumConstantPtrCmps = 0;
  unsigned NumConstantPtrDiffs = 0;
  unsigned NumInstructionsSimplified = 0;

  // Debug dump of the statistics above.
  void dump();
};
523
524// Considering forming a binary search, we should find the number of nodes
525// which is same as the number of comparisons when lowered. For a given
526// number of clusters, n, we can define a recursive function, f(n), to find
527// the number of nodes in the tree. The recursion is :
528// f(n) = 1 + f(n/2) + f (n - n/2), when n > 3,
529// and f(n) = n, when n <= 3.
530// This will lead a binary tree where the leaf should be either f(2) or f(3)
531// when n > 3. So, the number of comparisons from leaves should be n, while
532// the number of non-leaf should be :
533// 2^(log2(n) - 1) - 1
534// = 2^log2(n) * 2^-1 - 1
535// = n / 2 - 1.
536// Considering comparisons from leaf and non-leaf nodes, we can estimate the
537// number of comparisons in a simple closed form :
538// n + n / 2 - 1 = n * 3 / 2 - 1
/// Estimate how many comparisons a lowered binary-search over
/// \p NumCaseCluster switch clusters performs: n leaf comparisons plus
/// n/2 - 1 interior-node comparisons, i.e. 3n/2 - 1 (see derivation above).
int64_t getExpectedNumberOfCompare(int NumCaseCluster) {
  // Widen first so the multiply cannot overflow for large cluster counts.
  const int64_t Clusters = static_cast<int64_t>(NumCaseCluster);
  return Clusters * 3 / 2 - 1;
}
542
543/// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note
544/// the FIXME in onLoweredCall, when instantiating an InlineCostCallAnalyzer
545class InlineCostCallAnalyzer final : public CallAnalyzer {
546 const bool ComputeFullInlineCost;
547 int LoadEliminationCost = 0;
548 /// Bonus to be applied when percentage of vector instructions in callee is
549 /// high (see more details in updateThreshold).
550 int VectorBonus = 0;
551 /// Bonus to be applied when the callee has only one reachable basic block.
552 int SingleBBBonus = 0;
553
554 /// Tunable parameters that control the analysis.
555 const InlineParams &Params;
556
557 // This DenseMap stores the delta change in cost and threshold after
558 // accounting for the given instruction. The map is filled only with the
559 // flag PrintInstructionComments on.
561
562 /// Upper bound for the inlining cost. Bonuses are being applied to account
563 /// for speculative "expected profit" of the inlining decision.
564 int Threshold = 0;
565
566 /// The amount of StaticBonus applied.
567 int StaticBonusApplied = 0;
568
569 /// Attempt to evaluate indirect calls to boost its inline cost.
570 const bool BoostIndirectCalls;
571
572 /// Ignore the threshold when finalizing analysis.
573 const bool IgnoreThreshold;
574
575 // True if the cost-benefit-analysis-based inliner is enabled.
576 const bool CostBenefitAnalysisEnabled;
577
578 /// Inlining cost measured in abstract units, accounts for all the
579 /// instructions expected to be executed for a given function invocation.
580 /// Instructions that are statically proven to be dead based on call-site
581 /// arguments are not counted here.
582 int Cost = 0;
583
584 // The cumulative cost at the beginning of the basic block being analyzed. At
585 // the end of analyzing each basic block, "Cost - CostAtBBStart" represents
586 // the size of that basic block.
587 int CostAtBBStart = 0;
588
589 // The static size of live but cold basic blocks. This is "static" in the
590 // sense that it's not weighted by profile counts at all.
591 int ColdSize = 0;
592
593 // Whether inlining is decided by cost-threshold analysis.
594 bool DecidedByCostThreshold = false;
595
596 // Whether inlining is decided by cost-benefit analysis.
597 bool DecidedByCostBenefit = false;
598
599 // The cost-benefit pair computed by cost-benefit analysis.
600 std::optional<CostBenefitPair> CostBenefit;
601
602 bool SingleBB = true;
603
604 unsigned SROACostSavings = 0;
605 unsigned SROACostSavingsLost = 0;
606
607 /// The mapping of caller Alloca values to their accumulated cost savings. If
608 /// we have to disable SROA for one of the allocas, this tells us how much
609 /// cost must be added.
610 DenseMap<AllocaInst *, int> SROAArgCosts;
611
612 /// Return true if \p Call is a cold callsite.
613 bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);
614
615 /// Update Threshold based on callsite properties such as callee
616 /// attributes and callee hotness for PGO builds. The Callee is explicitly
617 /// passed to support analyzing indirect calls whose target is inferred by
618 /// analysis.
619 void updateThreshold(CallBase &Call, Function &Callee);
620 /// Return a higher threshold if \p Call is a hot callsite.
621 std::optional<int> getHotCallSiteThreshold(CallBase &Call,
622 BlockFrequencyInfo *CallerBFI);
623
624 /// Handle a capped 'int' increment for Cost.
625 void addCost(int64_t Inc) {
626 Inc = std::clamp<int64_t>(Inc, INT_MIN, INT_MAX);
627 Cost = std::clamp<int64_t>(Inc + Cost, INT_MIN, INT_MAX);
628 }
629
630 void onDisableSROA(AllocaInst *Arg) override {
631 auto CostIt = SROAArgCosts.find(Arg);
632 if (CostIt == SROAArgCosts.end())
633 return;
634 addCost(CostIt->second);
635 SROACostSavings -= CostIt->second;
636 SROACostSavingsLost += CostIt->second;
637 SROAArgCosts.erase(CostIt);
638 }
639
640 void onDisableLoadElimination() override {
641 addCost(LoadEliminationCost);
642 LoadEliminationCost = 0;
643 }
644
645 bool onCallBaseVisitStart(CallBase &Call) override {
646 if (std::optional<int> AttrCallThresholdBonus =
647 getStringFnAttrAsInt(Call, "call-threshold-bonus"))
648 Threshold += *AttrCallThresholdBonus;
649
650 if (std::optional<int> AttrCallCost =
651 getStringFnAttrAsInt(Call, "call-inline-cost")) {
652 addCost(*AttrCallCost);
653 // Prevent further processing of the call since we want to override its
654 // inline cost, not just add to it.
655 return false;
656 }
657 return true;
658 }
659
660 void onCallPenalty() override { addCost(CallPenalty); }
661
662 void onMemAccess() override { addCost(MemAccessCost); }
663
664 void onCallArgumentSetup(const CallBase &Call) override {
665 // Pay the price of the argument setup. We account for the average 1
666 // instruction per call argument setup here.
667 addCost(Call.arg_size() * InstrCost);
668 }
669 void onLoadRelativeIntrinsic() override {
670 // This is normally lowered to 4 LLVM instructions.
671 addCost(3 * InstrCost);
672 }
673 void onLoweredCall(Function *F, CallBase &Call,
674 bool IsIndirectCall) override {
675 // We account for the average 1 instruction per call argument setup here.
676 addCost(Call.arg_size() * InstrCost);
677
678 // If we have a constant that we are calling as a function, we can peer
679 // through it and see the function target. This happens not infrequently
680 // during devirtualization and so we want to give it a hefty bonus for
681 // inlining, but cap that bonus in the event that inlining wouldn't pan out.
682 // Pretend to inline the function, with a custom threshold.
683 if (IsIndirectCall && BoostIndirectCalls) {
684 auto IndirectCallParams = Params;
685 IndirectCallParams.DefaultThreshold =
687 /// FIXME: if InlineCostCallAnalyzer is derived from, this may need
688 /// to instantiate the derived class.
689 InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
690 GetAssumptionCache, GetBFI, PSI, ORE, false);
691 if (CA.analyze().isSuccess()) {
692 // We were able to inline the indirect call! Subtract the cost from the
693 // threshold to get the bonus we want to apply, but don't go below zero.
694 Cost -= std::max(0, CA.getThreshold() - CA.getCost());
695 }
696 } else
697 // Otherwise simply add the cost for merely making the call.
698 addCost(TTI.getInlineCallPenalty(CandidateCall.getCaller(), Call,
699 CallPenalty));
700 }
701
702 void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
703 bool DefaultDestUndefined) override {
704 if (!DefaultDestUndefined)
705 addCost(2 * InstrCost);
706 // If suitable for a jump table, consider the cost for the table size and
707 // branch to destination.
708 // Maximum valid cost increased in this function.
709 if (JumpTableSize) {
710 int64_t JTCost =
711 static_cast<int64_t>(JumpTableSize) * InstrCost + 4 * InstrCost;
712 addCost(JTCost);
713 return;
714 }
715
716 if (NumCaseCluster <= 3) {
717 // Suppose a comparison includes one compare and one conditional branch.
718 addCost(NumCaseCluster * 2 * InstrCost);
719 return;
720 }
721
722 int64_t ExpectedNumberOfCompare =
723 getExpectedNumberOfCompare(NumCaseCluster);
724 int64_t SwitchCost = ExpectedNumberOfCompare * 2 * InstrCost;
725
726 addCost(SwitchCost);
727 }
728 void onMissedSimplification() override { addCost(InstrCost); }
729
730 void onInitializeSROAArg(AllocaInst *Arg) override {
731 assert(Arg != nullptr &&
732 "Should not initialize SROA costs for null value.");
733 auto SROAArgCost = TTI.getCallerAllocaCost(&CandidateCall, Arg);
734 SROACostSavings += SROAArgCost;
735 SROAArgCosts[Arg] = SROAArgCost;
736 }
737
738 void onAggregateSROAUse(AllocaInst *SROAArg) override {
739 auto CostIt = SROAArgCosts.find(SROAArg);
740 assert(CostIt != SROAArgCosts.end() &&
741 "expected this argument to have a cost");
742 CostIt->second += InstrCost;
743 SROACostSavings += InstrCost;
744 }
745
746 void onBlockStart(const BasicBlock *BB) override { CostAtBBStart = Cost; }
747
748 void onBlockAnalyzed(const BasicBlock *BB) override {
749 if (CostBenefitAnalysisEnabled) {
750 // Keep track of the static size of live but cold basic blocks. For now,
751 // we define a cold basic block to be one that's never executed.
752 assert(GetBFI && "GetBFI must be available");
753 BlockFrequencyInfo *BFI = &(GetBFI(F));
754 assert(BFI && "BFI must be available");
755 auto ProfileCount = BFI->getBlockProfileCount(BB);
756 if (*ProfileCount == 0)
757 ColdSize += Cost - CostAtBBStart;
758 }
759
760 auto *TI = BB->getTerminator();
761 // If we had any successors at this point, than post-inlining is likely to
762 // have them as well. Note that we assume any basic blocks which existed
763 // due to branches or switches which folded above will also fold after
764 // inlining.
765 if (SingleBB && TI->getNumSuccessors() > 1) {
766 // Take off the bonus we applied to the threshold.
767 Threshold -= SingleBBBonus;
768 SingleBB = false;
769 }
770 }
771
772 void onInstructionAnalysisStart(const Instruction *I) override {
773 // This function is called to store the initial cost of inlining before
774 // the given instruction was assessed.
776 return;
777 InstructionCostDetailMap[I].CostBefore = Cost;
778 InstructionCostDetailMap[I].ThresholdBefore = Threshold;
779 }
780
781 void onInstructionAnalysisFinish(const Instruction *I) override {
782 // This function is called to find new values of cost and threshold after
783 // the instruction has been assessed.
785 return;
786 InstructionCostDetailMap[I].CostAfter = Cost;
787 InstructionCostDetailMap[I].ThresholdAfter = Threshold;
788 }
789
790 bool isCostBenefitAnalysisEnabled() {
791 if (!PSI || !PSI->hasProfileSummary())
792 return false;
793
794 if (!GetBFI)
795 return false;
796
797 if (InlineEnableCostBenefitAnalysis.getNumOccurrences()) {
798 // Honor the explicit request from the user.
800 return false;
801 } else {
802 // Otherwise, require instrumentation profile.
803 if (!(PSI->hasInstrumentationProfile() || PSI->hasSampleProfile()))
804 return false;
805 }
806
807 auto *Caller = CandidateCall.getParent()->getParent();
808 if (!Caller->getEntryCount())
809 return false;
810
811 BlockFrequencyInfo *CallerBFI = &(GetBFI(*Caller));
812 if (!CallerBFI)
813 return false;
814
815 // For now, limit to hot call site.
816 if (!PSI->isHotCallSite(CandidateCall, CallerBFI))
817 return false;
818
819 // Make sure we have a nonzero entry count.
820 auto EntryCount = F.getEntryCount();
821 if (!EntryCount || !EntryCount->getCount())
822 return false;
823
824 BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
825 if (!CalleeBFI)
826 return false;
827
828 return true;
829 }
830
831 // A helper function to choose between command line override and default.
832 unsigned getInliningCostBenefitAnalysisSavingsMultiplier() const {
833 if (InlineSavingsMultiplier.getNumOccurrences())
836 }
837
838 // A helper function to choose between command line override and default.
839 unsigned getInliningCostBenefitAnalysisProfitableMultiplier() const {
840 if (InlineSavingsProfitableMultiplier.getNumOccurrences())
843 }
844
845 void OverrideCycleSavingsAndSizeForTesting(APInt &CycleSavings, int &Size) {
846 if (std::optional<int> AttrCycleSavings = getStringFnAttrAsInt(
847 CandidateCall, "inline-cycle-savings-for-test")) {
848 CycleSavings = *AttrCycleSavings;
849 }
850
851 if (std::optional<int> AttrRuntimeCost = getStringFnAttrAsInt(
852 CandidateCall, "inline-runtime-cost-for-test")) {
853 Size = *AttrRuntimeCost;
854 }
855 }
856
857 // Determine whether we should inline the given call site, taking into account
858 // both the size cost and the cycle savings. Return std::nullopt if we don't
859 // have sufficient profiling information to determine.
860 std::optional<bool> costBenefitAnalysis() {
861 if (!CostBenefitAnalysisEnabled)
862 return std::nullopt;
863
864 // buildInlinerPipeline in the pass builder sets HotCallSiteThreshold to 0
865 // for the prelink phase of the AutoFDO + ThinLTO build. Honor the logic by
866 // falling back to the cost-based metric.
867 // TODO: Improve this hacky condition.
868 if (Threshold == 0)
869 return std::nullopt;
870
871 assert(GetBFI);
872 BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
873 assert(CalleeBFI);
874
875 // The cycle savings expressed as the sum of InstrCost
876 // multiplied by the estimated dynamic count of each instruction we can
877 // avoid. Savings come from the call site cost, such as argument setup and
878 // the call instruction, as well as the instructions that are folded.
879 //
880 // We use 128-bit APInt here to avoid potential overflow. This variable
881 // should stay well below 10^^24 (or 2^^80) in practice. This "worst" case
882 // assumes that we can avoid or fold a billion instructions, each with a
883 // profile count of 10^^15 -- roughly the number of cycles for a 24-hour
884 // period on a 4GHz machine.
885 APInt CycleSavings(128, 0);
886
887 for (auto &BB : F) {
888 APInt CurrentSavings(128, 0);
889 for (auto &I : BB) {
890 if (BranchInst *BI = dyn_cast<BranchInst>(&I)) {
891 // Count a conditional branch as savings if it becomes unconditional.
892 if (BI->isConditional() &&
893 isa_and_nonnull<ConstantInt>(
894 SimplifiedValues.lookup(BI->getCondition()))) {
895 CurrentSavings += InstrCost;
896 }
897 } else if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
898 if (isa_and_present<ConstantInt>(SimplifiedValues.lookup(SI->getCondition())))
899 CurrentSavings += InstrCost;
900 } else if (Value *V = dyn_cast<Value>(&I)) {
901 // Count an instruction as savings if we can fold it.
902 if (SimplifiedValues.count(V)) {
903 CurrentSavings += InstrCost;
904 }
905 }
906 }
907
// NOTE(review): getBlockProfileCount returns an optional that is dereferenced
// unchecked below -- presumably guaranteed present once the entry-count check
// enabling cost-benefit analysis passed; TODO confirm.
908 auto ProfileCount = CalleeBFI->getBlockProfileCount(&BB);
909 CurrentSavings *= *ProfileCount;
910 CycleSavings += CurrentSavings;
911 }
912
913 // Compute the cycle savings per call.
914 auto EntryProfileCount = F.getEntryCount();
915 assert(EntryProfileCount && EntryProfileCount->getCount());
916 auto EntryCount = EntryProfileCount->getCount();
// Round-to-nearest division: add half the divisor before udiv.
917 CycleSavings += EntryCount / 2;
918 CycleSavings = CycleSavings.udiv(EntryCount);
919
920 // Compute the total savings for the call site.
921 auto *CallerBB = CandidateCall.getParent();
922 BlockFrequencyInfo *CallerBFI = &(GetBFI(*(CallerBB->getParent())));
923 CycleSavings += getCallsiteCost(TTI, this->CandidateCall, DL);
924 CycleSavings *= *CallerBFI->getBlockProfileCount(CallerBB);
925
926 // Remove the cost of the cold basic blocks to model the runtime cost more
927 // accurately. Both machine block placement and function splitting could
928 // place cold blocks further from hot blocks.
929 int Size = Cost - ColdSize;
930
931 // Allow tiny callees to be inlined regardless of whether they meet the
932 // savings threshold.
// NOTE(review): original line 933 (the statement applying the tiny-callee
// size allowance to Size) is missing from this extraction.
934
935 OverrideCycleSavingsAndSizeForTesting(CycleSavings, Size);
936 CostBenefit.emplace(APInt(128, Size), CycleSavings);
937
938 // Let R be the ratio of CycleSavings to Size. We accept the inlining
939 // opportunity if R is really high and reject if R is really low. If R is
940 // somewhere in the middle, we fall back to the cost-based analysis.
941 //
942 // Specifically, let R = CycleSavings / Size, we accept the inlining
943 // opportunity if:
944 //
945 // PSI->getOrCompHotCountThreshold()
946 // R > -------------------------------------------------
947 // getInliningCostBenefitAnalysisSavingsMultiplier()
948 //
949 // and reject the inlining opportunity if:
950 //
951 // PSI->getOrCompHotCountThreshold()
952 // R <= ----------------------------------------------------
953 // getInliningCostBenefitAnalysisProfitableMultiplier()
954 //
955 // Otherwise, we fall back to the cost-based analysis.
956 //
957 // Implementation-wise, use multiplication (CycleSavings * Multiplier,
958 // HotCountThreshold * Size) rather than division to avoid precision loss.
959 APInt Threshold(128, PSI->getOrCompHotCountThreshold());
960 Threshold *= Size;
961
962 APInt UpperBoundCycleSavings = CycleSavings;
963 UpperBoundCycleSavings *= getInliningCostBenefitAnalysisSavingsMultiplier();
964 if (UpperBoundCycleSavings.uge(Threshold))
965 return true;
966
967 APInt LowerBoundCycleSavings = CycleSavings;
968 LowerBoundCycleSavings *=
969 getInliningCostBenefitAnalysisProfitableMultiplier();
970 if (LowerBoundCycleSavings.ult(Threshold))
971 return false;
972
973 // Otherwise, fall back to the cost-based analysis.
974 return std::nullopt;
975 }
976
977 InlineResult finalizeAnalysis() override {
978 // Loops generally act a lot like calls in that they act like barriers to
979 // movement, require a certain amount of setup, etc. So when optimising for
980 // size, we penalise any call sites that perform loops. We do this after all
981 // other costs here, so will likely only be dealing with relatively small
982 // functions (and hence DT and LI will hopefully be cheap).
983 auto *Caller = CandidateCall.getFunction();
984 if (Caller->hasMinSize()) {
985 DominatorTree DT(F);
986 LoopInfo LI(DT);
987 int NumLoops = 0;
988 for (Loop *L : LI) {
989 // Ignore loops that will not be executed
990 if (DeadBlocks.count(L->getHeader()))
991 continue;
992 NumLoops++;
993 }
994 addCost(NumLoops * InlineConstants::LoopPenalty);
995 }
996
997 // We applied the maximum possible vector bonus at the beginning. Now,
998 // subtract the excess bonus, if any, from the Threshold before
999 // comparing against Cost.
1000 if (NumVectorInstructions <= NumInstructions / 10)
1001 Threshold -= VectorBonus;
1002 else if (NumVectorInstructions <= NumInstructions / 2)
1003 Threshold -= VectorBonus / 2;
1004
// Testing hooks: string attributes on the call may force cost/threshold.
1005 if (std::optional<int> AttrCost =
1006 getStringFnAttrAsInt(CandidateCall, "function-inline-cost"))
1007 Cost = *AttrCost;
1008
1009 if (std::optional<int> AttrCostMult = getStringFnAttrAsInt(
1010 CandidateCall,
// NOTE(review): original line 1011 (the attribute-name argument for the
// cost-multiplier lookup) is missing from this extraction.
1012 Cost *= *AttrCostMult;
1013
1014 if (std::optional<int> AttrThreshold =
1015 getStringFnAttrAsInt(CandidateCall, "function-inline-threshold"))
1016 Threshold = *AttrThreshold;
1017
// Cost-benefit analysis, when it can decide, takes precedence over the
// plain cost-vs-threshold comparison below.
1018 if (auto Result = costBenefitAnalysis()) {
1019 DecidedByCostBenefit = true;
1020 if (*Result)
1021 return InlineResult::success();
1022 else
1023 return InlineResult::failure("Cost over threshold.");
1024 }
1025
1026 if (IgnoreThreshold)
1027 return InlineResult::success();
1028
1029 DecidedByCostThreshold = true;
1030 return Cost < std::max(1, Threshold)
// NOTE(review): original line 1031 (the success arm of this conditional
// expression) is missing from this extraction.
1032 : InlineResult::failure("Cost over threshold.");
1033 }
1034
1035 bool shouldStop() override {
1036 if (IgnoreThreshold || ComputeFullInlineCost)
1037 return false;
1038 // Bail out the moment we cross the threshold. This means we'll under-count
1039 // the cost, but only when undercounting doesn't matter.
1040 if (Cost < Threshold)
1041 return false;
1042 DecidedByCostThreshold = true;
1043 return true;
1044 }
1045
1046 void onLoadEliminationOpportunity() override {
1047 LoadEliminationCost += InstrCost;
1048 }
1049
1050 InlineResult onAnalysisStart() override {
1051 // Perform some tweaks to the cost and threshold based on the direct
1052 // callsite information.
1053
1054 // We want to more aggressively inline vector-dense kernels, so up the
1055 // threshold, and we'll lower it if the % of vector instructions gets too
1056 // low. Note that these bonuses are some what arbitrary and evolved over
1057 // time by accident as much as because they are principled bonuses.
1058 //
1059 // FIXME: It would be nice to remove all such bonuses. At least it would be
1060 // nice to base the bonus values on something more scientific.
1061 assert(NumInstructions == 0);
1062 assert(NumVectorInstructions == 0);
1063
1064 // Update the threshold based on callsite properties
1065 updateThreshold(CandidateCall, F);
1066
1067 // While Threshold depends on commandline options that can take negative
1068 // values, we want to enforce the invariant that the computed threshold and
1069 // bonuses are non-negative.
1070 assert(Threshold >= 0);
1071 assert(SingleBBBonus >= 0);
1072 assert(VectorBonus >= 0);
1073
1074 // Speculatively apply all possible bonuses to Threshold. If cost exceeds
1075 // this Threshold any time, and cost cannot decrease, we can stop processing
1076 // the rest of the function body.
1077 Threshold += (SingleBBBonus + VectorBonus);
1078
1079 // Give out bonuses for the callsite, as the instructions setting them up
1080 // will be gone after inlining.
1081 addCost(-getCallsiteCost(TTI, this->CandidateCall, DL));
1082
1083 // If this function uses the coldcc calling convention, prefer not to inline
1084 // it.
1085 if (F.getCallingConv() == CallingConv::Cold)
// NOTE(review): original line 1086 (the statement guarded by the coldcc
// check, presumably a cost penalty) is missing from this extraction.
1087
1088 LLVM_DEBUG(dbgs() << " Initial cost: " << Cost << "\n");
1089
1090 // Check if we're done. This can happen due to bonuses and penalties.
1091 if (Cost >= Threshold && !ComputeFullInlineCost)
1092 return InlineResult::failure("high cost");
1093
1094 return InlineResult::success();
1095 }
1096
1097public:
// Construct an analyzer for Call to Callee. BoostIndirect is stored in
// BoostIndirectCalls; IgnoreThreshold short-circuits the threshold
// comparison in finalizeAnalysis() and shouldStop(). ComputeFullInlineCost
// is forced on whenever a remark emitter is present or cost-benefit
// analysis is enabled, so the full cost is available for reporting.
1098 InlineCostCallAnalyzer(
1099 Function &Callee, CallBase &Call, const InlineParams &Params,
1100 const TargetTransformInfo &TTI,
1101 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
1102 function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
1103 ProfileSummaryInfo *PSI = nullptr,
1104 OptimizationRemarkEmitter *ORE = nullptr, bool BoostIndirect = true,
1105 bool IgnoreThreshold = false)
1106 : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, PSI, ORE),
1107 ComputeFullInlineCost(OptComputeFullInlineCost ||
1108 Params.ComputeFullInlineCost || ORE ||
1109 isCostBenefitAnalysisEnabled()),
1110 Params(Params), Threshold(Params.DefaultThreshold),
1111 BoostIndirectCalls(BoostIndirect), IgnoreThreshold(IgnoreThreshold),
1112 CostBenefitAnalysisEnabled(isCostBenefitAnalysisEnabled()),
1113 Writer(this) {
1114 AllowRecursiveCall = *Params.AllowRecursiveCall;
1115 }
1116
1117 /// Annotation Writer for instruction details
1118 InlineCostAnnotationWriter Writer;
1119
// Debug dump of the analysis (see print() below for the build-independent
// variant).
1120 void dump();
1121
1122 // Prints the same analysis as dump(), but its definition is not dependent
1123 // on the build.
1124 void print(raw_ostream &OS);
1125
1126 std::optional<InstructionCostDetail> getCostDetails(const Instruction *I) {
1127 if (InstructionCostDetailMap.contains(I))
1128 return InstructionCostDetailMap[I];
1129 return std::nullopt;
1130 }
1131
// Virtual dtor: this analyzer is used polymorphically via CallAnalyzer.
1132 virtual ~InlineCostCallAnalyzer() = default;
// Accessors for the results computed by the analysis.
1133 int getThreshold() const { return Threshold; }
1134 int getCost() const { return Cost; }
1135 int getStaticBonusApplied() const { return StaticBonusApplied; }
1136 std::optional<CostBenefitPair> getCostBenefitPair() { return CostBenefit; }
1137 bool wasDecidedByCostBenefit() const { return DecidedByCostBenefit; }
1138 bool wasDecidedByCostThreshold() const { return DecidedByCostThreshold; }
1139};
1140
1141// Return true if CB is the sole call to local function Callee.
1142static bool isSoleCallToLocalFunction(const CallBase &CB,
1143 const Function &Callee) {
1144 return Callee.hasLocalLinkage() && Callee.hasOneLiveUse() &&
1145 &Callee == CB.getCalledFunction();
1146}
1147
// A CallAnalyzer that, instead of a single scalar cost, accumulates a vector
// of named feature counters (indexed by InlineCostFeatureIndex) for the ML
// inliner (see the FIXMEs below).
1148 class InlineCostFeaturesAnalyzer final : public CallAnalyzer {
1149 private:
// NOTE(review): original line 1150 (a private member declaration, presumably
// the feature array `Cost` written by increment()/set() below) is missing
// from this extraction.
1151
1152 // FIXME: These constants are taken from the heuristic-based cost visitor.
1153 // These should be removed entirely in a later revision to avoid reliance on
1154 // heuristics in the ML inliner.
1155 static constexpr int JTCostMultiplier = 4;
1156 static constexpr int CaseClusterCostMultiplier = 2;
1157 static constexpr int SwitchDefaultDestCostMultiplier = 2;
1158 static constexpr int SwitchCostMultiplier = 2;
1159
1160 // FIXME: These are taken from the heuristic-based cost visitor: we should
1161 // eventually abstract these to the CallAnalyzer to avoid duplication.
1162 unsigned SROACostSavingOpportunities = 0;
1163 int VectorBonus = 0;
1164 int SingleBBBonus = 0;
1165 int Threshold = 5;
1166
// NOTE(review): original line 1167 (a member declaration) is missing from
// this extraction.
1168
// Add Delta to the accumulator for the given feature.
1169 void increment(InlineCostFeatureIndex Feature, int64_t Delta = 1) {
1170 Cost[static_cast<size_t>(Feature)] += Delta;
1171 }
1172
// Overwrite the accumulator for the given feature.
1173 void set(InlineCostFeatureIndex Feature, int64_t Value) {
1174 Cost[static_cast<size_t>(Feature)] = Value;
1175 }
1176
1177 void onDisableSROA(AllocaInst *Arg) override {
1178 auto CostIt = SROACosts.find(Arg);
1179 if (CostIt == SROACosts.end())
1180 return;
1181
// Move the previously recorded SROA savings into the sroa_losses feature.
1182 increment(InlineCostFeatureIndex::sroa_losses, CostIt->second);
1183 SROACostSavingOpportunities -= CostIt->second;
1184 SROACosts.erase(CostIt);
1185 }
1186
1187 void onDisableLoadElimination() override {
1188 set(InlineCostFeatureIndex::load_elimination, 1);
1189 }
1190
1191 void onCallPenalty() override {
1192 increment(InlineCostFeatureIndex::call_penalty, CallPenalty);
1193 }
1194
1195 void onCallArgumentSetup(const CallBase &Call) override {
1196 increment(InlineCostFeatureIndex::call_argument_setup,
1197 Call.arg_size() * InstrCost);
1198 }
1199
1200 void onLoadRelativeIntrinsic() override {
1201 increment(InlineCostFeatureIndex::load_relative_intrinsic, 3 * InstrCost);
1202 }
1203
1204 void onLoweredCall(Function *F, CallBase &Call,
1205 bool IsIndirectCall) override {
1206 increment(InlineCostFeatureIndex::lowered_call_arg_setup,
1207 Call.arg_size() * InstrCost);
1208
1209 if (IsIndirectCall) {
// Run a nested cost analysis on the resolved callee to estimate what
// inlining through this indirect call would cost.
1210 InlineParams IndirectCallParams = {/* DefaultThreshold*/ 0,
1211 /*HintThreshold*/ {},
1212 /*ColdThreshold*/ {},
1213 /*OptSizeThreshold*/ {},
1214 /*OptMinSizeThreshold*/ {},
1215 /*HotCallSiteThreshold*/ {},
1216 /*LocallyHotCallSiteThreshold*/ {},
1217 /*ColdCallSiteThreshold*/ {},
1218 /*ComputeFullInlineCost*/ true,
1219 /*EnableDeferral*/ true};
1220 IndirectCallParams.DefaultThreshold =
// NOTE(review): original line 1221 (the threshold value assigned here) is
// missing from this extraction.
1222
1223 InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
1224 GetAssumptionCache, GetBFI, PSI, ORE, false,
1225 true);
1226 if (CA.analyze().isSuccess()) {
1227 increment(InlineCostFeatureIndex::nested_inline_cost_estimate,
1228 CA.getCost());
1229 increment(InlineCostFeatureIndex::nested_inlines, 1);
1230 }
1231 } else {
1232 onCallPenalty();
1233 }
1234 }
1235
1236 void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
1237 bool DefaultDestUndefined) override {
1238 if (!DefaultDestUndefined)
1239 increment(InlineCostFeatureIndex::switch_default_dest_penalty,
1240 SwitchDefaultDestCostMultiplier * InstrCost);
1241
1242 if (JumpTableSize) {
1243 int64_t JTCost = static_cast<int64_t>(JumpTableSize) * InstrCost +
1244 JTCostMultiplier * InstrCost;
1245 increment(InlineCostFeatureIndex::jump_table_penalty, JTCost);
1246 return;
1247 }
1248
1249 if (NumCaseCluster <= 3) {
1250 increment(InlineCostFeatureIndex::case_cluster_penalty,
1251 NumCaseCluster * CaseClusterCostMultiplier * InstrCost);
1252 return;
1253 }
1254
1255 int64_t ExpectedNumberOfCompare =
1256 getExpectedNumberOfCompare(NumCaseCluster);
1257
1258 int64_t SwitchCost =
1259 ExpectedNumberOfCompare * SwitchCostMultiplier * InstrCost;
1260 increment(InlineCostFeatureIndex::switch_penalty, SwitchCost);
1261 }
1262
1263 void onMissedSimplification() override {
1264 increment(InlineCostFeatureIndex::unsimplified_common_instructions,
1265 InstrCost);
1266 }
1267
1268 void onInitializeSROAArg(AllocaInst *Arg) override {
1269 auto SROAArgCost = TTI.getCallerAllocaCost(&CandidateCall, Arg);
1270 SROACosts[Arg] = SROAArgCost;
1271 SROACostSavingOpportunities += SROAArgCost;
1272 }
1273
// NOTE(review): assumes Arg was registered via onInitializeSROAArg; the
// find() result is dereferenced unchecked.
1274 void onAggregateSROAUse(AllocaInst *Arg) override {
1275 SROACosts.find(Arg)->second += InstrCost;
1276 SROACostSavingOpportunities += InstrCost;
1277 }
1278
1279 void onBlockAnalyzed(const BasicBlock *BB) override {
1280 if (BB->getTerminator()->getNumSuccessors() > 1)
1281 set(InlineCostFeatureIndex::is_multiple_blocks, 1);
1282 Threshold -= SingleBBBonus;
1283 }
1284
1285 InlineResult finalizeAnalysis() override {
1286 auto *Caller = CandidateCall.getFunction();
1287 if (Caller->hasMinSize()) {
1288 DominatorTree DT(F);
1289 LoopInfo LI(DT);
1290 for (Loop *L : LI) {
1291 // Ignore loops that will not be executed
1292 if (DeadBlocks.count(L->getHeader()))
1293 continue;
1294 increment(InlineCostFeatureIndex::num_loops,
// NOTE(review): original line 1295 (the increment-amount argument of this
// call) is missing from this extraction.
1296 }
1297 }
1298 set(InlineCostFeatureIndex::dead_blocks, DeadBlocks.size());
1299 set(InlineCostFeatureIndex::simplified_instructions,
1300 NumInstructionsSimplified);
1301 set(InlineCostFeatureIndex::constant_args, NumConstantArgs);
1302 set(InlineCostFeatureIndex::constant_offset_ptr_args,
1303 NumConstantOffsetPtrArgs);
1304 set(InlineCostFeatureIndex::sroa_savings, SROACostSavingOpportunities);
1305
1306 if (NumVectorInstructions <= NumInstructions / 10)
1307 Threshold -= VectorBonus;
1308 else if (NumVectorInstructions <= NumInstructions / 2)
1309 Threshold -= VectorBonus / 2;
1310
1311 set(InlineCostFeatureIndex::threshold, Threshold);
1312
1313 return InlineResult::success();
1314 }
1315
// The features analyzer always walks the whole function; it never stops
// early on cost.
1316 bool shouldStop() override { return false; }
1317
1318 void onLoadEliminationOpportunity() override {
1319 increment(InlineCostFeatureIndex::load_elimination, 1);
1320 }
1321
1322 InlineResult onAnalysisStart() override {
1323 increment(InlineCostFeatureIndex::callsite_cost,
1324 -1 * getCallsiteCost(TTI, this->CandidateCall, DL));
1325
1326 set(InlineCostFeatureIndex::cold_cc_penalty,
1327 (F.getCallingConv() == CallingConv::Cold));
1328
1329 set(InlineCostFeatureIndex::last_call_to_static_bonus,
1330 isSoleCallToLocalFunction(CandidateCall, F));
1331
1332 // FIXME: we shouldn't repeat this logic in both the Features and Cost
1333 // analyzer - instead, we should abstract it to a common method in the
1334 // CallAnalyzer
1335 int SingleBBBonusPercent = 50;
1336 int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
1337 Threshold += TTI.adjustInliningThreshold(&CandidateCall);
1338 Threshold *= TTI.getInliningThresholdMultiplier();
1339 SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
1340 VectorBonus = Threshold * VectorBonusPercent / 100;
1341 Threshold += (SingleBBBonus + VectorBonus);
1342
1343 return InlineResult::success();
1344 }
1345
1346public:
1347 InlineCostFeaturesAnalyzer(
1348 const TargetTransformInfo &TTI,
1349 function_ref<AssumptionCache &(Function &)> &GetAssumptionCache,
// NOTE(review): original lines 1350-1351 (the remaining constructor
// parameters, including the Function &Callee forwarded below) are missing
// from this extraction.
1352 CallBase &Call)
1353 : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, PSI) {}
1354
// Read-only access to the accumulated feature vector.
1355 const InlineCostFeatures &features() const { return Cost; }
1356};
1357
1358} // namespace
1359
1360/// Test whether the given value is an Alloca-derived function argument.
1361bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
1362 return SROAArgValues.count(V);
1363}
1364
// Fully disable SROA for the given candidate alloca: notify the concrete
// analyzer (for its cost bookkeeping), drop the alloca from the enabled set,
// and turn off load elimination.
1365 void CallAnalyzer::disableSROAForArg(AllocaInst *SROAArg) {
1366 onDisableSROA(SROAArg);
1367 EnabledSROAAllocas.erase(SROAArg);
1368 disableLoadElimination();
1369 }
1370
1371 void InlineCostAnnotationWriter::emitInstructionAnnot(
// NOTE(review): original line 1372 (the parameter-list continuation,
// presumably `const Instruction *I, formatted_raw_ostream &OS) {`) is
// missing from this extraction.
1373 // The cost of inlining of the given instruction is printed always.
1374 // The threshold delta is printed only when it is non-zero. It happens
1375 // when we decided to give a bonus at a particular instruction.
1376 std::optional<InstructionCostDetail> Record = ICCA->getCostDetails(I);
1377 if (!Record)
1378 OS << "; No analysis for the instruction";
1379 else {
1380 OS << "; cost before = " << Record->CostBefore
1381 << ", cost after = " << Record->CostAfter
1382 << ", threshold before = " << Record->ThresholdBefore
1383 << ", threshold after = " << Record->ThresholdAfter << ", ";
1384 OS << "cost delta = " << Record->getCostDelta();
1385 if (Record->hasThresholdChanged())
1386 OS << ", threshold delta = " << Record->getThresholdDelta();
1387 }
// Also print the value this instruction was simplified to, if any.
1388 auto C = ICCA->getSimplifiedValue(const_cast<Instruction *>(I));
1389 if (C) {
1390 OS << ", simplified to ";
1391 (*C)->print(OS, true);
1392 }
1393 OS << "\n";
1394 }
1395
1396/// If 'V' maps to a SROA candidate, disable SROA for it.
1397void CallAnalyzer::disableSROA(Value *V) {
1398 if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
1399 disableSROAForArg(SROAArg);
1400 }
1401}
1402
1403void CallAnalyzer::disableLoadElimination() {
1404 if (EnableLoadElimination) {
1405 onDisableLoadElimination();
1406 EnableLoadElimination = false;
1407 }
1408}
1409
1410 /// Accumulate a constant GEP offset into an APInt if possible.
1411 ///
1412 /// Returns false if unable to compute the offset for any reason. Respects any
1413 /// simplified values known during the analysis of this callsite.
1414 bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
1415 unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(GEP.getType());
1416 assert(IntPtrWidth == Offset.getBitWidth());
1417
// NOTE(review): original line 1418 (the first half of this loop header,
// which initializes GTI/GTE -- presumably via gep_type_begin/gep_type_end)
// is missing from this extraction.
1419 GTI != GTE; ++GTI) {
// Each index must be a ConstantInt, either directly or via SimplifiedValues.
1420 ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
1421 if (!OpC)
1422 if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
1423 OpC = dyn_cast<ConstantInt>(SimpleOp);
1424 if (!OpC)
1425 return false;
1426 if (OpC->isZero())
1427 continue;
1428
1429 // Handle a struct index, which adds its field offset to the pointer.
1430 if (StructType *STy = GTI.getStructTypeOrNull()) {
1431 unsigned ElementIdx = OpC->getZExtValue();
1432 const StructLayout *SL = DL.getStructLayout(STy);
1433 Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
1434 continue;
1435 }
1436
// Sequential (array/vector) index: scale by the element stride.
1437 APInt TypeSize(IntPtrWidth, GTI.getSequentialElementStride(DL));
1438 Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
1439 }
1440 return true;
1441 }
1442
1443 /// Use TTI to check whether a GEP is free.
1444 ///
1445 /// Respects any simplified values known during the analysis of this callsite.
1446 bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
// NOTE(review): original line 1447 (the declaration of Operands, presumably a
// small vector of Value*) is missing from this extraction.
1448 Operands.push_back(GEP.getOperand(0));
// Substitute simplified constants for indices where known, so TTI sees the
// most-folded form of the GEP.
1449 for (const Use &Op : GEP.indices())
1450 if (Constant *SimpleOp = SimplifiedValues.lookup(Op))
1451 Operands.push_back(SimpleOp);
1452 else
1453 Operands.push_back(Op);
// NOTE(review): original lines 1454-1456 (the return statement, presumably
// comparing the TTI cost of this GEP against TCC_Free) are missing from this
// extraction.
1457 }
1458
1459 bool CallAnalyzer::visitAlloca(AllocaInst &I) {
// An alloca's operand can never be SROA'd through this instruction.
1460 disableSROA(I.getOperand(0));
1461
1462 // Check whether inlining will turn a dynamic alloca into a static
1463 // alloca and handle that case.
1464 if (I.isArrayAllocation()) {
1465 Constant *Size = SimplifiedValues.lookup(I.getArraySize());
1466 if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
1467 // Sometimes a dynamic alloca could be converted into a static alloca
1468 // after this constant prop, and become a huge static alloca on an
1469 // unconditional CFG path. Avoid inlining if this is going to happen above
1470 // a threshold.
1471 // FIXME: If the threshold is removed or lowered too much, we could end up
1472 // being too pessimistic and prevent inlining non-problematic code. This
1473 // could result in unintended perf regressions. A better overall strategy
1474 // is needed to track stack usage during inlining.
1475 Type *Ty = I.getAllocatedType();
1476 AllocatedSize = SaturatingMultiplyAdd(
1477 AllocSize->getLimitedValue(),
1478 DL.getTypeAllocSize(Ty).getKnownMinValue(), AllocatedSize);
// NOTE(review): original line 1479 (presumably the size-threshold check
// guarding the dynamic-alloca marking below) is missing from this
// extraction.
1480 HasDynamicAlloca = true;
1481 return false;
1482 }
1483 }
1484
1485 // Accumulate the allocated size.
1486 if (I.isStaticAlloca()) {
1487 Type *Ty = I.getAllocatedType();
1488 AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty).getKnownMinValue(),
1489 AllocatedSize);
1490 }
1491
1492 // FIXME: This is overly conservative. Dynamic allocas are inefficient for
1493 // a variety of reasons, and so we would like to not inline them into
1494 // functions which don't currently have a dynamic alloca. This simply
1495 // disables inlining altogether in the presence of a dynamic alloca.
1496 if (!I.isStaticAlloca())
1497 HasDynamicAlloca = true;
1498
1499 return false;
1500 }
1501
// Returning true treats the PHI as free; additionally, try to map the PHI to
// a single constant or constant-offset pointer when all live incoming values
// agree.
1502 bool CallAnalyzer::visitPHI(PHINode &I) {
1503 // FIXME: We need to propagate SROA *disabling* through phi nodes, even
1504 // though we don't want to propagate it's bonuses. The idea is to disable
1505 // SROA if it *might* be used in an inappropriate manner.
1506
1507 // Phi nodes are always zero-cost.
1508 // FIXME: Pointer sizes may differ between different address spaces, so do we
1509 // need to use correct address space in the call to getPointerSizeInBits here?
1510 // Or could we skip the getPointerSizeInBits call completely? As far as I can
1511 // see the ZeroOffset is used as a dummy value, so we can probably use any
1512 // bit width for the ZeroOffset?
1513 APInt ZeroOffset = APInt::getZero(DL.getPointerSizeInBits(0));
1514 bool CheckSROA = I.getType()->isPointerTy();
1515
1516 // Track the constant or pointer with constant offset we've seen so far.
1517 Constant *FirstC = nullptr;
1518 std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
1519 Value *FirstV = nullptr;
1520
1521 for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
1522 BasicBlock *Pred = I.getIncomingBlock(i);
1523 // If the incoming block is dead, skip the incoming block.
1524 if (DeadBlocks.count(Pred))
1525 continue;
1526 // If the parent block of phi is not the known successor of the incoming
1527 // block, skip the incoming block.
1528 BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
1529 if (KnownSuccessor && KnownSuccessor != I.getParent())
1530 continue;
1531
1532 Value *V = I.getIncomingValue(i);
1533 // If the incoming value is this phi itself, skip the incoming value.
1534 if (&I == V)
1535 continue;
1536
1537 Constant *C = dyn_cast<Constant>(V);
1538 if (!C)
1539 C = SimplifiedValues.lookup(V);
1540
1541 std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
1542 if (!C && CheckSROA)
1543 BaseAndOffset = ConstantOffsetPtrs.lookup(V);
1544
1545 if (!C && !BaseAndOffset.first)
1546 // The incoming value is neither a constant nor a pointer with constant
1547 // offset, exit early.
1548 return true;
1549
1550 if (FirstC) {
1551 if (FirstC == C)
1552 // If we've seen a constant incoming value before and it is the same
1553 // constant we see this time, continue checking the next incoming value.
1554 continue;
1555 // Otherwise early exit because we either see a different constant or saw
1556 // a constant before but we have a pointer with constant offset this time.
1557 return true;
1558 }
1559
1560 if (FirstV) {
1561 // The same logic as above, but check pointer with constant offset here.
1562 if (FirstBaseAndOffset == BaseAndOffset)
1563 continue;
1564 return true;
1565 }
1566
1567 if (C) {
1568 // This is the 1st time we've seen a constant, record it.
1569 FirstC = C;
1570 continue;
1571 }
1572
1573 // The remaining case is that this is the 1st time we've seen a pointer with
1574 // constant offset, record it.
1575 FirstV = V;
1576 FirstBaseAndOffset = BaseAndOffset;
1577 }
1578
1579 // Check if we can map phi to a constant.
1580 if (FirstC) {
1581 SimplifiedValues[&I] = FirstC;
1582 return true;
1583 }
1584
1585 // Check if we can map phi to a pointer with constant offset.
1586 if (FirstBaseAndOffset.first) {
1587 ConstantOffsetPtrs[&I] = FirstBaseAndOffset;
1588
// The PHI inherits SROA candidacy from the pointer it forwards.
1589 if (auto *SROAArg = getSROAArgForValueOrNull(FirstV))
1590 SROAArgValues[&I] = SROAArg;
1591 }
1592
1593 return true;
1594 }
1595
1596/// Check we can fold GEPs of constant-offset call site argument pointers.
1597/// This requires target data and inbounds GEPs.
1598///
1599/// \return true if the specified GEP can be folded.
1600bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
1601 // Check if we have a base + offset for the pointer.
1602 std::pair<Value *, APInt> BaseAndOffset =
1603 ConstantOffsetPtrs.lookup(I.getPointerOperand());
1604 if (!BaseAndOffset.first)
1605 return false;
1606
1607 // Check if the offset of this GEP is constant, and if so accumulate it
1608 // into Offset.
1609 if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
1610 return false;
1611
1612 // Add the result as a new mapping to Base + Offset.
1613 ConstantOffsetPtrs[&I] = BaseAndOffset;
1614
1615 return true;
1616}
1617
1618 bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
1619 auto *SROAArg = getSROAArgForValueOrNull(I.getPointerOperand());
1620
1621 // Lambda to check whether a GEP's indices are all constant.
1622 auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
1623 for (const Use &Op : GEP.indices())
1624 if (!isa<Constant>(Op) && !SimplifiedValues.lookup(Op))
1625 return false;
1626 return true;
1627 };
1628
// NOTE(review): original lines 1629-1630 (presumably a simplification guard
// whose success makes the GEP free via this early return) are missing from
// this extraction.
1631 return true;
1632
1633 if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
// A constant-offset GEP keeps its pointer's SROA candidacy alive.
1634 if (SROAArg)
1635 SROAArgValues[&I] = SROAArg;
1636
1637 // Constant GEPs are modeled as free.
1638 return true;
1639 }
1640
1641 // Variable GEPs will require math and will disable SROA.
1642 if (SROAArg)
1643 disableSROAForArg(SROAArg);
1644 return isGEPFree(I);
1645 }
1646
1647 /// Simplify \p I if its operands are constants and update SimplifiedValues.
1648 bool CallAnalyzer::simplifyInstruction(Instruction &I) {
// NOTE(review): original line 1649 (the declaration of COps, presumably a
// SmallVector<Constant *>) is missing from this extraction.
// Every operand must be a constant, either directly or via SimplifiedValues.
1650 for (Value *Op : I.operands()) {
1651 Constant *COp = dyn_cast<Constant>(Op);
1652 if (!COp)
1653 COp = SimplifiedValues.lookup(Op);
1654 if (!COp)
1655 return false;
1656 COps.push_back(COp);
1657 }
1658 auto *C = ConstantFoldInstOperands(&I, COps, DL);
1659 if (!C)
1660 return false;
1661 SimplifiedValues[&I] = C;
1662 return true;
1663 }
1664
1665/// Try to simplify a call to llvm.is.constant.
1666///
1667/// Duplicate the argument checking from CallAnalyzer::simplifyCallSite since
1668/// we expect calls of this specific intrinsic to be infrequent.
1669///
1670/// FIXME: Given that we know CB's parent (F) caller
1671/// (CandidateCall->getParent()->getParent()), we might be able to determine
1672/// whether inlining F into F's caller would change how the call to
1673/// llvm.is.constant would evaluate.
1674bool CallAnalyzer::simplifyIntrinsicCallIsConstant(CallBase &CB) {
1675 Value *Arg = CB.getArgOperand(0);
1676 auto *C = dyn_cast<Constant>(Arg);
1677
1678 if (!C)
1679 C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(Arg));
1680
1681 Type *RT = CB.getFunctionType()->getReturnType();
1682 SimplifiedValues[&CB] = ConstantInt::get(RT, C ? 1 : 0);
1683 return true;
1684}
1685
1686bool CallAnalyzer::simplifyIntrinsicCallObjectSize(CallBase &CB) {
1687 // As per the langref, "The fourth argument to llvm.objectsize determines if
1688 // the value should be evaluated at runtime."
1689 if (cast<ConstantInt>(CB.getArgOperand(3))->isOne())
1690 return false;
1691
1692 Value *V = lowerObjectSizeCall(&cast<IntrinsicInst>(CB), DL, nullptr,
1693 /*MustSucceed=*/true);
1694 Constant *C = dyn_cast_or_null<Constant>(V);
1695 if (C)
1696 SimplifiedValues[&CB] = C;
1697 return C;
1698}
1699
1700 bool CallAnalyzer::visitBitCast(BitCastInst &I) {
1701 // Propagate constants through bitcasts.
// NOTE(review): original line 1702 (the guard for this early return,
// presumably `if (simplifyInstruction(I))`) is missing from this extraction.
1703 return true;
1704
1705 // Track base/offsets through casts
1706 std::pair<Value *, APInt> BaseAndOffset =
1707 ConstantOffsetPtrs.lookup(I.getOperand(0));
1708 // Casts don't change the offset, just wrap it up.
1709 if (BaseAndOffset.first)
1710 ConstantOffsetPtrs[&I] = BaseAndOffset;
1711
1712 // Also look for SROA candidates here.
1713 if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
1714 SROAArgValues[&I] = SROAArg;
1715
1716 // Bitcasts are always zero cost.
1717 return true;
1718 }
1719
1720 bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
1721 // Propagate constants through ptrtoint.
// NOTE(review): original line 1722 (the guard for this early return,
// presumably `if (simplifyInstruction(I))`) is missing from this extraction.
1723 return true;
1724
1725 // Track base/offset pairs when converted to a plain integer provided the
1726 // integer is large enough to represent the pointer.
1727 unsigned IntegerSize = I.getType()->getScalarSizeInBits();
1728 unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace();
1729 if (IntegerSize == DL.getPointerSizeInBits(AS)) {
1730 std::pair<Value *, APInt> BaseAndOffset =
1731 ConstantOffsetPtrs.lookup(I.getOperand(0));
1732 if (BaseAndOffset.first)
1733 ConstantOffsetPtrs[&I] = BaseAndOffset;
1734 }
1735
1736 // This is really weird. Technically, ptrtoint will disable SROA. However,
1737 // unless that ptrtoint is *used* somewhere in the live basic blocks after
1738 // inlining, it will be nuked, and SROA should proceed. All of the uses which
1739 // would block SROA would also block SROA if applied directly to a pointer,
1740 // and so we can just add the integer in here. The only places where SROA is
1741 // preserved either cannot fire on an integer, or won't in-and-of themselves
1742 // disable SROA (ext) w/o some later use that we would see and disable.
1743 if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
1744 SROAArgValues[&I] = SROAArg;
1745
// NOTE(review): original lines 1746-1747 (the function's final return
// statement) are missing from this extraction.
1748 }
1749
1750 bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
1751 // Propagate constants through ptrtoint.
// NOTE(review): original line 1752 (the guard for this early return,
// presumably `if (simplifyInstruction(I))`) is missing from this extraction.
1753 return true;
1754
1755 // Track base/offset pairs when round-tripped through a pointer without
1756 // modifications provided the integer is not too large.
1757 Value *Op = I.getOperand(0);
1758 unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
1759 if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
1760 std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
1761 if (BaseAndOffset.first)
1762 ConstantOffsetPtrs[&I] = BaseAndOffset;
1763 }
1764
1765 // "Propagate" SROA here in the same manner as we do for ptrtoint above.
1766 if (auto *SROAArg = getSROAArgForValueOrNull(Op))
1767 SROAArgValues[&I] = SROAArg;
1768
// NOTE(review): original lines 1769-1770 (the function's final return
// statement) are missing from this extraction.
1771 }
1772
1773bool CallAnalyzer::visitCastInst(CastInst &I) {
1774 // Propagate constants through casts.
1776 return true;
1777
1778 // Disable SROA in the face of arbitrary casts we don't explicitly list
1779 // elsewhere.
1780 disableSROA(I.getOperand(0));
1781
1782 // If this is a floating-point cast, and the target says this operation
1783 // is expensive, this may eventually become a library call. Treat the cost
1784 // as such.
1785 switch (I.getOpcode()) {
1786 case Instruction::FPTrunc:
1787 case Instruction::FPExt:
1788 case Instruction::UIToFP:
1789 case Instruction::SIToFP:
1790 case Instruction::FPToUI:
1791 case Instruction::FPToSI:
1793 onCallPenalty();
1794 break;
1795 default:
1796 break;
1797 }
1798
1801}
1802
1803bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
1804 return CandidateCall.paramHasAttr(A->getArgNo(), Attr);
1805}
1806
1807bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
1808 // Does the *call site* have the NonNull attribute set on an argument? We
1809 // use the attribute on the call site to memoize any analysis done in the
1810 // caller. This will also trip if the callee function has a non-null
1811 // parameter attribute, but that's a less interesting case because hopefully
1812 // the callee would already have been simplified based on that.
1813 if (Argument *A = dyn_cast<Argument>(V))
1814 if (paramHasAttr(A, Attribute::NonNull))
1815 return true;
1816
1817 // Is this an alloca in the caller? This is distinct from the attribute case
1818 // above because attributes aren't updated within the inliner itself and we
1819 // always want to catch the alloca derived case.
1820 if (isAllocaDerivedArg(V))
1821 // We can actually predict the result of comparisons between an
1822 // alloca-derived value and null. Note that this fires regardless of
1823 // SROA firing.
1824 return true;
1825
1826 return false;
1827}
1828
1829bool CallAnalyzer::allowSizeGrowth(CallBase &Call) {
1830 // If the normal destination of the invoke or the parent block of the call
1831 // site is unreachable-terminated, there is little point in inlining this
1832 // unless there is literally zero cost.
1833 // FIXME: Note that it is possible that an unreachable-terminated block has a
1834 // hot entry. For example, in below scenario inlining hot_call_X() may be
1835 // beneficial :
1836 // main() {
1837 // hot_call_1();
1838 // ...
1839 // hot_call_N()
1840 // exit(0);
1841 // }
1842 // For now, we are not handling this corner case here as it is rare in real
1843 // code. In future, we should elaborate this based on BPI and BFI in more
1844 // general threshold adjusting heuristics in updateThreshold().
1845 if (InvokeInst *II = dyn_cast<InvokeInst>(&Call)) {
1846 if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
1847 return false;
1848 } else if (isa<UnreachableInst>(Call.getParent()->getTerminator()))
1849 return false;
1850
1851 return true;
1852}
1853
1854bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call,
1855 BlockFrequencyInfo *CallerBFI) {
1856 // If global profile summary is available, then callsite's coldness is
1857 // determined based on that.
1858 if (PSI && PSI->hasProfileSummary())
1859 return PSI->isColdCallSite(Call, CallerBFI);
1860
1861 // Otherwise we need BFI to be available.
1862 if (!CallerBFI)
1863 return false;
1864
1865 // Determine if the callsite is cold relative to caller's entry. We could
1866 // potentially cache the computation of scaled entry frequency, but the added
1867 // complexity is not worth it unless this scaling shows up high in the
1868 // profiles.
1869 const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
1870 auto CallSiteBB = Call.getParent();
1871 auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
1872 auto CallerEntryFreq =
1873 CallerBFI->getBlockFreq(&(Call.getCaller()->getEntryBlock()));
1874 return CallSiteFreq < CallerEntryFreq * ColdProb;
1875}
1876
1877std::optional<int>
1878InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
1879 BlockFrequencyInfo *CallerBFI) {
1880
1881 // If global profile summary is available, then callsite's hotness is
1882 // determined based on that.
1883 if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(Call, CallerBFI))
1884 return Params.HotCallSiteThreshold;
1885
1886 // Otherwise we need BFI to be available and to have a locally hot callsite
1887 // threshold.
1888 if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
1889 return std::nullopt;
1890
1891 // Determine if the callsite is hot relative to caller's entry. We could
1892 // potentially cache the computation of scaled entry frequency, but the added
1893 // complexity is not worth it unless this scaling shows up high in the
1894 // profiles.
1895 const BasicBlock *CallSiteBB = Call.getParent();
1896 BlockFrequency CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
1897 BlockFrequency CallerEntryFreq = CallerBFI->getEntryFreq();
1898 std::optional<BlockFrequency> Limit = CallerEntryFreq.mul(HotCallSiteRelFreq);
1899 if (Limit && CallSiteFreq >= *Limit)
1900 return Params.LocallyHotCallSiteThreshold;
1901
1902 // Otherwise treat it normally.
1903 return std::nullopt;
1904}
1905
1906void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
1907 // If no size growth is allowed for this inlining, set Threshold to 0.
1908 if (!allowSizeGrowth(Call)) {
1909 Threshold = 0;
1910 return;
1911 }
1912
1913 Function *Caller = Call.getCaller();
1914
1915 // return min(A, B) if B is valid.
1916 auto MinIfValid = [](int A, std::optional<int> B) {
1917 return B ? std::min(A, *B) : A;
1918 };
1919
1920 // return max(A, B) if B is valid.
1921 auto MaxIfValid = [](int A, std::optional<int> B) {
1922 return B ? std::max(A, *B) : A;
1923 };
1924
1925 // Various bonus percentages. These are multiplied by Threshold to get the
1926 // bonus values.
1927 // SingleBBBonus: This bonus is applied if the callee has a single reachable
1928 // basic block at the given callsite context. This is speculatively applied
1929 // and withdrawn if more than one basic block is seen.
1930 //
1931 // LstCallToStaticBonus: This large bonus is applied to ensure the inlining
1932 // of the last call to a static function as inlining such functions is
1933 // guaranteed to reduce code size.
1934 //
1935 // These bonus percentages may be set to 0 based on properties of the caller
1936 // and the callsite.
1937 int SingleBBBonusPercent = 50;
1938 int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
1940
1941 // Lambda to set all the above bonus and bonus percentages to 0.
1942 auto DisallowAllBonuses = [&]() {
1943 SingleBBBonusPercent = 0;
1944 VectorBonusPercent = 0;
1946 };
1947
1948 // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
1949 // and reduce the threshold if the caller has the necessary attribute.
1950 if (Caller->hasMinSize()) {
1951 Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
1952 // For minsize, we want to disable the single BB bonus and the vector
1953 // bonuses, but not the last-call-to-static bonus. Inlining the last call to
1954 // a static function will, at the minimum, eliminate the parameter setup and
1955 // call/return instructions.
1956 SingleBBBonusPercent = 0;
1957 VectorBonusPercent = 0;
1958 } else if (Caller->hasOptSize())
1959 Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);
1960
1961 // Adjust the threshold based on inlinehint attribute and profile based
1962 // hotness information if the caller does not have MinSize attribute.
1963 if (!Caller->hasMinSize()) {
1964 if (Callee.hasFnAttribute(Attribute::InlineHint))
1965 Threshold = MaxIfValid(Threshold, Params.HintThreshold);
1966
1967 // FIXME: After switching to the new passmanager, simplify the logic below
1968 // by checking only the callsite hotness/coldness as we will reliably
1969 // have local profile information.
1970 //
1971 // Callsite hotness and coldness can be determined if sample profile is
1972 // used (which adds hotness metadata to calls) or if caller's
1973 // BlockFrequencyInfo is available.
1974 BlockFrequencyInfo *CallerBFI = GetBFI ? &(GetBFI(*Caller)) : nullptr;
1975 auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI);
1976 if (!Caller->hasOptSize() && HotCallSiteThreshold) {
1977 LLVM_DEBUG(dbgs() << "Hot callsite.\n");
1978 // FIXME: This should update the threshold only if it exceeds the
1979 // current threshold, but AutoFDO + ThinLTO currently relies on this
1980 // behavior to prevent inlining of hot callsites during ThinLTO
1981 // compile phase.
1982 Threshold = *HotCallSiteThreshold;
1983 } else if (isColdCallSite(Call, CallerBFI)) {
1984 LLVM_DEBUG(dbgs() << "Cold callsite.\n");
1985 // Do not apply bonuses for a cold callsite including the
1986 // LastCallToStatic bonus. While this bonus might result in code size
1987 // reduction, it can cause the size of a non-cold caller to increase
1988 // preventing it from being inlined.
1989 DisallowAllBonuses();
1990 Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
1991 } else if (PSI) {
1992 // Use callee's global profile information only if we have no way of
1993 // determining this via callsite information.
1994 if (PSI->isFunctionEntryHot(&Callee)) {
1995 LLVM_DEBUG(dbgs() << "Hot callee.\n");
1996 // If callsite hotness can not be determined, we may still know
1997 // that the callee is hot and treat it as a weaker hint for threshold
1998 // increase.
1999 Threshold = MaxIfValid(Threshold, Params.HintThreshold);
2000 } else if (PSI->isFunctionEntryCold(&Callee)) {
2001 LLVM_DEBUG(dbgs() << "Cold callee.\n");
2002 // Do not apply bonuses for a cold callee including the
2003 // LastCallToStatic bonus. While this bonus might result in code size
2004 // reduction, it can cause the size of a non-cold caller to increase
2005 // preventing it from being inlined.
2006 DisallowAllBonuses();
2007 Threshold = MinIfValid(Threshold, Params.ColdThreshold);
2008 }
2009 }
2010 }
2011
2012 Threshold += TTI.adjustInliningThreshold(&Call);
2013
2014 // Finally, take the target-specific inlining threshold multiplier into
2015 // account.
2016 Threshold *= TTI.getInliningThresholdMultiplier();
2017
2018 SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
2019 VectorBonus = Threshold * VectorBonusPercent / 100;
2020
2021 // If there is only one call of the function, and it has internal linkage,
2022 // the cost of inlining it drops dramatically. It may seem odd to update
2023 // Cost in updateThreshold, but the bonus depends on the logic in this method.
2024 if (isSoleCallToLocalFunction(Call, F)) {
2026 StaticBonusApplied = LastCallToStaticBonus;
2027 }
2028}
2029
2030bool CallAnalyzer::visitCmpInst(CmpInst &I) {
2031 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
2032 // First try to handle simplified comparisons.
2034 return true;
2035
2036 if (I.getOpcode() == Instruction::FCmp)
2037 return false;
2038
2039 // Otherwise look for a comparison between constant offset pointers with
2040 // a common base.
2041 Value *LHSBase, *RHSBase;
2042 APInt LHSOffset, RHSOffset;
2043 std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
2044 if (LHSBase) {
2045 std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
2046 if (RHSBase && LHSBase == RHSBase) {
2047 // We have common bases, fold the icmp to a constant based on the
2048 // offsets.
2049 Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
2050 Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
2051 if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
2052 SimplifiedValues[&I] = C;
2053 ++NumConstantPtrCmps;
2054 return true;
2055 }
2056 }
2057 }
2058
2059 auto isImplicitNullCheckCmp = [](const CmpInst &I) {
2060 for (auto *User : I.users())
2061 if (auto *Instr = dyn_cast<Instruction>(User))
2062 if (!Instr->getMetadata(LLVMContext::MD_make_implicit))
2063 return false;
2064 return true;
2065 };
2066
2067 // If the comparison is an equality comparison with null, we can simplify it
2068 // if we know the value (argument) can't be null
2069 if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1))) {
2070 if (isKnownNonNullInCallee(I.getOperand(0))) {
2071 bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
2072 SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
2074 return true;
2075 }
2076 // Implicit null checks act as unconditional branches and their comparisons
2077 // should be treated as simplified and free of cost.
2078 if (isImplicitNullCheckCmp(I))
2079 return true;
2080 }
2081 return handleSROA(I.getOperand(0), isa<ConstantPointerNull>(I.getOperand(1)));
2082}
2083
2084bool CallAnalyzer::visitSub(BinaryOperator &I) {
2085 // Try to handle a special case: we can fold computing the difference of two
2086 // constant-related pointers.
2087 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
2088 Value *LHSBase, *RHSBase;
2089 APInt LHSOffset, RHSOffset;
2090 std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
2091 if (LHSBase) {
2092 std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
2093 if (RHSBase && LHSBase == RHSBase) {
2094 // We have common bases, fold the subtract to a constant based on the
2095 // offsets.
2096 Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
2097 Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
2098 if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
2099 SimplifiedValues[&I] = C;
2100 ++NumConstantPtrDiffs;
2101 return true;
2102 }
2103 }
2104 }
2105
2106 // Otherwise, fall back to the generic logic for simplifying and handling
2107 // instructions.
2108 return Base::visitSub(I);
2109}
2110
2111bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
2112 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
2113 Constant *CLHS = dyn_cast<Constant>(LHS);
2114 if (!CLHS)
2115 CLHS = SimplifiedValues.lookup(LHS);
2116 Constant *CRHS = dyn_cast<Constant>(RHS);
2117 if (!CRHS)
2118 CRHS = SimplifiedValues.lookup(RHS);
2119
2120 Value *SimpleV = nullptr;
2121 if (auto FI = dyn_cast<FPMathOperator>(&I))
2122 SimpleV = simplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS,
2123 FI->getFastMathFlags(), DL);
2124 else
2125 SimpleV =
2126 simplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);
2127
2128 if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
2129 SimplifiedValues[&I] = C;
2130
2131 if (SimpleV)
2132 return true;
2133
2134 // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
2135 disableSROA(LHS);
2136 disableSROA(RHS);
2137
2138 // If the instruction is floating point, and the target says this operation
2139 // is expensive, this may eventually become a library call. Treat the cost
2140 // as such. Unless it's fneg which can be implemented with an xor.
2141 using namespace llvm::PatternMatch;
2142 if (I.getType()->isFloatingPointTy() &&
2144 !match(&I, m_FNeg(m_Value())))
2145 onCallPenalty();
2146
2147 return false;
2148}
2149
2150bool CallAnalyzer::visitFNeg(UnaryOperator &I) {
2151 Value *Op = I.getOperand(0);
2152 Constant *COp = dyn_cast<Constant>(Op);
2153 if (!COp)
2154 COp = SimplifiedValues.lookup(Op);
2155
2156 Value *SimpleV = simplifyFNegInst(
2157 COp ? COp : Op, cast<FPMathOperator>(I).getFastMathFlags(), DL);
2158
2159 if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
2160 SimplifiedValues[&I] = C;
2161
2162 if (SimpleV)
2163 return true;
2164
2165 // Disable any SROA on arguments to arbitrary, unsimplified fneg.
2166 disableSROA(Op);
2167
2168 return false;
2169}
2170
2171bool CallAnalyzer::visitLoad(LoadInst &I) {
2172 if (handleSROA(I.getPointerOperand(), I.isSimple()))
2173 return true;
2174
2175 // If the data is already loaded from this address and hasn't been clobbered
2176 // by any stores or calls, this load is likely to be redundant and can be
2177 // eliminated.
2178 if (EnableLoadElimination &&
2179 !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
2180 onLoadEliminationOpportunity();
2181 return true;
2182 }
2183
2184 onMemAccess();
2185 return false;
2186}
2187
2188bool CallAnalyzer::visitStore(StoreInst &I) {
2189 if (handleSROA(I.getPointerOperand(), I.isSimple()))
2190 return true;
2191
2192 // The store can potentially clobber loads and prevent repeated loads from
2193 // being eliminated.
2194 // FIXME:
2195 // 1. We can probably keep an initial set of eliminatable loads substracted
2196 // from the cost even when we finally see a store. We just need to disable
2197 // *further* accumulation of elimination savings.
2198 // 2. We should probably at some point thread MemorySSA for the callee into
2199 // this and then use that to actually compute *really* precise savings.
2200 disableLoadElimination();
2201
2202 onMemAccess();
2203 return false;
2204}
2205
2206bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
2207 // Constant folding for extract value is trivial.
2209 return true;
2210
2211 // SROA can't look through these, but they may be free.
2212 return Base::visitExtractValue(I);
2213}
2214
2215bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
2216 // Constant folding for insert value is trivial.
2218 return true;
2219
2220 // SROA can't look through these, but they may be free.
2221 return Base::visitInsertValue(I);
2222}
2223
2224/// Try to simplify a call site.
2225///
2226/// Takes a concrete function and callsite and tries to actually simplify it by
2227/// analyzing the arguments and call itself with instsimplify. Returns true if
2228/// it has simplified the callsite to some other entity (a constant), making it
2229/// free.
2230bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) {
2231 // FIXME: Using the instsimplify logic directly for this is inefficient
2232 // because we have to continually rebuild the argument list even when no
2233 // simplifications can be performed. Until that is fixed with remapping
2234 // inside of instsimplify, directly constant fold calls here.
2235 if (!canConstantFoldCallTo(&Call, F))
2236 return false;
2237
2238 // Try to re-map the arguments to constants.
2239 SmallVector<Constant *, 4> ConstantArgs;
2240 ConstantArgs.reserve(Call.arg_size());
2241 for (Value *I : Call.args()) {
2242 Constant *C = dyn_cast<Constant>(I);
2243 if (!C)
2244 C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(I));
2245 if (!C)
2246 return false; // This argument doesn't map to a constant.
2247
2248 ConstantArgs.push_back(C);
2249 }
2250 if (Constant *C = ConstantFoldCall(&Call, F, ConstantArgs)) {
2251 SimplifiedValues[&Call] = C;
2252 return true;
2253 }
2254
2255 return false;
2256}
2257
bool CallAnalyzer::visitCallBase(CallBase &Call) {
  // Give the derived analyzer a chance to fully account for the call itself;
  // a false return means the call has been consumed (treated as free here).
  if (!onCallBaseVisitStart(Call))
    return true;

  // A returns_twice callee (setjmp-style per the attribute's semantics) cannot
  // be inlined into a caller that isn't itself returns_twice.
  if (Call.hasFnAttr(Attribute::ReturnsTwice) &&
      !F.hasFnAttribute(Attribute::ReturnsTwice)) {
    // This aborts the entire analysis.
    ExposesReturnsTwice = true;
    return false;
  }
  if (isa<CallInst>(Call) && cast<CallInst>(Call).cannotDuplicate())
    ContainsNoDuplicateCall = true;

  // NOTE: this local F (the called function) shadows the member F (the
  // function being analyzed) for the remainder of this method.
  Function *F = Call.getCalledFunction();
  bool IsIndirectCall = !F;
  if (IsIndirectCall) {
    // Check if this happens to be an indirect function call to a known function
    // in this inline context. If not, we've done all we can.
    Value *Callee = Call.getCalledOperand();
    F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
    if (!F || F->getFunctionType() != Call.getFunctionType()) {
      onCallArgumentSetup(Call);

      if (!Call.onlyReadsMemory())
        disableLoadElimination();
      return Base::visitCallBase(Call);
    }
  }

  assert(F && "Expected a call to a known function");

  // When we have a concrete function, first try to simplify it directly.
  if (simplifyCallSite(F, Call))
    return true;

  // Next check if it is an intrinsic we know about.
  // FIXME: Lift this into part of the InstVisitor.
  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Call)) {
    switch (II->getIntrinsicID()) {
    default:
      // Unknown intrinsics may clobber memory unless read-only or assume-like.
      if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
        disableLoadElimination();
      return Base::visitCallBase(Call);

    case Intrinsic::load_relative:
      onLoadRelativeIntrinsic();
      return false;

    case Intrinsic::memset:
    case Intrinsic::memcpy:
    case Intrinsic::memmove:
      disableLoadElimination();
      // SROA can usually chew through these intrinsics, but they aren't free.
      return false;
    case Intrinsic::icall_branch_funnel:
    case Intrinsic::localescape:
      HasUninlineableIntrinsic = true;
      return false;
    case Intrinsic::vastart:
      InitsVargArgs = true;
      return false;
    case Intrinsic::launder_invariant_group:
    case Intrinsic::strip_invariant_group:
      // These are transparent for SROA purposes: propagate the SROA argument.
      if (auto *SROAArg = getSROAArgForValueOrNull(II->getOperand(0)))
        SROAArgValues[II] = SROAArg;
      return true;
    case Intrinsic::is_constant:
      return simplifyIntrinsicCallIsConstant(Call);
    case Intrinsic::objectsize:
      return simplifyIntrinsicCallObjectSize(Call);
    }
  }

  // Direct recursion: the callee calls itself.
  if (F == Call.getFunction()) {
    // This flag will fully abort the analysis, so don't bother with anything
    // else.
    IsRecursiveCall = true;
    if (!AllowRecursiveCall)
      return false;
  }

  if (TTI.isLoweredToCall(F)) {
    onLoweredCall(F, Call, IsIndirectCall);
  }

  if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory())))
    disableLoadElimination();
  return Base::visitCallBase(Call);
}
2347
2348bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
2349 // At least one return instruction will be free after inlining.
2350 bool Free = !HasReturn;
2351 HasReturn = true;
2352 return Free;
2353}
2354
2355bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
2356 // We model unconditional branches as essentially free -- they really
2357 // shouldn't exist at all, but handling them makes the behavior of the
2358 // inliner more regular and predictable. Interestingly, conditional branches
2359 // which will fold away are also free.
2360 return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
2361 BI.getMetadata(LLVMContext::MD_make_implicit) ||
2362 isa_and_nonnull<ConstantInt>(
2363 SimplifiedValues.lookup(BI.getCondition()));
2364}
2365
bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
  // Only pointer-typed selects participate in the SROA/offset tracking below.
  bool CheckSROA = SI.getType()->isPointerTy();
  Value *TrueVal = SI.getTrueValue();
  Value *FalseVal = SI.getFalseValue();

  // Use already-simplified constants for the operands when available.
  Constant *TrueC = dyn_cast<Constant>(TrueVal);
  if (!TrueC)
    TrueC = SimplifiedValues.lookup(TrueVal);
  Constant *FalseC = dyn_cast<Constant>(FalseVal);
  if (!FalseC)
    FalseC = SimplifiedValues.lookup(FalseVal);
  Constant *CondC =
      dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));

  if (!CondC) {
    // Select C, X, X => X
    if (TrueC == FalseC && TrueC) {
      SimplifiedValues[&SI] = TrueC;
      return true;
    }

    if (!CheckSROA)
      return Base::visitSelectInst(SI);

    // If both arms carry the same base/offset pair, the select does too,
    // regardless of the (unknown) condition.
    std::pair<Value *, APInt> TrueBaseAndOffset =
        ConstantOffsetPtrs.lookup(TrueVal);
    std::pair<Value *, APInt> FalseBaseAndOffset =
        ConstantOffsetPtrs.lookup(FalseVal);
    if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
      ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;

      if (auto *SROAArg = getSROAArgForValueOrNull(TrueVal))
        SROAArgValues[&SI] = SROAArg;
      return true;
    }

    return Base::visitSelectInst(SI);
  }

  // Select condition is a constant.
  Value *SelectedV = CondC->isAllOnesValue() ? TrueVal
                     : (CondC->isNullValue()) ? FalseVal
                                              : nullptr;
  if (!SelectedV) {
    // Condition is a vector constant that is not all 1s or all 0s. If all
    // operands are constants, ConstantFoldSelectInstruction() can handle the
    // cases such as select vectors.
    if (TrueC && FalseC) {
      if (auto *C = ConstantFoldSelectInstruction(CondC, TrueC, FalseC)) {
        SimplifiedValues[&SI] = C;
        return true;
      }
    }
    return Base::visitSelectInst(SI);
  }

  // Condition is either all 1s or all 0s. SI can be simplified.
  if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
    SimplifiedValues[&SI] = SelectedC;
    return true;
  }

  if (!CheckSROA)
    return true;

  // Propagate the chosen arm's base/offset and SROA candidacy to the select.
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(SelectedV);
  if (BaseAndOffset.first) {
    ConstantOffsetPtrs[&SI] = BaseAndOffset;

    if (auto *SROAArg = getSROAArgForValueOrNull(SelectedV))
      SROAArgValues[&SI] = SROAArg;
  }

  return true;
}
2442
2443bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
2444 // We model unconditional switches as free, see the comments on handling
2445 // branches.
2446 if (isa<ConstantInt>(SI.getCondition()))
2447 return true;
2448 if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
2449 if (isa<ConstantInt>(V))
2450 return true;
2451
2452 // Assume the most general case where the switch is lowered into
2453 // either a jump table, bit test, or a balanced binary tree consisting of
2454 // case clusters without merging adjacent clusters with the same
2455 // destination. We do not consider the switches that are lowered with a mix
2456 // of jump table/bit test/binary search tree. The cost of the switch is
2457 // proportional to the size of the tree or the size of jump table range.
2458 //
2459 // NB: We convert large switches which are just used to initialize large phi
2460 // nodes to lookup tables instead in simplifycfg, so this shouldn't prevent
2461 // inlining those. It will prevent inlining in cases where the optimization
2462 // does not (yet) fire.
2463
2464 unsigned JumpTableSize = 0;
2465 BlockFrequencyInfo *BFI = GetBFI ? &(GetBFI(F)) : nullptr;
2466 unsigned NumCaseCluster =
2467 TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize, PSI, BFI);
2468
2469 onFinalizeSwitch(JumpTableSize, NumCaseCluster, SI.defaultDestUndefined());
2470 return false;
2471}
2472
bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
  // We never want to inline functions that contain an indirectbr. This is
  // incorrect because all the blockaddress's (in static global initializers
  // for example) would be referring to the original function, and this
  // indirect jump would jump from the inlined copy of the function into the
  // original function which is extremely undefined behavior.
  // FIXME: This logic isn't really right; we can safely inline functions with
  // indirectbr's as long as no other function or global references the
  // blockaddress of a block within the current function.
  // Setting this flag causes the overall analysis to fail (see analyzeBlock).
  HasIndirectBr = true;
  return false;
}
2485
bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
  // Returning false charges the default per-instruction cost.
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a resume instruction.
  return false;
}
2491
bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
  // Returning false charges the default per-instruction cost.
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a cleanupret instruction.
  return false;
}
2497
bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
  // Returning false charges the default per-instruction cost.
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a catchret instruction.
  return false;
}
2503
bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
  // FIXME: It might be reasonably to discount the cost of instructions leading
  // to unreachable as they have the lowest possible impact on both runtime and
  // code size.
  return true; // No actual code is needed for unreachable.
}
2510
2511bool CallAnalyzer::visitInstruction(Instruction &I) {
2512 // Some instructions are free. All of the free intrinsics can also be
2513 // handled by SROA, etc.
2516 return true;
2517
2518 // We found something we don't understand or can't handle. Mark any SROA-able
2519 // values in the operand list as no longer viable.
2520 for (const Use &Op : I.operands())
2521 disableSROA(Op);
2522
2523 return false;
2524}
2525
2526/// Analyze a basic block for its contribution to the inline cost.
2527///
2528/// This method walks the analyzer over every instruction in the given basic
2529/// block and accounts for their cost during inlining at this callsite. It
2530/// aborts early if the threshold has been exceeded or an impossible to inline
2531/// construct has been detected. It returns false if inlining is no longer
2532/// viable, and true if inlining remains viable.
2534CallAnalyzer::analyzeBlock(BasicBlock *BB,
2535 SmallPtrSetImpl<const Value *> &EphValues) {
2536 for (Instruction &I : *BB) {
2537 // FIXME: Currently, the number of instructions in a function regardless of
2538 // our ability to simplify them during inline to constants or dead code,
2539 // are actually used by the vector bonus heuristic. As long as that's true,
2540 // we have to special case debug intrinsics here to prevent differences in
2541 // inlining due to debug symbols. Eventually, the number of unsimplified
2542 // instructions shouldn't factor into the cost computation, but until then,
2543 // hack around it here.
2544 // Similarly, skip pseudo-probes.
2545 if (I.isDebugOrPseudoInst())
2546 continue;
2547
2548 // Skip ephemeral values.
2549 if (EphValues.count(&I))
2550 continue;
2551
2552 ++NumInstructions;
2553 if (isa<ExtractElementInst>(I) || I.getType()->isVectorTy())
2554 ++NumVectorInstructions;
2555
2556 // If the instruction simplified to a constant, there is no cost to this
2557 // instruction. Visit the instructions using our InstVisitor to account for
2558 // all of the per-instruction logic. The visit tree returns true if we
2559 // consumed the instruction in any way, and false if the instruction's base
2560 // cost should count against inlining.
2561 onInstructionAnalysisStart(&I);
2562
2563 if (Base::visit(&I))
2564 ++NumInstructionsSimplified;
2565 else
2566 onMissedSimplification();
2567
2568 onInstructionAnalysisFinish(&I);
2569 using namespace ore;
2570 // If the visit this instruction detected an uninlinable pattern, abort.
2572 if (IsRecursiveCall && !AllowRecursiveCall)
2573 IR = InlineResult::failure("recursive");
2574 else if (ExposesReturnsTwice)
2575 IR = InlineResult::failure("exposes returns twice");
2576 else if (HasDynamicAlloca)
2577 IR = InlineResult::failure("dynamic alloca");
2578 else if (HasIndirectBr)
2579 IR = InlineResult::failure("indirect branch");
2580 else if (HasUninlineableIntrinsic)
2581 IR = InlineResult::failure("uninlinable intrinsic");
2582 else if (InitsVargArgs)
2583 IR = InlineResult::failure("varargs");
2584 if (!IR.isSuccess()) {
2585 if (ORE)
2586 ORE->emit([&]() {
2587 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
2588 &CandidateCall)
2589 << NV("Callee", &F) << " has uninlinable pattern ("
2590 << NV("InlineResult", IR.getFailureReason())
2591 << ") and cost is not fully computed";
2592 });
2593 return IR;
2594 }
2595
2596 // If the caller is a recursive function then we don't want to inline
2597 // functions which allocate a lot of stack space because it would increase
2598 // the caller stack usage dramatically.
2599 if (IsCallerRecursive && AllocatedSize > RecurStackSizeThreshold) {
2600 auto IR =
2601 InlineResult::failure("recursive and allocates too much stack space");
2602 if (ORE)
2603 ORE->emit([&]() {
2604 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
2605 &CandidateCall)
2606 << NV("Callee", &F) << " is "
2607 << NV("InlineResult", IR.getFailureReason())
2608 << ". Cost is not fully computed";
2609 });
2610 return IR;
2611 }
2612
2613 if (shouldStop())
2614 return InlineResult::failure(
2615 "Call site analysis is not favorable to inlining.");
2616 }
2617
2618 return InlineResult::success();
2619}
2620
/// Compute the base pointer and cumulative constant offsets for V.
///
/// This strips all constant offsets off of V, leaving it the base pointer, and
/// accumulates the total constant offset applied in the returned constant. It
/// returns 0 if V is not a pointer, and returns the constant '0' if there are
/// no constant offsets applied.
ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
  // Only pointer values can carry a constant offset.
  if (!V->getType()->isPointerTy())
    return nullptr;

  // Accumulate the offset in an APInt sized to the index width of V's
  // address space.
  unsigned AS = V->getType()->getPointerAddressSpace();
  unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
  APInt Offset = APInt::getZero(IntPtrWidth);

  // Even though we don't look through PHI nodes, we could be called on an
  // instruction in an unreachable block, which may be on a cycle.
  // NOTE(review): the declaration of 'Visited' appears to be elided in this
  // listing; it serves as the cycle guard for the loop below.
  Visited.insert(V);
  do {
    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
      // Only inbounds GEPs whose offset folds to a constant are stripped.
      if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
        return nullptr;
      V = GEP->getPointerOperand();
    } else if (Operator::getOpcode(V) == Instruction::BitCast) {
      // Bitcasts are transparent for offset accumulation.
      V = cast<Operator>(V)->getOperand(0);
    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
      // Interposable aliases can be replaced at link time; stop stripping.
      if (GA->isInterposable())
        break;
      V = GA->getAliasee();
    } else {
      break;
    }
    assert(V->getType()->isPointerTy() && "Unexpected operand type!");
  } while (Visited.insert(V).second); // Terminate on revisits (cycles).

  // Return the accumulated offset as a constant of the index type.
  Type *IdxPtrTy = DL.getIndexType(V->getType());
  return cast<ConstantInt>(ConstantInt::get(IdxPtrTy, Offset));
}
2659
/// Find dead blocks due to deleted CFG edges during inlining.
///
/// If we know the successor of the current block, \p CurrBB, has to be \p
/// NextBB, the other successors of \p CurrBB are dead if these successors have
/// no live incoming CFG edges. If one block is found to be dead, we can
/// continue growing the dead block list by checking the successors of the dead
/// blocks to see if all their incoming edges are dead or not.
void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
  auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
    // A CFG edge is dead if the predecessor is dead or the predecessor has a
    // known successor which is not the one under exam.
    return (DeadBlocks.count(Pred) ||
            (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ));
  };

  auto IsNewlyDead = [&](BasicBlock *BB) {
    // If all the edges to a block are dead, the block is also dead.
    // NOTE(review): the predecessor-iteration expression (presumably an
    // all_of over predecessors(BB)) appears to be elided in this listing.
    return (!DeadBlocks.count(BB) &&
            [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
  };

  // Seed a worklist with each newly-dead direct successor of CurrBB and grow
  // the dead-block set transitively from there.
  for (BasicBlock *Succ : successors(CurrBB)) {
    if (Succ == NextBB || !IsNewlyDead(Succ))
      continue;
    // NOTE(review): the declaration of 'NewDead' appears to be elided here.
    NewDead.push_back(Succ);
    while (!NewDead.empty()) {
      BasicBlock *Dead = NewDead.pop_back_val();
      if (DeadBlocks.insert(Dead).second)
        // Continue growing the dead block lists.
        for (BasicBlock *S : successors(Dead))
          if (IsNewlyDead(S))
            NewDead.push_back(S);
    }
  }
}
2697
/// Analyze a call site for potential inlining.
///
/// Returns true if inlining this call is viable, and false if it is not
/// viable. It computes the cost and adjusts the threshold based on numerous
/// factors and heuristics. If this method returns false but the computed cost
/// is below the computed threshold, then inlining was forcibly disabled by
/// some artifact of the routine.
InlineResult CallAnalyzer::analyze() {
  ++NumCallsAnalyzed;

  // Give the derived analyzer a chance to set up or to refuse outright.
  auto Result = onAnalysisStart();
  if (!Result.isSuccess())
    return Result;

  // A body-less callee has nothing to analyze.
  if (F.empty())
    return InlineResult::success();

  Function *Caller = CandidateCall.getFunction();
  // Check if the caller function is recursive itself.
  for (User *U : Caller->users()) {
    CallBase *Call = dyn_cast<CallBase>(U);
    if (Call && Call->getFunction() == Caller) {
      IsCallerRecursive = true;
      break;
    }
  }

  // Populate our simplified values by mapping from function arguments to call
  // arguments with known important simplifications.
  auto CAI = CandidateCall.arg_begin();
  for (Argument &FAI : F.args()) {
    assert(CAI != CandidateCall.arg_end());
    // A constant actual argument lets us fold uses of the formal argument.
    if (Constant *C = dyn_cast<Constant>(CAI))
      SimplifiedValues[&FAI] = C;

    Value *PtrArg = *CAI;
    if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
      // Remember the base pointer and the accumulated constant offset.
      ConstantOffsetPtrs[&FAI] = std::make_pair(PtrArg, C->getValue());

      // We can SROA any pointer arguments derived from alloca instructions.
      if (auto *SROAArg = dyn_cast<AllocaInst>(PtrArg)) {
        SROAArgValues[&FAI] = SROAArg;
        onInitializeSROAArg(SROAArg);
        EnabledSROAAllocas.insert(SROAArg);
      }
    }
    ++CAI;
  }
  NumConstantArgs = SimplifiedValues.size();
  NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
  NumAllocaArgs = SROAArgValues.size();

  // FIXME: If a caller has multiple calls to a callee, we end up recomputing
  // the ephemeral values multiple times (and they're completely determined by
  // the callee, so this is purely duplicate work).
  // NOTE(review): the declaration of 'EphValues' appears to be elided in this
  // listing.
  CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues);

  // The worklist of live basic blocks in the callee *after* inlining. We avoid
  // adding basic blocks of the callee which can be proven to be dead for this
  // particular call site in order to get more accurate cost estimates. This
  // requires a somewhat heavyweight iteration pattern: we need to walk the
  // basic blocks in a breadth-first order as we insert live successors. To
  // accomplish this, prioritizing for small iterations because we exit after
  // crossing our threshold, we use a small-size optimized SetVector.
  // NOTE(review): the BBSetVector typedef appears to be elided in this
  // listing.
  BBSetVector BBWorklist;
  BBWorklist.insert(&F.getEntryBlock());

  // Note that we *must not* cache the size, this loop grows the worklist.
  for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
    // Stop early once the derived analyzer says further work is pointless.
    if (shouldStop())
      break;

    BasicBlock *BB = BBWorklist[Idx];
    if (BB->empty())
      continue;

    onBlockStart(BB);

    // Disallow inlining a blockaddress with uses other than strictly callbr.
    // A blockaddress only has defined behavior for an indirect branch in the
    // same function, and we do not currently support inlining indirect
    // branches. But, the inliner may not see an indirect branch that ends up
    // being dead code at a particular call site. If the blockaddress escapes
    // the function, e.g., via a global variable, inlining may lead to an
    // invalid cross-function reference.
    // FIXME: pr/39560: continue relaxing this overt restriction.
    if (BB->hasAddressTaken())
      for (User *U : BlockAddress::get(&*BB)->users())
        if (!isa<CallBrInst>(*U))
          return InlineResult::failure("blockaddress used outside of callbr");

    // Analyze the cost of this block. If we blow through the threshold, this
    // returns false, and we can bail on out.
    InlineResult IR = analyzeBlock(BB, EphValues);
    if (!IR.isSuccess())
      return IR;

    Instruction *TI = BB->getTerminator();

    // Add in the live successors by first checking whether we have terminator
    // that may be simplified based on the values simplified by this call.
    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      if (BI->isConditional()) {
        Value *Cond = BI->getCondition();
        if (ConstantInt *SimpleCond =
                dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
          // The branch folds to a single live successor; the other edge dies.
          BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
          BBWorklist.insert(NextBB);
          KnownSuccessors[BB] = NextBB;
          findDeadBlocks(BB, NextBB);
          continue;
        }
      }
    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
      Value *Cond = SI->getCondition();
      if (ConstantInt *SimpleCond =
              dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
        // The switch folds to the matching case's successor.
        BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
        BBWorklist.insert(NextBB);
        KnownSuccessors[BB] = NextBB;
        findDeadBlocks(BB, NextBB);
        continue;
      }
    }

    // If we're unable to select a particular successor, just count all of
    // them.
    for (BasicBlock *Succ : successors(BB))
      BBWorklist.insert(Succ);

    onBlockAnalyzed(BB);
  }

  // If this is a noduplicate call, we can still inline as long as
  // inlining this would cause the removal of the caller (so the instruction
  // is not actually duplicated, just moved).
  if (!isSoleCallToLocalFunction(CandidateCall, F) && ContainsNoDuplicateCall)
    return InlineResult::failure("noduplicate");

  // If the callee's stack size exceeds the user-specified threshold,
  // do not let it be inlined.
  // The command line option overrides a limit set in the function attributes.
  size_t FinalStackSizeThreshold = StackSizeThreshold;
  if (!StackSizeThreshold.getNumOccurrences())
    if (std::optional<int> AttrMaxStackSize = getStringFnAttrAsInt(
        // NOTE(review): the attribute-name argument line appears to be elided
        // in this listing.
        FinalStackSizeThreshold = *AttrMaxStackSize;
  if (AllocatedSize > FinalStackSizeThreshold)
    return InlineResult::failure("stacksize");

  return finalizeAnalysis();
}
2852
// Print the analyzed function with per-instruction cost annotations,
// followed by one line per summary statistic.
void InlineCostCallAnalyzer::print(raw_ostream &OS) {
// Emit one "name: value" line per statistic.
#define DEBUG_PRINT_STAT(x) OS << " " #x ": " << x << "\n"
  // NOTE(review): construction of the annotation writer ('Writer') appears to
  // be elided in this listing.
  F.print(OS, &Writer);
  DEBUG_PRINT_STAT(NumConstantArgs);
  DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
  DEBUG_PRINT_STAT(NumAllocaArgs);
  DEBUG_PRINT_STAT(NumConstantPtrCmps);
  DEBUG_PRINT_STAT(NumConstantPtrDiffs);
  DEBUG_PRINT_STAT(NumInstructionsSimplified);
  DEBUG_PRINT_STAT(NumInstructions);
  DEBUG_PRINT_STAT(SROACostSavings);
  DEBUG_PRINT_STAT(SROACostSavingsLost);
  DEBUG_PRINT_STAT(LoadEliminationCost);
  DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
  // NOTE(review): an additional statistic line appears to be elided here.
  DEBUG_PRINT_STAT(Threshold);
#undef DEBUG_PRINT_STAT
}
2872
2873#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2874/// Dump stats about this call's analysis.
2875LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() { print(dbgs()); }
2876#endif
2877
/// Test that there are no attribute conflicts between Caller and Callee
/// that prevent inlining.
// NOTE(review): the function header line (return type and name of
// functionsHaveCompatibleAttributes) appears to be elided in this listing.
    Function *Caller, Function *Callee, TargetTransformInfo &TTI,
    function_ref<const TargetLibraryInfo &(Function &)> &GetTLI) {
  // Note that CalleeTLI must be a copy not a reference. The legacy pass manager
  // caches the most recently created TLI in the TargetLibraryInfoWrapperPass
  // object, and always returns the same object (which is overwritten on each
  // GetTLI call). Therefore we copy the first result.
  auto CalleeTLI = GetTLI(*Callee);
  // Compatible when TTI, TLI, and function attributes all agree; the TTI
  // check can be bypassed with -ignore-tti-inline-compatible.
  // NOTE(review): one argument line of the TLI compatibility call appears to
  // be elided in this listing.
  return (IgnoreTTIInlineCompatible ||
          TTI.areInlineCompatible(Caller, Callee)) &&
         GetTLI(*Caller).areInlineCompatible(CalleeTLI,
         AttributeFuncs::areInlineCompatible(*Caller, *Callee);
}
2894
// NOTE(review): the header line of llvm::getCallsiteCost (return type, name,
// and leading TTI/Call parameters) appears to be elided in this listing.
    const DataLayout &DL) {
  int64_t Cost = 0;
  for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) {
    if (Call.isByValArgument(I)) {
      // We approximate the number of loads and stores needed by dividing the
      // size of the byval type by the target's pointer size.
      PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
      unsigned TypeSize = DL.getTypeSizeInBits(Call.getParamByValType(I));
      unsigned AS = PTy->getAddressSpace();
      unsigned PointerSize = DL.getPointerSizeInBits(AS);
      // Ceiling division.
      unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;

      // If it generates more than 8 stores it is likely to be expanded as an
      // inline memcpy so we take that as an upper bound. Otherwise we assume
      // one load and one store per word copied.
      // FIXME: The maxStoresPerMemcpy setting from the target should be used
      // here instead of a magic number of 8, but it's not available via
      // DataLayout.
      NumStores = std::min(NumStores, 8U);

      // One load plus one store per word copied.
      Cost += 2 * NumStores * InstrCost;
    } else {
      // For non-byval arguments subtract off one instruction per call
      // argument.
      Cost += InstrCost;
    }
  }
  // The call instruction also disappears after inlining.
  Cost += InstrCost;
  Cost += TTI.getInlineCallPenalty(Call.getCaller(), Call, CallPenalty);

  // Clamp into the int range expected by callers.
  return std::min<int64_t>(Cost, INT_MAX);
}
2930
// NOTE(review): the header of this llvm::getInlineCost overload and its
// trailing GetBFI/PSI/ORE parameter lines appear to be elided in this
// listing.
    CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
    function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
    function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
  // Delegate to the main overload, taking the callee from the call site.
  return getInlineCost(Call, Call.getCalledFunction(), Params, CalleeTTI,
                       GetAssumptionCache, GetTLI, GetBFI, PSI, ORE);
}
2940
// NOTE(review): the header of llvm::getInliningCostEstimate and its trailing
// GetBFI/PSI/ORE parameter lines appear to be elided in this listing.
    CallBase &Call, TargetTransformInfo &CalleeTTI,
    function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
  // Neutral parameters: zero default threshold, no specialized thresholds,
  // full cost computation enabled so the complete cost is always produced.
  const InlineParams Params = {/* DefaultThreshold*/ 0,
                               /*HintThreshold*/ {},
                               /*ColdThreshold*/ {},
                               /*OptSizeThreshold*/ {},
                               /*OptMinSizeThreshold*/ {},
                               /*HotCallSiteThreshold*/ {},
                               /*LocallyHotCallSiteThreshold*/ {},
                               /*ColdCallSiteThreshold*/ {},
                               /*ComputeFullInlineCost*/ true,
                               /*EnableDeferral*/ true};

  // IgnoreThreshold: we want the raw cost, not an accept/reject verdict.
  InlineCostCallAnalyzer CA(*Call.getCalledFunction(), Call, Params, CalleeTTI,
                            GetAssumptionCache, GetBFI, PSI, ORE, true,
                            /*IgnoreThreshold*/ true);
  auto R = CA.analyze();
  if (!R.isSuccess())
    return std::nullopt;
  return CA.getCost();
}
2965
// Run the feature-extracting analyzer over the call site and return the
// collected feature vector, or nullopt if the analysis could not complete.
std::optional<InlineCostFeatures> llvm::getInliningCostFeatures(
    CallBase &Call, TargetTransformInfo &CalleeTTI,
    function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
    // NOTE(review): the trailing GetBFI/PSI/ORE parameter lines appear to be
    // elided in this listing.
    InlineCostFeaturesAnalyzer CFA(CalleeTTI, GetAssumptionCache, GetBFI, PSI,
                                   ORE, *Call.getCalledFunction(), Call);
  auto R = CFA.analyze();
  if (!R.isSuccess())
    return std::nullopt;
  return CFA.features();
}
2978
// NOTE(review): the header line of llvm::getAttributeBasedInliningDecision
// (return type and name) appears to be elided in this listing. The routine
// makes an attribute-only inlining decision: success/failure when attributes
// force one, or nullopt to let the cost model decide.
    CallBase &Call, Function *Callee, TargetTransformInfo &CalleeTTI,
    function_ref<const TargetLibraryInfo &(Function &)> GetTLI) {

  // Cannot inline indirect calls.
  if (!Callee)
    return InlineResult::failure("indirect call");

  // If a callee coroutine is inlined into a caller coroutine before the
  // coro-split pass runs, the coro-early pass cannot handle it well.
  // Therefore, do not inline a coroutine that has not yet been split.
  if (Callee->isPresplitCoroutine())
    return InlineResult::failure("unsplited coroutine call");

  // Never inline calls with byval arguments that does not have the alloca
  // address space. Since byval arguments can be replaced with a copy to an
  // alloca, the inlined code would need to be adjusted to handle that the
  // argument is in the alloca address space (so it is a little bit complicated
  // to solve).
  unsigned AllocaAS = Callee->getParent()->getDataLayout().getAllocaAddrSpace();
  for (unsigned I = 0, E = Call.arg_size(); I != E; ++I)
    if (Call.isByValArgument(I)) {
      PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
      if (PTy->getAddressSpace() != AllocaAS)
        return InlineResult::failure("byval arguments without alloca"
                                     " address space");
    }

  // Calls to functions with always-inline attributes should be inlined
  // whenever possible.
  if (Call.hasFnAttr(Attribute::AlwaysInline)) {
    // A noinline call-site attribute overrides the callee's always_inline.
    if (Call.getAttributes().hasFnAttr(Attribute::NoInline))
      return InlineResult::failure("noinline call site attribute");

    auto IsViable = isInlineViable(*Callee);
    if (IsViable.isSuccess())
      return InlineResult::success();
    return InlineResult::failure(IsViable.getFailureReason());
  }

  // Never inline functions with conflicting attributes (unless callee has
  // always-inline attribute).
  Function *Caller = Call.getCaller();
  if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI, GetTLI))
    return InlineResult::failure("conflicting attributes");

  // Don't inline this call if the caller has the optnone attribute.
  if (Caller->hasOptNone())
    return InlineResult::failure("optnone attribute");

  // Don't inline a function that treats null pointer as valid into a caller
  // that does not have this attribute.
  if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined())
    return InlineResult::failure("nullptr definitions incompatible");

  // Don't inline functions which can be interposed at link-time.
  if (Callee->isInterposable())
    return InlineResult::failure("interposable");

  // Don't inline functions marked noinline.
  if (Callee->hasFnAttribute(Attribute::NoInline))
    return InlineResult::failure("noinline function attribute");

  // Don't inline call sites marked noinline.
  if (Call.isNoInline())
    return InlineResult::failure("noinline call site attribute");

  // No attribute forces a decision; defer to the cost model.
  return std::nullopt;
}
3049
// NOTE(review): the header line of the main llvm::getInlineCost overload and
// its trailing GetBFI/PSI/ORE parameter lines appear to be elided in this
// listing.
    CallBase &Call, Function *Callee, const InlineParams &Params,
    TargetTransformInfo &CalleeTTI,
    function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
    function_ref<const TargetLibraryInfo &(Function &)> GetTLI,

  // Attribute-based checks (always_inline, noinline, incompatibilities) may
  // decide without running the cost model at all.
  auto UserDecision =
      llvm::getAttributeBasedInliningDecision(Call, Callee, CalleeTTI, GetTLI);

  if (UserDecision) {
    if (UserDecision->isSuccess())
      return llvm::InlineCost::getAlways("always inline attribute");
    return llvm::InlineCost::getNever(UserDecision->getFailureReason());
  }

  LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
                          << "... (caller:" << Call.getCaller()->getName()
                          << ")\n");

  // Run the full cost-model analysis over the callee body.
  InlineCostCallAnalyzer CA(*Callee, Call, Params, CalleeTTI,
                            GetAssumptionCache, GetBFI, PSI, ORE);
  InlineResult ShouldInline = CA.analyze();

  LLVM_DEBUG(CA.dump());

  // Always make cost benefit based decision explicit.
  // We use always/never here since threshold is not meaningful,
  // as it's not what drives cost-benefit analysis.
  if (CA.wasDecidedByCostBenefit()) {
    if (ShouldInline.isSuccess())
      return InlineCost::getAlways("benefit over cost",
                                   CA.getCostBenefitPair());
    else
      return InlineCost::getNever("cost over benefit", CA.getCostBenefitPair());
  }

  if (CA.wasDecidedByCostThreshold())
    return InlineCost::get(CA.getCost(), CA.getThreshold(),
                           CA.getStaticBonusApplied());

  // No details on how the decision was made, simply return always or never.
  return ShouldInline.isSuccess()
             ? InlineCost::getAlways("empty function")
             : InlineCost::getNever(ShouldInline.getFailureReason());
}
3097
// NOTE(review): the signature line of llvm::isInlineViable(Function &F)
// appears to be elided in this listing. The body scans F for constructs that
// make inlining impossible regardless of cost.
  bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
  for (BasicBlock &BB : F) {
    // Disallow inlining of functions which contain indirect branches.
    if (isa<IndirectBrInst>(BB.getTerminator()))
      return InlineResult::failure("contains indirect branches");

    // Disallow inlining of blockaddresses which are used by non-callbr
    // instructions.
    if (BB.hasAddressTaken())
      for (User *U : BlockAddress::get(&BB)->users())
        if (!isa<CallBrInst>(*U))
          return InlineResult::failure("blockaddress used outside of callbr");

    for (auto &II : BB) {
      CallBase *Call = dyn_cast<CallBase>(&II);
      if (!Call)
        continue;

      // Disallow recursive calls.
      Function *Callee = Call->getCalledFunction();
      if (&F == Callee)
        return InlineResult::failure("recursive call");

      // Disallow calls which expose returns-twice to a function not previously
      // attributed as such.
      if (!ReturnsTwice && isa<CallInst>(Call) &&
          cast<CallInst>(Call)->canReturnTwice())
        return InlineResult::failure("exposes returns-twice attribute");

      // Certain intrinsics can never be inlined through.
      if (Callee)
        switch (Callee->getIntrinsicID()) {
        default:
          break;
        case llvm::Intrinsic::icall_branch_funnel:
          // Disallow inlining of @llvm.icall.branch.funnel because current
          // backend can't separate call targets from call arguments.
          return InlineResult::failure(
              "disallowed inlining of @llvm.icall.branch.funnel");
        case llvm::Intrinsic::localescape:
          // Disallow inlining functions that call @llvm.localescape. Doing this
          // correctly would require major changes to the inliner.
          return InlineResult::failure(
              "disallowed inlining of @llvm.localescape");
        case llvm::Intrinsic::vastart:
          // Disallow inlining of functions that initialize VarArgs with
          // va_start.
          return InlineResult::failure(
              "contains VarArgs initialized with va_start");
        }
    }
  }

  return InlineResult::success();
}
3153
// APIs to create InlineParams based on command line flags and/or other
// parameters.

// NOTE(review): the signature line of this getInlineParams overload (which
// supplies the 'Threshold' used below) appears to be elided in this listing,
// as do most of the Params assignment lines described by the comments.
  InlineParams Params;

  // This field is the threshold to use for a callee by default. This is
  // derived from one or more of:
  // * optimization or size-optimization levels,
  // * a value passed to createFunctionInliningPass function, or
  // * the -inline-threshold flag.
  // If the -inline-threshold flag is explicitly specified, that is used
  // irrespective of anything else.
  if (InlineThreshold.getNumOccurrences() > 0)
  else
    Params.DefaultThreshold = Threshold;

  // Set the HintThreshold knob from the -inlinehint-threshold.

  // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.

  // If the -locally-hot-callsite-threshold is explicitly specified, use it to
  // populate LocallyHotCallSiteThreshold. Later, we populate
  // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
  // we know that optimization level is O3 (in the getInlineParams variant that
  // takes the opt and size levels).
  // FIXME: Remove this check (and make the assignment unconditional) after
  // addressing size regression issues at O2.
  if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)

  // Set the ColdCallSiteThreshold knob from the
  // -inline-cold-callsite-threshold.

  // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
  // -inlinehint-threshold commandline option is not explicitly given. If that
  // option is present, then its value applies even for callees with size and
  // minsize attributes.
  // If the -inline-threshold is not specified, set the ColdThreshold from the
  // -inlinecold-threshold even if it is not explicitly passed. If
  // -inline-threshold is specified, then -inlinecold-threshold needs to be
  // explicitly specified to set the ColdThreshold knob
  if (InlineThreshold.getNumOccurrences() == 0) {
  } else if (ColdThreshold.getNumOccurrences() > 0) {
  }
  return Params;
}
3209
3212}
3213
// Compute the default threshold for inlining based on the opt level and the
// size opt level.
static int computeThresholdFromOptLevels(unsigned OptLevel,
                                         unsigned SizeOptLevel) {
  // NOTE(review): the return expressions for the three special cases below
  // (O3-aggressive, -Os, -Oz thresholds) appear to be elided in this listing.
  if (OptLevel > 2)
  if (SizeOptLevel == 1) // -Os
  if (SizeOptLevel == 2) // -Oz
  // Otherwise fall back to the -inlinedefault-threshold value.
  return DefaultThreshold;
}
3226
// Build InlineParams from the optimization and size-optimization levels.
InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
  auto Params =
      getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
  // At O3, use the value of -locally-hot-callsite-threshold option to populate
  // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only
  // when it is specified explicitly.
  // NOTE(review): the assignment guarded by this condition appears to be
  // elided in this listing.
  if (OptLevel > 2)
  return Params;
}
3237
// NOTE(review): the signature of InlineCostAnnotationPrinterPass::run, the
// lambda's return statement, and the TTI retrieval line appear to be elided
// in this listing. The pass prints inline-cost analysis for every direct
// call to a defined function inside F.
  std::function<AssumptionCache &(Function &)> GetAssumptionCache =
      [&](Function &F) -> AssumptionCache & {
  };
  Module *M = F.getParent();
  ProfileSummaryInfo PSI(*M);
  DataLayout DL(M);
  // FIXME: Redesign the usage of InlineParams to expand the scope of this pass.
  // In the current implementation, the type of InlineParams doesn't matter as
  // the pass serves only for verification of inliner's decisions.
  // We can add a flag which determines InlineParams for this run. Right now,
  // the default InlineParams are used.
  const InlineParams Params = llvm::getInlineParams();
  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      if (CallInst *CI = dyn_cast<CallInst>(&I)) {
        Function *CalledFunction = CI->getCalledFunction();
        // Skip indirect calls and declarations; there is no body to analyze.
        if (!CalledFunction || CalledFunction->isDeclaration())
          continue;
        OptimizationRemarkEmitter ORE(CalledFunction);
        InlineCostCallAnalyzer ICCA(*CalledFunction, *CI, Params, TTI,
                                    GetAssumptionCache, nullptr, &PSI, &ORE);
        ICCA.analyze();
        OS << " Analyzing call of " << CalledFunction->getName()
           << "... (caller:" << CI->getCaller()->getName() << ")\n";
        ICCA.print(OS);
        OS << "\n";
      }
    }
  }
  // This pass only prints; it preserves all analyses.
  return PreservedAnalyses::all();
}
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static const Function * getParent(const Value *V)
SetVector< BasicBlock * > BBSetVector
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:529
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Size
static bool isColdCallSite(CallBase &CB, BlockFrequencyInfo &CallerBFI)
Return true if the block containing the call site has a BlockFrequency of less than ColdCCRelFreq% of...
Definition: GlobalOpt.cpp:1727
Hexagon Common GEP
iv users
Definition: IVUsers.cpp:48
static cl::opt< int > InlineSavingsMultiplier("inline-savings-multiplier", cl::Hidden, cl::init(8), cl::desc("Multiplier to multiply cycle savings by during inlining"))
static cl::opt< int > InlineThreshold("inline-threshold", cl::Hidden, cl::init(225), cl::desc("Control the amount of inlining to perform (default = 225)"))
static cl::opt< int > CallPenalty("inline-call-penalty", cl::Hidden, cl::init(25), cl::desc("Call penalty that is applied per callsite when inlining"))
static cl::opt< int > HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000), cl::desc("Threshold for hot callsites "))
static cl::opt< int > ColdThreshold("inlinecold-threshold", cl::Hidden, cl::init(45), cl::desc("Threshold for inlining functions with cold attribute"))
static cl::opt< size_t > RecurStackSizeThreshold("recursive-inline-max-stacksize", cl::Hidden, cl::init(InlineConstants::TotalAllocaSizeRecursiveCaller), cl::desc("Do not inline recursive functions with a stack " "size that exceeds the specified limit"))
static cl::opt< bool > PrintInstructionComments("print-instruction-comments", cl::Hidden, cl::init(false), cl::desc("Prints comments for instruction based on inline cost analysis"))
static cl::opt< int > LocallyHotCallSiteThreshold("locally-hot-callsite-threshold", cl::Hidden, cl::init(525), cl::desc("Threshold for locally hot callsites "))
static cl::opt< bool > InlineCallerSupersetNoBuiltin("inline-caller-superset-nobuiltin", cl::Hidden, cl::init(true), cl::desc("Allow inlining when caller has a superset of callee's nobuiltin " "attributes."))
static cl::opt< int > HintThreshold("inlinehint-threshold", cl::Hidden, cl::init(325), cl::desc("Threshold for inlining functions with inline hint"))
static cl::opt< size_t > StackSizeThreshold("inline-max-stacksize", cl::Hidden, cl::init(std::numeric_limits< size_t >::max()), cl::desc("Do not inline functions with a stack size " "that exceeds the specified limit"))
static int computeThresholdFromOptLevels(unsigned OptLevel, unsigned SizeOptLevel)
static cl::opt< uint64_t > HotCallSiteRelFreq("hot-callsite-rel-freq", cl::Hidden, cl::init(60), cl::desc("Minimum block frequency, expressed as a multiple of caller's " "entry frequency, for a callsite to be hot in the absence of " "profile information."))
static cl::opt< int > InlineSavingsProfitableMultiplier("inline-savings-profitable-multiplier", cl::Hidden, cl::init(4), cl::desc("A multiplier on top of cycle savings to decide whether the " "savings won't justify the cost"))
static cl::opt< int > MemAccessCost("inline-memaccess-cost", cl::Hidden, cl::init(0), cl::desc("Cost of load/store instruction when inlining"))
static cl::opt< int > ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden, cl::init(45), cl::desc("Threshold for inlining cold callsites"))
static cl::opt< bool > IgnoreTTIInlineCompatible("ignore-tti-inline-compatible", cl::Hidden, cl::init(false), cl::desc("Ignore TTI attributes compatibility check between callee/caller " "during inline cost calculation"))
static cl::opt< bool > OptComputeFullInlineCost("inline-cost-full", cl::Hidden, cl::desc("Compute the full inline cost of a call site even when the cost " "exceeds the threshold."))
#define DEBUG_PRINT_STAT(x)
static cl::opt< bool > InlineEnableCostBenefitAnalysis("inline-enable-cost-benefit-analysis", cl::Hidden, cl::init(false), cl::desc("Enable the cost-benefit analysis for the inliner"))
static cl::opt< int > InstrCost("inline-instr-cost", cl::Hidden, cl::init(5), cl::desc("Cost of a single instruction when inlining"))
static cl::opt< int > InlineSizeAllowance("inline-size-allowance", cl::Hidden, cl::init(100), cl::desc("The maximum size of a callee that get's " "inlined without sufficient cycle savings"))
static bool functionsHaveCompatibleAttributes(Function *Caller, Function *Callee, TargetTransformInfo &TTI, function_ref< const TargetLibraryInfo &(Function &)> &GetTLI)
Test that there are no attribute conflicts between Caller and Callee that prevent inlining.
#define DEBUG_TYPE
Definition: InlineCost.cpp:52
static cl::opt< int > ColdCallSiteRelFreq("cold-callsite-rel-freq", cl::Hidden, cl::init(2), cl::desc("Maximum block frequency, expressed as a percentage of caller's " "entry frequency, for a callsite to be cold in the absence of " "profile information."))
static cl::opt< bool > DisableGEPConstOperand("disable-gep-const-evaluation", cl::Hidden, cl::init(false), cl::desc("Disables evaluation of GetElementPtr with constant operands"))
static cl::opt< int > DefaultThreshold("inlinedefault-threshold", cl::Hidden, cl::init(225), cl::desc("Default amount of inlining to perform"))
static Constant * getFalse(Type *Ty)
For a boolean type or a vector of boolean type, return false or a vector with every element false.
Legalize the Machine IR a function s Machine IR
Definition: Legalizer.cpp:81
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
#define P(N)
FunctionAnalysisManager FAM
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static unsigned getFastMathFlags(const MachineInstr &I)
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:40
This pass exposes codegen information to IR-level passes.
Value * RHS
Value * LHS
Class for arbitrary precision integers.
Definition: APInt.h:76
APInt udiv(const APInt &RHS) const
Unsigned division operation.
Definition: APInt.cpp:1579
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition: APInt.h:1083
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1010
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:178
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition: APInt.h:1193
an instruction to allocate memory on the stack
Definition: Instructions.h:59
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:348
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:500
This class represents an incoming formal argument to a Function.
Definition: Argument.h:28
virtual void emitInstructionAnnot(const Instruction *, formatted_raw_ostream &)
emitInstructionAnnot - This may be implemented to emit a string right before an instruction is emitte...
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
StringRef getValueAsString() const
Return the attribute's value as a string.
Definition: Attributes.cpp:318
AttrKind
This enumeration lists the attributes that can be associated with parameters, function results,...
Definition: Attributes.h:84
bool isValid() const
Return true if the attribute is any kind of attribute.
Definition: Attributes.h:184
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
bool empty() const
Definition: BasicBlock.h:460
bool hasAddressTaken() const
Returns true if there are any uses of this basic block other than direct branches,...
Definition: BasicBlock.h:648
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:214
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:229
This class represents a no-op cast from one type to another.
static BlockAddress * get(Function *F, BasicBlock *BB)
Return a BlockAddress for the specified function and basic block.
Definition: Constants.cpp:1846
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
std::optional< uint64_t > getBlockProfileCount(const BasicBlock *BB, bool AllowSynthetic=false) const
Returns the estimated profile count of BB.
BlockFrequency getEntryFreq() const
BlockFrequency getBlockFreq(const BasicBlock *BB) const
getblockFreq - Return block frequency.
std::optional< BlockFrequency > mul(uint64_t Factor) const
Multiplies frequency with Factor. Returns nullopt in case of overflow.
Conditional or Unconditional Branch instruction.
bool isConditional() const
BasicBlock * getSuccessor(unsigned i) const
bool isUnconditional() const
Value * getCondition() const
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1457
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1696
bool paramHasAttr(unsigned ArgNo, Attribute::AttrKind Kind) const
Determine whether the argument or parameter has the given attribute.
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Definition: InstrTypes.h:1616
Attribute getFnAttr(StringRef Kind) const
Get the attribute of a given kind for the function.
Definition: InstrTypes.h:1919
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1641
User::op_iterator arg_end()
Return the iterator pointing to the end of the argument list.
Definition: InstrTypes.h:1622
FunctionType * getFunctionType() const
Definition: InstrTypes.h:1554
Function * getCaller()
Helper to get the caller (the parent function).
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition: InstrTypes.h:581
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:957
@ ICMP_NE
not equal
Definition: InstrTypes.h:989
static Constant * getSub(Constant *C1, Constant *C2, bool HasNUW=false, bool HasNSW=false)
Definition: Constants.cpp:2544
static Constant * getICmp(unsigned short pred, Constant *LHS, Constant *RHS, bool OnlyIfReduced=false)
get* - Return some common constants without having to specify the full Instruction::OPCODE identifier...
Definition: Constants.cpp:2404
This is the shared class of boolean and integer constants.
Definition: Constants.h:79
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:849
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:204
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:153
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:144
This is an important base class in LLVM.
Definition: Constant.h:41
bool isAllOnesValue() const
Return true if this is the value that would be returned by getAllOnesValue.
Definition: Constants.cpp:107
bool isNullValue() const
Return true if this is the value that would be returned by getNullValue.
Definition: Constants.cpp:90
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
bool erase(const KeyT &Val)
Definition: DenseMap.h:329
unsigned size() const
Definition: DenseMap.h:99
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:151
iterator end()
Definition: DenseMap.h:84
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:145
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
This instruction extracts a struct member or array element value from an aggregate value.
Type * getReturnType() const
Definition: DerivedTypes.h:124
Class to represent profile counts.
Definition: Function.h:277
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:973
bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition: Globals.cpp:274
Indirect Branch Instruction.
Represents the cost of inlining a function.
Definition: InlineCost.h:89
static InlineCost getNever(const char *Reason, std::optional< CostBenefitPair > CostBenefit=std::nullopt)
Definition: InlineCost.h:130
static InlineCost getAlways(const char *Reason, std::optional< CostBenefitPair > CostBenefit=std::nullopt)
Definition: InlineCost.h:125
static InlineCost get(int Cost, int Threshold, int StaticBonus=0)
Definition: InlineCost.h:119
InlineResult is basically true or false.
Definition: InlineCost.h:179
static InlineResult success()
Definition: InlineCost.h:184
static InlineResult failure(const char *Reason)
Definition: InlineCost.h:185
bool isSuccess() const
Definition: InlineCost.h:188
const char * getFailureReason() const
Definition: InlineCost.h:189
This instruction inserts a struct field of array element value into an aggregate value.
Base class for instruction visitors.
Definition: InstVisitor.h:78
RetTy visitIndirectBrInst(IndirectBrInst &I)
Definition: InstVisitor.h:235
RetTy visitCmpInst(CmpInst &I)
Definition: InstVisitor.h:262
RetTy visitCallBase(CallBase &I)
Definition: InstVisitor.h:267
RetTy visitCleanupReturnInst(CleanupReturnInst &I)
Definition: InstVisitor.h:244
RetTy visitUnreachableInst(UnreachableInst &I)
Definition: InstVisitor.h:241
RetTy visitSwitchInst(SwitchInst &I)
Definition: InstVisitor.h:232
void visit(Iterator Start, Iterator End)
Definition: InstVisitor.h:87
RetTy visitReturnInst(ReturnInst &I)
Definition: InstVisitor.h:226
RetTy visitBinaryOperator(BinaryOperator &I)
Definition: InstVisitor.h:261
RetTy visitResumeInst(ResumeInst &I)
Definition: InstVisitor.h:238
RetTy visitCatchReturnInst(CatchReturnInst &I)
Definition: InstVisitor.h:247
RetTy visitCastInst(CastInst &I)
Definition: InstVisitor.h:259
RetTy visitBranchInst(BranchInst &I)
Definition: InstVisitor.h:229
RetTy visitSelectInst(SelectInst &I)
Definition: InstVisitor.h:189
void visitInstruction(Instruction &I)
Definition: InstVisitor.h:280
unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const BasicBlock * getParent() const
Definition: Instruction.h:150
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:85
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:357
This class represents a cast from an integer to a pointer.
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
Invoke instruction.
An instruction for reading from memory.
Definition: Instructions.h:184
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition: Operator.h:41
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Class to represent pointers.
Definition: DerivedTypes.h:646
unsigned getAddressSpace() const
Return the address space of the Pointer type.
Definition: DerivedTypes.h:679
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:109
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:115
Analysis providing profile information.
This class represents a cast from a pointer to an integer.
Resume the propagation of an exception.
Return a value (possibly void), from a function.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
size_type size() const
Definition: SmallPtrSet.h:94
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:321
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:360
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:427
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
bool empty() const
Definition: SmallVector.h:94
void reserve(size_type N)
Definition: SmallVector.h:676
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
An instruction for storing to memory.
Definition: Instructions.h:317
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition: StringRef.h:466
Used to lazily calculate structure layout information for a target machine, based on the DataLayout s...
Definition: DataLayout.h:622
TypeSize getElementOffset(unsigned Idx) const
Definition: DataLayout.h:651
Class to represent struct types.
Definition: DerivedTypes.h:216
Multiway switch.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const
Returns a penalty for invoking call Call in F.
unsigned getInliningCostBenefitAnalysisProfitableMultiplier() const
unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, unsigned &JTSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) const
@ TCK_SizeAndLatency
The weighted sum of size and latency.
unsigned adjustInliningThreshold(const CallBase *CB) const
unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const
bool isLoweredToCall(const Function *F) const
Test whether calls to a function lower to actual program function calls.
unsigned getInliningThresholdMultiplier() const
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Free
Expected to fold away in lowering.
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
unsigned getInliningCostBenefitAnalysisSavingsMultiplier() const
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
InstructionCost getFPOpCost(Type *Ty) const
Return the expected cost of supporting the floating point operation of the specified type.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
This function has undefined behavior.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1074
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
bool erase(const ValueT &V)
Definition: DenseSet.h:101
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:97
formatted_raw_ostream - A raw_ostream that wraps another one and keeps track of line and column posit...
An efficient, type-erasing, non-owning reference to a callable.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
bool areInlineCompatible(const Function &Caller, const Function &Callee)
@ Cold
Attempts to make code in the caller as efficient as possible under the assumption that the call is no...
Definition: CallingConv.h:47
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
const int ColdccPenalty
Definition: InlineCost.h:51
const char FunctionInlineCostMultiplierAttributeName[]
Definition: InlineCost.h:59
const int OptSizeThreshold
Use when optsize (-Os) is specified.
Definition: InlineCost.h:38
const int OptMinSizeThreshold
Use when minsize (-Oz) is specified.
Definition: InlineCost.h:41
const int LoopPenalty
Definition: InlineCost.h:49
const uint64_t MaxSimplifiedDynamicAllocaToInline
Do not inline dynamic allocas that have been constant propagated to be static allocas above this amou...
Definition: InlineCost.h:57
const int IndirectCallThreshold
Definition: InlineCost.h:48
const int OptAggressiveThreshold
Use when -O3 is specified.
Definition: InlineCost.h:44
const char MaxInlineStackSizeAttributeName[]
Definition: InlineCost.h:62
const int LastCallToStaticBonus
Definition: InlineCost.h:50
const unsigned TotalAllocaSizeRecursiveCaller
Do not inline functions which allocate this many bytes on the stack when the caller is recursive.
Definition: InlineCost.h:54
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:76
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
@ Dead
Unused definition.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< InstrNode * > Instr
Definition: RDFGraph.h:389
@ FalseVal
Definition: TGLexer.h:59
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
@ Offset
Definition: DWP.cpp:456
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1731
Constant * ConstantFoldSelectInstruction(Constant *Cond, Constant *V1, Constant *V2)
Attempt to constant fold a select instruction with the specified operands.
bool isAssumeLikeIntrinsic(const Instruction *I)
Return true if it is an intrinsic that cannot be speculated but also cannot trap.
bool canConstantFoldCallTo(const CallBase *Call, const Function *F)
canConstantFoldCallTo - Return true if its even possible to fold a call to the specified function.
Function::ProfileCount ProfileCount
std::optional< int > getStringFnAttrAsInt(CallBase &CB, StringRef AttrKind)
Definition: InlineCost.cpp:186
auto successors(const MachineBasicBlock *BB)
Value * lowerObjectSizeCall(IntrinsicInst *ObjectSize, const DataLayout &DL, const TargetLibraryInfo *TLI, bool MustSucceed)
Try to turn a call to @llvm.objectsize into an integer value of the given Type.
std::optional< InlineCostFeatures > getInliningCostFeatures(CallBase &Call, TargetTransformInfo &CalleeTTI, function_ref< AssumptionCache &(Function &)> GetAssumptionCache, function_ref< BlockFrequencyInfo &(Function &)> GetBFI=nullptr, ProfileSummaryInfo *PSI=nullptr, OptimizationRemarkEmitter *ORE=nullptr)
Get the expanded cost features.
gep_type_iterator gep_type_end(const User *GEP)
Constant * ConstantFoldCall(const CallBase *Call, Function *F, ArrayRef< Constant * > Operands, const TargetLibraryInfo *TLI=nullptr)
ConstantFoldCall - Attempt to constant fold a call to the specified function with the specified argum...
Value * simplifyInstruction(Instruction *I, const SimplifyQuery &Q)
See if we can compute a simplified version of this instruction.
InlineResult isInlineViable(Function &Callee)
Minimal filter to detect invalid constructs for inlining.
InlineCost getInlineCost(CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI, function_ref< AssumptionCache &(Function &)> GetAssumptionCache, function_ref< const TargetLibraryInfo &(Function &)> GetTLI, function_ref< BlockFrequencyInfo &(Function &)> GetBFI=nullptr, ProfileSummaryInfo *PSI=nullptr, OptimizationRemarkEmitter *ORE=nullptr)
Get an InlineCost object representing the cost of inlining this callsite.
Value * simplifyFNegInst(Value *Op, FastMathFlags FMF, const SimplifyQuery &Q)
Given operand for an FNeg, fold the result or return null.
Constant * ConstantFoldInstOperands(Instruction *I, ArrayRef< Constant * > Ops, const DataLayout &DL, const TargetLibraryInfo *TLI=nullptr)
ConstantFoldInstOperands - Attempt to constant fold an instruction with the specified operands.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
std::enable_if_t< std::is_unsigned_v< T >, T > SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed=nullptr)
Multiply two unsigned integers, X and Y, and add the unsigned integer, A to the product.
Definition: MathExtras.h:553
std::array< int, static_cast< size_t >(InlineCostFeatureIndex::NumberOfFeatures)> InlineCostFeatures
TargetTransformInfo TTI
std::optional< InlineResult > getAttributeBasedInliningDecision(CallBase &Call, Function *Callee, TargetTransformInfo &CalleeTTI, function_ref< const TargetLibraryInfo &(Function &)> GetTLI)
Returns InlineResult::success() if the call site should be always inlined because of user directives,...
Value * simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a BinaryOperator, fold the result or return null.
InlineParams getInlineParams()
Generate the parameters to tune the inline cost analysis based only on the commandline options.
int getCallsiteCost(const TargetTransformInfo &TTI, const CallBase &Call, const DataLayout &DL)
Return the cost associated with a callsite, including parameter passing and the call/return instructi...
std::optional< int > getInliningCostEstimate(CallBase &Call, TargetTransformInfo &CalleeTTI, function_ref< AssumptionCache &(Function &)> GetAssumptionCache, function_ref< BlockFrequencyInfo &(Function &)> GetBFI=nullptr, ProfileSummaryInfo *PSI=nullptr, OptimizationRemarkEmitter *ORE=nullptr)
Get the cost estimate ignoring thresholds.
gep_type_iterator gep_type_begin(const User *GEP)
auto predecessors(const MachineBasicBlock *BB)
std::enable_if_t< std::is_unsigned_v< T >, T > SaturatingAdd(T X, T Y, bool *ResultOverflowed=nullptr)
Add two unsigned integers, X and Y, of type T.
Definition: MathExtras.h:478
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:70
PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM)
Thresholds to tune inline cost analysis.
Definition: InlineCost.h:205
std::optional< int > OptMinSizeThreshold
Threshold to use when the caller is optimized for minsize.
Definition: InlineCost.h:219
std::optional< int > OptSizeThreshold
Threshold to use when the caller is optimized for size.
Definition: InlineCost.h:216
std::optional< int > ColdCallSiteThreshold
Threshold to use when the callsite is considered cold.
Definition: InlineCost.h:229
std::optional< int > ColdThreshold
Threshold to use for cold callees.
Definition: InlineCost.h:213
std::optional< int > HotCallSiteThreshold
Threshold to use when the callsite is considered hot.
Definition: InlineCost.h:222
std::optional< bool > AllowRecursiveCall
Indicate whether we allow inlining for recursive call.
Definition: InlineCost.h:238
int DefaultThreshold
The default threshold to start with for a callee.
Definition: InlineCost.h:207
std::optional< int > HintThreshold
Threshold to use for callees with inline hint.
Definition: InlineCost.h:210
std::optional< int > LocallyHotCallSiteThreshold
Threshold to use when the callsite is considered hot relative to function entry.
Definition: InlineCost.h:226