1//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements inline cost analysis.
10//
11//===----------------------------------------------------------------------===//
12
14#include "llvm/ADT/STLExtras.h"
15#include "llvm/ADT/SetVector.h"
18#include "llvm/ADT/Statistic.h"
31#include "llvm/Config/llvm-config.h"
33#include "llvm/IR/CallingConv.h"
34#include "llvm/IR/DataLayout.h"
35#include "llvm/IR/Dominators.h"
37#include "llvm/IR/GlobalAlias.h"
38#include "llvm/IR/InstVisitor.h"
40#include "llvm/IR/Operator.h"
43#include "llvm/Support/Debug.h"
46#include <climits>
47#include <limits>
48#include <optional>
49
50using namespace llvm;
51
52#define DEBUG_TYPE "inline-cost"
53
54STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");
55
56static cl::opt<int>
57 DefaultThreshold("inlinedefault-threshold", cl::Hidden, cl::init(225),
58 cl::desc("Default amount of inlining to perform"));
59
60// We introduce this option since there is a minor compile-time win by avoiding
61// addition of TTI attributes (target-features in particular) to inline
62// candidates when they are guaranteed to be the same as top level methods in
63// some use cases. If we avoid adding the attribute, we need an option to avoid
64// checking these attributes.
66 "ignore-tti-inline-compatible", cl::Hidden, cl::init(false),
67 cl::desc("Ignore TTI attributes compatibility check between callee/caller "
68 "during inline cost calculation"));
69
71 "print-instruction-comments", cl::Hidden, cl::init(false),
72 cl::desc("Prints comments for instruction based on inline cost analysis"));
73
75 "inline-threshold", cl::Hidden, cl::init(225),
76 cl::desc("Control the amount of inlining to perform (default = 225)"));
77
79 "inlinehint-threshold", cl::Hidden, cl::init(325),
80 cl::desc("Threshold for inlining functions with inline hint"));
81
82static cl::opt<int>
83 ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
84 cl::init(45),
85 cl::desc("Threshold for inlining cold callsites"));
86
88 "inline-enable-cost-benefit-analysis", cl::Hidden, cl::init(false),
89 cl::desc("Enable the cost-benefit analysis for the inliner"));
90
91// InlineSavingsMultiplier overrides per TTI multipliers iff it is
92// specified explicitly in command line options. This option is exposed
93// for tuning and testing.
95 "inline-savings-multiplier", cl::Hidden, cl::init(8),
96 cl::desc("Multiplier to multiply cycle savings by during inlining"));
97
98// InlineSavingsProfitableMultiplier overrides per TTI multipliers iff it is
99// specified explicitly in command line options. This option is exposed
100// for tuning and testing.
102 "inline-savings-profitable-multiplier", cl::Hidden, cl::init(4),
103 cl::desc("A multiplier on top of cycle savings to decide whether the "
104 "savings won't justify the cost"));
105
106static cl::opt<int>
107 InlineSizeAllowance("inline-size-allowance", cl::Hidden, cl::init(100),
108 cl::desc("The maximum size of a callee that get's "
109 "inlined without sufficient cycle savings"));
110
111// We introduce this threshold to help performance of instrumentation based
112// PGO before we actually hook up inliner with analysis passes such as BPI and
113// BFI.
115 "inlinecold-threshold", cl::Hidden, cl::init(45),
116 cl::desc("Threshold for inlining functions with cold attribute"));
117
118static cl::opt<int>
119 HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
120 cl::desc("Threshold for hot callsites "));
121
123 "locally-hot-callsite-threshold", cl::Hidden, cl::init(525),
124 cl::desc("Threshold for locally hot callsites "));
125
127 "cold-callsite-rel-freq", cl::Hidden, cl::init(2),
128 cl::desc("Maximum block frequency, expressed as a percentage of caller's "
129 "entry frequency, for a callsite to be cold in the absence of "
130 "profile information."));
131
133 "hot-callsite-rel-freq", cl::Hidden, cl::init(60),
134 cl::desc("Minimum block frequency, expressed as a multiple of caller's "
135 "entry frequency, for a callsite to be hot in the absence of "
136 "profile information."));
137
138static cl::opt<int>
139 InstrCost("inline-instr-cost", cl::Hidden, cl::init(5),
140 cl::desc("Cost of a single instruction when inlining"));
141
142static cl::opt<int>
143 MemAccessCost("inline-memaccess-cost", cl::Hidden, cl::init(0),
144 cl::desc("Cost of load/store instruction when inlining"));
145
147 "inline-call-penalty", cl::Hidden, cl::init(25),
148 cl::desc("Call penalty that is applied per callsite when inlining"));
149
150static cl::opt<size_t>
151 StackSizeThreshold("inline-max-stacksize", cl::Hidden,
152 cl::init(std::numeric_limits<size_t>::max()),
153 cl::desc("Do not inline functions with a stack size "
154 "that exceeds the specified limit"));
155
157 "recursive-inline-max-stacksize", cl::Hidden,
159 cl::desc("Do not inline recursive functions with a stack "
160 "size that exceeds the specified limit"));
161
163 "inline-cost-full", cl::Hidden,
164 cl::desc("Compute the full inline cost of a call site even when the cost "
165 "exceeds the threshold."));
166
168 "inline-caller-superset-nobuiltin", cl::Hidden, cl::init(true),
169 cl::desc("Allow inlining when caller has a superset of callee's nobuiltin "
170 "attributes."));
171
173 "disable-gep-const-evaluation", cl::Hidden, cl::init(false),
174 cl::desc("Disables evaluation of GetElementPtr with constant operands"));
175
176namespace llvm {
177std::optional<int> getStringFnAttrAsInt(const Attribute &Attr) {
178 if (Attr.isValid()) {
179 int AttrValue = 0;
180 if (!Attr.getValueAsString().getAsInteger(10, AttrValue))
181 return AttrValue;
182 }
183 return std::nullopt;
184}
185
186std::optional<int> getStringFnAttrAsInt(CallBase &CB, StringRef AttrKind) {
187 return getStringFnAttrAsInt(CB.getFnAttr(AttrKind));
188}
189
190std::optional<int> getStringFnAttrAsInt(Function *F, StringRef AttrKind) {
191 return getStringFnAttrAsInt(F->getFnAttribute(AttrKind));
192}
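// For example, a call-site string attribute "function-inline-cost"="25" (used
// later in this file to override the computed cost for testing) parses to 25,
// while a missing or non-integer attribute yields std::nullopt.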
193
194namespace InlineConstants {
195int getInstrCost() { return InstrCost; }
196
197} // namespace InlineConstants
198
199} // namespace llvm
200
201namespace {
202class InlineCostCallAnalyzer;
203
204// This struct is used to store information about inline cost of a
205// particular instruction
206struct InstructionCostDetail {
207 int CostBefore = 0;
208 int CostAfter = 0;
209 int ThresholdBefore = 0;
210 int ThresholdAfter = 0;
211
212 int getThresholdDelta() const { return ThresholdAfter - ThresholdBefore; }
213
214 int getCostDelta() const { return CostAfter - CostBefore; }
215
216 bool hasThresholdChanged() const { return ThresholdAfter != ThresholdBefore; }
217};
218
219class InlineCostAnnotationWriter : public AssemblyAnnotationWriter {
220private:
221 InlineCostCallAnalyzer *const ICCA;
222
223public:
224 InlineCostAnnotationWriter(InlineCostCallAnalyzer *ICCA) : ICCA(ICCA) {}
225 virtual void emitInstructionAnnot(const Instruction *I,
226 formatted_raw_ostream &OS) override;
227};
228
229/// Carry out call site analysis, in order to evaluate inlinability.
230/// NOTE: the type is currently used as implementation detail of functions such
231/// as llvm::getInlineCost. Note the function_ref constructor parameters - the
232/// expectation is that they come from the outer scope, from the wrapper
233/// functions. If we want to support constructing CallAnalyzer objects where
234/// lambdas are provided inline at construction, or where the object needs to
235/// otherwise survive past the scope of the provided functions, we need to
236/// revisit the argument types.
237class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
238 typedef InstVisitor<CallAnalyzer, bool> Base;
239 friend class InstVisitor<CallAnalyzer, bool>;
240
241protected:
242 virtual ~CallAnalyzer() = default;
243 /// The TargetTransformInfo available for this compilation.
244 const TargetTransformInfo &TTI;
245
246 /// Getter for the cache of @llvm.assume intrinsics.
247 function_ref<AssumptionCache &(Function &)> GetAssumptionCache;
248
249 /// Getter for BlockFrequencyInfo
250 function_ref<BlockFrequencyInfo &(Function &)> GetBFI;
251
252 /// Profile summary information.
253 ProfileSummaryInfo *PSI;
254
255 /// The called function.
256 Function &F;
257
258 // Cache the DataLayout since we use it a lot.
259 const DataLayout &DL;
260
261 /// The OptimizationRemarkEmitter available for this compilation.
262 OptimizationRemarkEmitter *ORE;
263
264 /// The candidate callsite being analyzed. Please do not use this to do
265 /// analysis in the caller function; we want the inline cost query to be
266 /// easily cacheable. Instead, use the cover function paramHasAttr.
267 CallBase &CandidateCall;
268
269 /// Extension points for handling callsite features.
270 // Called before a basic block was analyzed.
271 virtual void onBlockStart(const BasicBlock *BB) {}
272
273 /// Called after a basic block was analyzed.
274 virtual void onBlockAnalyzed(const BasicBlock *BB) {}
275
276 /// Called before an instruction was analyzed
277 virtual void onInstructionAnalysisStart(const Instruction *I) {}
278
279 /// Called after an instruction was analyzed
280 virtual void onInstructionAnalysisFinish(const Instruction *I) {}
281
282 /// Called at the end of the analysis of the callsite. Return the outcome of
283 /// the analysis, i.e. 'InlineResult(true)' if the inlining may happen, or
284 /// the reason it can't.
285 virtual InlineResult finalizeAnalysis() { return InlineResult::success(); }
286 /// Called when we're about to start processing a basic block, and every time
287 /// we are done processing an instruction. Return true if there is no point in
288 /// continuing the analysis (e.g. we've determined already the call site is
289 /// too expensive to inline)
290 virtual bool shouldStop() { return false; }
291
292 /// Called before the analysis of the callee body starts (with callsite
293 /// contexts propagated). It checks callsite-specific information. Return a
294 /// reason analysis can't continue if that's the case, or 'true' if it may
295 /// continue.
296 virtual InlineResult onAnalysisStart() { return InlineResult::success(); }
297 /// Called if the analysis engine decides SROA cannot be done for the given
298 /// alloca.
299 virtual void onDisableSROA(AllocaInst *Arg) {}
300
301 /// Called when the analysis engine determines load elimination won't happen.
302 virtual void onDisableLoadElimination() {}
303
304 /// Called when we visit a CallBase, before the analysis starts. Return false
305 /// to stop further processing of the instruction.
306 virtual bool onCallBaseVisitStart(CallBase &Call) { return true; }
307
308 /// Called to account for a call.
309 virtual void onCallPenalty() {}
310
311 /// Called to account for a load or store.
312 virtual void onMemAccess() {}
313
314 /// Called to account for the expectation the inlining would result in a load
315 /// elimination.
316 virtual void onLoadEliminationOpportunity() {}
317
318 /// Called to account for the cost of argument setup for the Call in the
319 /// callee's body (not the callsite currently under analysis).
320 virtual void onCallArgumentSetup(const CallBase &Call) {}
321
322 /// Called to account for a load relative intrinsic.
323 virtual void onLoadRelativeIntrinsic() {}
324
325 /// Called to account for a lowered call.
326 virtual void onLoweredCall(Function *F, CallBase &Call, bool IsIndirectCall) {
327 }
328
329 /// Account for a jump table of given size. Return false to stop further
330 /// processing the switch instruction
331 virtual bool onJumpTable(unsigned JumpTableSize) { return true; }
332
333 /// Account for a case cluster of given size. Return false to stop further
334 /// processing of the instruction.
335 virtual bool onCaseCluster(unsigned NumCaseCluster) { return true; }
336
337 /// Called at the end of processing a switch instruction, with the given
338 /// number of case clusters.
339 virtual void onFinalizeSwitch(unsigned JumpTableSize,
340 unsigned NumCaseCluster) {}
341
342 /// Called to account for any other instruction not specifically accounted
343 /// for.
344 virtual void onMissedSimplification() {}
345
346 /// Start accounting potential benefits due to SROA for the given alloca.
347 virtual void onInitializeSROAArg(AllocaInst *Arg) {}
348
349 /// Account SROA savings for the AllocaInst value.
350 virtual void onAggregateSROAUse(AllocaInst *V) {}
351
352 bool handleSROA(Value *V, bool DoNotDisable) {
353 // Check for SROA candidates in comparisons.
354 if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
355 if (DoNotDisable) {
356 onAggregateSROAUse(SROAArg);
357 return true;
358 }
359 disableSROAForArg(SROAArg);
360 }
361 return false;
362 }
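// A use of an SROA candidate that stays foldable (DoNotDisable == true, e.g. a
// comparison that simplifies away) credits extra savings via
// onAggregateSROAUse; any other use permanently disables SROA for that alloca.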
363
364 bool IsCallerRecursive = false;
365 bool IsRecursiveCall = false;
366 bool ExposesReturnsTwice = false;
367 bool HasDynamicAlloca = false;
368 bool ContainsNoDuplicateCall = false;
369 bool HasReturn = false;
370 bool HasIndirectBr = false;
371 bool HasUninlineableIntrinsic = false;
372 bool InitsVargArgs = false;
373
374 /// Number of bytes allocated statically by the callee.
375 uint64_t AllocatedSize = 0;
376 unsigned NumInstructions = 0;
377 unsigned NumVectorInstructions = 0;
378
379 /// While we walk the potentially-inlined instructions, we build up and
380 /// maintain a mapping of simplified values specific to this callsite. The
381 /// idea is to propagate any special information we have about arguments to
382 /// this call through the inlinable section of the function, and account for
383 /// likely simplifications post-inlining. The most important aspect we track
384 /// is CFG altering simplifications -- when we prove a basic block dead, that
385 /// can cause dramatic shifts in the cost of inlining a function.
386 DenseMap<Value *, Constant *> SimplifiedValues;
387
388 /// Keep track of the values which map back (through function arguments) to
389 /// allocas on the caller stack which could be simplified through SROA.
390 DenseMap<Value *, AllocaInst *> SROAArgValues;
391
392 /// Keep track of Allocas for which we believe we may get SROA optimization.
393 DenseSet<AllocaInst *> EnabledSROAAllocas;
394
395 /// Keep track of values which map to a pointer base and constant offset.
396 DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;
397
398 /// Keep track of dead blocks due to the constant arguments.
399 DenseSet<BasicBlock *> DeadBlocks;
400
401 /// The mapping of the blocks to their known unique successors due to the
402 /// constant arguments.
403 DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;
404
405 /// Model the elimination of repeated loads that is expected to happen
406 /// whenever we simplify away the stores that would otherwise cause them to be
407 /// loads.
408 bool EnableLoadElimination = true;
409
410 /// Whether we allow inlining for recursive call.
411 bool AllowRecursiveCall = false;
412
413 SmallPtrSet<Value *, 16> LoadAddrSet;
414
415 AllocaInst *getSROAArgForValueOrNull(Value *V) const {
416 auto It = SROAArgValues.find(V);
417 if (It == SROAArgValues.end() || EnabledSROAAllocas.count(It->second) == 0)
418 return nullptr;
419 return It->second;
420 }
421
422 // Custom simplification helper routines.
423 bool isAllocaDerivedArg(Value *V);
424 void disableSROAForArg(AllocaInst *SROAArg);
425 void disableSROA(Value *V);
426 void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
427 void disableLoadElimination();
428 bool isGEPFree(GetElementPtrInst &GEP);
429 bool canFoldInboundsGEP(GetElementPtrInst &I);
430 bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
431 bool simplifyCallSite(Function *F, CallBase &Call);
432 bool simplifyInstruction(Instruction &I);
433 bool simplifyIntrinsicCallIsConstant(CallBase &CB);
434 bool simplifyIntrinsicCallObjectSize(CallBase &CB);
435 ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
436
437 /// Return true if the given argument to the function being considered for
438 /// inlining has the given attribute set either at the call site or the
439 /// function declaration. Primarily used to inspect call site specific
440 /// attributes since these can be more precise than the ones on the callee
441 /// itself.
442 bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);
443
444 /// Return true if the given value is known non null within the callee if
445 /// inlined through this particular callsite.
446 bool isKnownNonNullInCallee(Value *V);
447
448 /// Return true if size growth is allowed when inlining the callee at \p Call.
449 bool allowSizeGrowth(CallBase &Call);
450
451 // Custom analysis routines.
452 InlineResult analyzeBlock(BasicBlock *BB,
453 SmallPtrSetImpl<const Value *> &EphValues);
454
455 // Disable several entry points to the visitor so we don't accidentally use
456 // them by declaring but not defining them here.
457 void visit(Module *);
458 void visit(Module &);
459 void visit(Function *);
460 void visit(Function &);
461 void visit(BasicBlock *);
462 void visit(BasicBlock &);
463
464 // Provide base case for our instruction visit.
465 bool visitInstruction(Instruction &I);
466
467 // Our visit overrides.
468 bool visitAlloca(AllocaInst &I);
469 bool visitPHI(PHINode &I);
470 bool visitGetElementPtr(GetElementPtrInst &I);
471 bool visitBitCast(BitCastInst &I);
472 bool visitPtrToInt(PtrToIntInst &I);
473 bool visitIntToPtr(IntToPtrInst &I);
474 bool visitCastInst(CastInst &I);
475 bool visitCmpInst(CmpInst &I);
476 bool visitSub(BinaryOperator &I);
477 bool visitBinaryOperator(BinaryOperator &I);
478 bool visitFNeg(UnaryOperator &I);
479 bool visitLoad(LoadInst &I);
480 bool visitStore(StoreInst &I);
481 bool visitExtractValue(ExtractValueInst &I);
482 bool visitInsertValue(InsertValueInst &I);
483 bool visitCallBase(CallBase &Call);
484 bool visitReturnInst(ReturnInst &RI);
485 bool visitBranchInst(BranchInst &BI);
486 bool visitSelectInst(SelectInst &SI);
487 bool visitSwitchInst(SwitchInst &SI);
488 bool visitIndirectBrInst(IndirectBrInst &IBI);
489 bool visitResumeInst(ResumeInst &RI);
490 bool visitCleanupReturnInst(CleanupReturnInst &RI);
491 bool visitCatchReturnInst(CatchReturnInst &RI);
492 bool visitUnreachableInst(UnreachableInst &I);
493
494public:
495 CallAnalyzer(Function &Callee, CallBase &Call, const TargetTransformInfo &TTI,
496 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
497 function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
498 ProfileSummaryInfo *PSI = nullptr,
499 OptimizationRemarkEmitter *ORE = nullptr)
500 : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
501 PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
502 CandidateCall(Call) {}
503
504 InlineResult analyze();
505
506 std::optional<Constant *> getSimplifiedValue(Instruction *I) {
507 if (SimplifiedValues.contains(I))
508 return SimplifiedValues[I];
509 return std::nullopt;
510 }
511
512 // Keep a bunch of stats about the cost savings found so we can print them
513 // out when debugging.
514 unsigned NumConstantArgs = 0;
515 unsigned NumConstantOffsetPtrArgs = 0;
516 unsigned NumAllocaArgs = 0;
517 unsigned NumConstantPtrCmps = 0;
518 unsigned NumConstantPtrDiffs = 0;
519 unsigned NumInstructionsSimplified = 0;
520
521 void dump();
522};
523
524// Considering forming a binary search, we should find the number of nodes
525// which is same as the number of comparisons when lowered. For a given
526// number of clusters, n, we can define a recursive function, f(n), to find
527// the number of nodes in the tree. The recursion is :
528// f(n) = 1 + f(n/2) + f (n - n/2), when n > 3,
529// and f(n) = n, when n <= 3.
530 // This will lead to a binary tree where the leaf should be either f(2) or f(3)
531// when n > 3. So, the number of comparisons from leaves should be n, while
532// the number of non-leaf should be :
533// 2^(log2(n) - 1) - 1
534// = 2^log2(n) * 2^-1 - 1
535// = n / 2 - 1.
536// Considering comparisons from leaf and non-leaf nodes, we can estimate the
537// number of comparisons in a simple closed form :
538// n + n / 2 - 1 = n * 3 / 2 - 1
539int64_t getExpectedNumberOfCompare(int NumCaseCluster) {
540 return 3 * static_cast<int64_t>(NumCaseCluster) / 2 - 1;
541}
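// Worked example: for 8 case clusters, f(8) = 1 + 2 * f(4) = 1 + 2 * (1 + 2 * f(2))
// = 1 + 2 * 5 = 11, which matches the closed form 3 * 8 / 2 - 1 = 11 comparisons.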
542
543/// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note
544/// the FIXME in onLoweredCall, when instantiating an InlineCostCallAnalyzer
545class InlineCostCallAnalyzer final : public CallAnalyzer {
546 const bool ComputeFullInlineCost;
547 int LoadEliminationCost = 0;
548 /// Bonus to be applied when percentage of vector instructions in callee is
549 /// high (see more details in updateThreshold).
550 int VectorBonus = 0;
551 /// Bonus to be applied when the callee has only one reachable basic block.
552 int SingleBBBonus = 0;
553
554 /// Tunable parameters that control the analysis.
555 const InlineParams &Params;
556
557 // This DenseMap stores the delta change in cost and threshold after
558 // accounting for the given instruction. The map is filled only with the
559 // flag PrintInstructionComments on.
560 DenseMap<const Instruction *, InstructionCostDetail> InstructionCostDetailMap;
561
562 /// Upper bound for the inlining cost. Bonuses are being applied to account
563 /// for speculative "expected profit" of the inlining decision.
564 int Threshold = 0;
565
566 /// The amount of StaticBonus applied.
567 int StaticBonusApplied = 0;
568
569 /// Attempt to evaluate indirect calls to boost its inline cost.
570 const bool BoostIndirectCalls;
571
572 /// Ignore the threshold when finalizing analysis.
573 const bool IgnoreThreshold;
574
575 // True if the cost-benefit-analysis-based inliner is enabled.
576 const bool CostBenefitAnalysisEnabled;
577
578 /// Inlining cost measured in abstract units, accounts for all the
579 /// instructions expected to be executed for a given function invocation.
580 /// Instructions that are statically proven to be dead based on call-site
581 /// arguments are not counted here.
582 int Cost = 0;
583
584 // The cumulative cost at the beginning of the basic block being analyzed. At
585 // the end of analyzing each basic block, "Cost - CostAtBBStart" represents
586 // the size of that basic block.
587 int CostAtBBStart = 0;
588
589 // The static size of live but cold basic blocks. This is "static" in the
590 // sense that it's not weighted by profile counts at all.
591 int ColdSize = 0;
592
593 // Whether inlining is decided by cost-threshold analysis.
594 bool DecidedByCostThreshold = false;
595
596 // Whether inlining is decided by cost-benefit analysis.
597 bool DecidedByCostBenefit = false;
598
599 // The cost-benefit pair computed by cost-benefit analysis.
600 std::optional<CostBenefitPair> CostBenefit;
601
602 bool SingleBB = true;
603
604 unsigned SROACostSavings = 0;
605 unsigned SROACostSavingsLost = 0;
606
607 /// The mapping of caller Alloca values to their accumulated cost savings. If
608 /// we have to disable SROA for one of the allocas, this tells us how much
609 /// cost must be added.
610 DenseMap<AllocaInst *, int> SROAArgCosts;
611
612 /// Return true if \p Call is a cold callsite.
613 bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);
614
615 /// Update Threshold based on callsite properties such as callee
616 /// attributes and callee hotness for PGO builds. The Callee is explicitly
617 /// passed to support analyzing indirect calls whose target is inferred by
618 /// analysis.
619 void updateThreshold(CallBase &Call, Function &Callee);
620 /// Return a higher threshold if \p Call is a hot callsite.
621 std::optional<int> getHotCallSiteThreshold(CallBase &Call,
622 BlockFrequencyInfo *CallerBFI);
623
624 /// Handle a capped 'int' increment for Cost.
625 void addCost(int64_t Inc) {
626 Inc = std::clamp<int64_t>(Inc, INT_MIN, INT_MAX);
627 Cost = std::clamp<int64_t>(Inc + Cost, INT_MIN, INT_MAX);
628 }
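// Both the increment and the running total saturate at INT_MIN/INT_MAX, so
// e.g. two successive addCost(INT_MAX) calls leave Cost pinned at INT_MAX
// rather than wrapping around to a negative value.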
629
630 void onDisableSROA(AllocaInst *Arg) override {
631 auto CostIt = SROAArgCosts.find(Arg);
632 if (CostIt == SROAArgCosts.end())
633 return;
634 addCost(CostIt->second);
635 SROACostSavings -= CostIt->second;
636 SROACostSavingsLost += CostIt->second;
637 SROAArgCosts.erase(CostIt);
638 }
639
640 void onDisableLoadElimination() override {
641 addCost(LoadEliminationCost);
642 LoadEliminationCost = 0;
643 }
644
645 bool onCallBaseVisitStart(CallBase &Call) override {
646 if (std::optional<int> AttrCallThresholdBonus =
647 getStringFnAttrAsInt(Call, "call-threshold-bonus"))
648 Threshold += *AttrCallThresholdBonus;
649
650 if (std::optional<int> AttrCallCost =
651 getStringFnAttrAsInt(Call, "call-inline-cost")) {
652 addCost(*AttrCallCost);
653 // Prevent further processing of the call since we want to override its
654 // inline cost, not just add to it.
655 return false;
656 }
657 return true;
658 }
659
660 void onCallPenalty() override { addCost(CallPenalty); }
661
662 void onMemAccess() override { addCost(MemAccessCost); }
663
664 void onCallArgumentSetup(const CallBase &Call) override {
665 // Pay the price of the argument setup. We account for the average 1
666 // instruction per call argument setup here.
667 addCost(Call.arg_size() * InstrCost);
668 }
669 void onLoadRelativeIntrinsic() override {
670 // This is normally lowered to 4 LLVM instructions.
671 addCost(3 * InstrCost);
672 }
673 void onLoweredCall(Function *F, CallBase &Call,
674 bool IsIndirectCall) override {
675 // We account for the average 1 instruction per call argument setup here.
676 addCost(Call.arg_size() * InstrCost);
677
678 // If we have a constant that we are calling as a function, we can peer
679 // through it and see the function target. This happens not infrequently
680 // during devirtualization and so we want to give it a hefty bonus for
681 // inlining, but cap that bonus in the event that inlining wouldn't pan out.
682 // Pretend to inline the function, with a custom threshold.
683 if (IsIndirectCall && BoostIndirectCalls) {
684 auto IndirectCallParams = Params;
685 IndirectCallParams.DefaultThreshold =
686 InlineConstants::IndirectCallThreshold;
687 /// FIXME: if InlineCostCallAnalyzer is derived from, this may need
688 /// to instantiate the derived class.
689 InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
690 GetAssumptionCache, GetBFI, PSI, ORE, false);
691 if (CA.analyze().isSuccess()) {
692 // We were able to inline the indirect call! Subtract the cost from the
693 // threshold to get the bonus we want to apply, but don't go below zero.
694 Cost -= std::max(0, CA.getThreshold() - CA.getCost());
695 }
696 } else
697 // Otherwise simply add the cost for merely making the call.
698 addCost(CallPenalty);
699 }
700
701 void onFinalizeSwitch(unsigned JumpTableSize,
702 unsigned NumCaseCluster) override {
703 // If suitable for a jump table, consider the cost for the table size and
704 // branch to destination.
705 // Maximum valid cost increased in this function.
706 if (JumpTableSize) {
707 int64_t JTCost =
708 static_cast<int64_t>(JumpTableSize) * InstrCost + 4 * InstrCost;
709
710 addCost(JTCost);
711 return;
712 }
713
714 if (NumCaseCluster <= 3) {
715 // Suppose a comparison includes one compare and one conditional branch.
716 addCost(NumCaseCluster * 2 * InstrCost);
717 return;
718 }
719
720 int64_t ExpectedNumberOfCompare =
721 getExpectedNumberOfCompare(NumCaseCluster);
722 int64_t SwitchCost = ExpectedNumberOfCompare * 2 * InstrCost;
723
724 addCost(SwitchCost);
725 }
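// To illustrate onFinalizeSwitch with the default -inline-instr-cost=5: a
// 100-entry jump table is charged 100 * 5 + 4 * 5 = 520 units, while a
// 10-cluster switch lowered to a compare tree is charged
// (3 * 10 / 2 - 1) * 2 * 5 = 140 units.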
726 void onMissedSimplification() override { addCost(InstrCost); }
727
728 void onInitializeSROAArg(AllocaInst *Arg) override {
729 assert(Arg != nullptr &&
730 "Should not initialize SROA costs for null value.");
731 auto SROAArgCost = TTI.getCallerAllocaCost(&CandidateCall, Arg);
732 SROACostSavings += SROAArgCost;
733 SROAArgCosts[Arg] = SROAArgCost;
734 }
735
736 void onAggregateSROAUse(AllocaInst *SROAArg) override {
737 auto CostIt = SROAArgCosts.find(SROAArg);
738 assert(CostIt != SROAArgCosts.end() &&
739 "expected this argument to have a cost");
740 CostIt->second += InstrCost;
741 SROACostSavings += InstrCost;
742 }
743
744 void onBlockStart(const BasicBlock *BB) override { CostAtBBStart = Cost; }
745
746 void onBlockAnalyzed(const BasicBlock *BB) override {
747 if (CostBenefitAnalysisEnabled) {
748 // Keep track of the static size of live but cold basic blocks. For now,
749 // we define a cold basic block to be one that's never executed.
750 assert(GetBFI && "GetBFI must be available");
751 BlockFrequencyInfo *BFI = &(GetBFI(F));
752 assert(BFI && "BFI must be available");
753 auto ProfileCount = BFI->getBlockProfileCount(BB);
754 if (*ProfileCount == 0)
755 ColdSize += Cost - CostAtBBStart;
756 }
757
758 auto *TI = BB->getTerminator();
759 // If we had any successors at this point, then post-inlining is likely to
760 // have them as well. Note that we assume any basic blocks which existed
761 // due to branches or switches which folded above will also fold after
762 // inlining.
763 if (SingleBB && TI->getNumSuccessors() > 1) {
764 // Take off the bonus we applied to the threshold.
765 Threshold -= SingleBBBonus;
766 SingleBB = false;
767 }
768 }
769
770 void onInstructionAnalysisStart(const Instruction *I) override {
771 // This function is called to store the initial cost of inlining before
772 // the given instruction was assessed.
773 if (!PrintInstructionComments)
774 return;
775 InstructionCostDetailMap[I].CostBefore = Cost;
776 InstructionCostDetailMap[I].ThresholdBefore = Threshold;
777 }
778
779 void onInstructionAnalysisFinish(const Instruction *I) override {
780 // This function is called to find new values of cost and threshold after
781 // the instruction has been assessed.
782 if (!PrintInstructionComments)
783 return;
784 InstructionCostDetailMap[I].CostAfter = Cost;
785 InstructionCostDetailMap[I].ThresholdAfter = Threshold;
786 }
787
788 bool isCostBenefitAnalysisEnabled() {
789 if (!PSI || !PSI->hasProfileSummary())
790 return false;
791
792 if (!GetBFI)
793 return false;
794
795 if (InlineEnableCostBenefitAnalysis.getNumOccurrences()) {
796 // Honor the explicit request from the user.
797 if (!InlineEnableCostBenefitAnalysis)
798 return false;
799 } else {
800 // Otherwise, require instrumentation profile.
801 if (!(PSI->hasInstrumentationProfile() || PSI->hasSampleProfile()))
802 return false;
803 }
804
805 auto *Caller = CandidateCall.getParent()->getParent();
806 if (!Caller->getEntryCount())
807 return false;
808
809 BlockFrequencyInfo *CallerBFI = &(GetBFI(*Caller));
810 if (!CallerBFI)
811 return false;
812
813 // For now, limit to hot call site.
814 if (!PSI->isHotCallSite(CandidateCall, CallerBFI))
815 return false;
816
817 // Make sure we have a nonzero entry count.
818 auto EntryCount = F.getEntryCount();
819 if (!EntryCount || !EntryCount->getCount())
820 return false;
821
822 BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
823 if (!CalleeBFI)
824 return false;
825
826 return true;
827 }
828
829 // A helper function to choose between command line override and default.
830 unsigned getInliningCostBenefitAnalysisSavingsMultiplier() const {
831 if (InlineSavingsMultiplier.getNumOccurrences())
832 return InlineSavingsMultiplier;
833 return TTI.getInliningCostBenefitAnalysisSavingsMultiplier();
834 }
835
836 // A helper function to choose between command line override and default.
837 unsigned getInliningCostBenefitAnalysisProfitableMultiplier() const {
838 if (InlineSavingsProfitableMultiplier.getNumOccurrences())
839 return InlineSavingsProfitableMultiplier;
840 return TTI.getInliningCostBenefitAnalysisProfitableMultiplier();
841 }
842
843 void OverrideCycleSavingsAndSizeForTesting(APInt &CycleSavings, int &Size) {
844 if (std::optional<int> AttrCycleSavings = getStringFnAttrAsInt(
845 CandidateCall, "inline-cycle-savings-for-test")) {
846 CycleSavings = *AttrCycleSavings;
847 }
848
849 if (std::optional<int> AttrRuntimeCost = getStringFnAttrAsInt(
850 CandidateCall, "inline-runtime-cost-for-test")) {
851 Size = *AttrRuntimeCost;
852 }
853 }
854
855 // Determine whether we should inline the given call site, taking into account
856 // both the size cost and the cycle savings. Return std::nullopt if we don't
857 // have sufficient profiling information to determine.
858 std::optional<bool> costBenefitAnalysis() {
859 if (!CostBenefitAnalysisEnabled)
860 return std::nullopt;
861
862 // buildInlinerPipeline in the pass builder sets HotCallSiteThreshold to 0
863 // for the prelink phase of the AutoFDO + ThinLTO build. Honor the logic by
864 // falling back to the cost-based metric.
865 // TODO: Improve this hacky condition.
866 if (Threshold == 0)
867 return std::nullopt;
868
869 assert(GetBFI);
870 BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
871 assert(CalleeBFI);
872
873 // The cycle savings expressed as the sum of InstrCost
874 // multiplied by the estimated dynamic count of each instruction we can
875 // avoid. Savings come from the call site cost, such as argument setup and
876 // the call instruction, as well as the instructions that are folded.
877 //
878 // We use 128-bit APInt here to avoid potential overflow. This variable
879 // should stay well below 10^^24 (or 2^^80) in practice. This "worst" case
880 // assumes that we can avoid or fold a billion instructions, each with a
881 // profile count of 10^^15 -- roughly the number of cycles for a 24-hour
882 // period on a 4GHz machine.
883 APInt CycleSavings(128, 0);
884
885 for (auto &BB : F) {
886 APInt CurrentSavings(128, 0);
887 for (auto &I : BB) {
888 if (BranchInst *BI = dyn_cast<BranchInst>(&I)) {
889 // Count a conditional branch as savings if it becomes unconditional.
890 if (BI->isConditional() &&
891 isa_and_nonnull<ConstantInt>(
892 SimplifiedValues.lookup(BI->getCondition()))) {
893 CurrentSavings += InstrCost;
894 }
895 } else if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
896 if (isa_and_present<ConstantInt>(SimplifiedValues.lookup(SI->getCondition())))
897 CurrentSavings += InstrCost;
898 } else if (Value *V = dyn_cast<Value>(&I)) {
899 // Count an instruction as savings if we can fold it.
900 if (SimplifiedValues.count(V)) {
901 CurrentSavings += InstrCost;
902 }
903 }
904 }
905
906 auto ProfileCount = CalleeBFI->getBlockProfileCount(&BB);
907 CurrentSavings *= *ProfileCount;
908 CycleSavings += CurrentSavings;
909 }
910
911 // Compute the cycle savings per call.
912 auto EntryProfileCount = F.getEntryCount();
913 assert(EntryProfileCount && EntryProfileCount->getCount());
914 auto EntryCount = EntryProfileCount->getCount();
915 CycleSavings += EntryCount / 2;
916 CycleSavings = CycleSavings.udiv(EntryCount);
917
918 // Compute the total savings for the call site.
919 auto *CallerBB = CandidateCall.getParent();
920 BlockFrequencyInfo *CallerBFI = &(GetBFI(*(CallerBB->getParent())));
921 CycleSavings += getCallsiteCost(this->CandidateCall, DL);
922 CycleSavings *= *CallerBFI->getBlockProfileCount(CallerBB);
923
924 // Remove the cost of the cold basic blocks to model the runtime cost more
925 // accurately. Both machine block placement and function splitting could
926 // place cold blocks further from hot blocks.
927 int Size = Cost - ColdSize;
928
929 // Allow tiny callees to be inlined regardless of whether they meet the
930 // savings threshold.
931 Size = Size > InlineSizeAllowance ? Size - InlineSizeAllowance : 1;
932
933 OverrideCycleSavingsAndSizeForTesting(CycleSavings, Size);
934 CostBenefit.emplace(APInt(128, Size), CycleSavings);
935
936 // Let R be the ratio of CycleSavings to Size. We accept the inlining
937 // opportunity if R is really high and reject if R is really low. If R is
938 // somewhere in the middle, we fall back to the cost-based analysis.
939 //
940 // Specifically, let R = CycleSavings / Size, we accept the inlining
941 // opportunity if:
942 //
943 // PSI->getOrCompHotCountThreshold()
944 // R > -------------------------------------------------
945 // getInliningCostBenefitAnalysisSavingsMultiplier()
946 //
947 // and reject the inlining opportunity if:
948 //
949 // PSI->getOrCompHotCountThreshold()
950 // R <= ----------------------------------------------------
951 // getInliningCostBenefitAnalysisProfitableMultiplier()
952 //
953 // Otherwise, we fall back to the cost-based analysis.
954 //
955 // Implementation-wise, use multiplication (CycleSavings * Multiplier,
956 // HotCountThreshold * Size) rather than division to avoid precision loss.
957 APInt Threshold(128, PSI->getOrCompHotCountThreshold());
958 Threshold *= Size;
959
960 APInt UpperBoundCycleSavings = CycleSavings;
961 UpperBoundCycleSavings *= getInliningCostBenefitAnalysisSavingsMultiplier();
962 if (UpperBoundCycleSavings.uge(Threshold))
963 return true;
964
965 APInt LowerBoundCycleSavings = CycleSavings;
966 LowerBoundCycleSavings *=
967 getInliningCostBenefitAnalysisProfitableMultiplier();
968 if (LowerBoundCycleSavings.ult(Threshold))
969 return false;
970
971 // Otherwise, fall back to the cost-based analysis.
972 return std::nullopt;
973 }
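// For example, with a savings multiplier of 8 and a profitable multiplier of 4
// (the defaults of the options above), the checks above amount to: inline if
// CycleSavings * 8 >= HotCountThreshold * Size, refuse if
// CycleSavings * 4 < HotCountThreshold * Size, and otherwise fall back to the
// cost-threshold decision in finalizeAnalysis().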
974
975 InlineResult finalizeAnalysis() override {
976 // Loops generally act a lot like calls in that they act like barriers to
977 // movement, require a certain amount of setup, etc. So when optimising for
978 // size, we penalise any call sites that perform loops. We do this after all
979 // other costs here, so will likely only be dealing with relatively small
980 // functions (and hence DT and LI will hopefully be cheap).
981 auto *Caller = CandidateCall.getFunction();
982 if (Caller->hasMinSize()) {
983 DominatorTree DT(F);
984 LoopInfo LI(DT);
985 int NumLoops = 0;
986 for (Loop *L : LI) {
987 // Ignore loops that will not be executed
988 if (DeadBlocks.count(L->getHeader()))
989 continue;
990 NumLoops++;
991 }
992 addCost(NumLoops * InlineConstants::LoopPenalty);
993 }
994
995 // We applied the maximum possible vector bonus at the beginning. Now,
996 // subtract the excess bonus, if any, from the Threshold before
997 // comparing against Cost.
998 if (NumVectorInstructions <= NumInstructions / 10)
999 Threshold -= VectorBonus;
1000 else if (NumVectorInstructions <= NumInstructions / 2)
1001 Threshold -= VectorBonus / 2;
1002
1003 if (std::optional<int> AttrCost =
1004 getStringFnAttrAsInt(CandidateCall, "function-inline-cost"))
1005 Cost = *AttrCost;
1006
1007 if (std::optional<int> AttrCostMult = getStringFnAttrAsInt(
1008 CandidateCall,
1009 InlineConstants::FunctionInlineCostMultiplierAttributeName))
1010 Cost *= *AttrCostMult;
1011
1012 if (std::optional<int> AttrThreshold =
1013 getStringFnAttrAsInt(CandidateCall, "function-inline-threshold"))
1014 Threshold = *AttrThreshold;
1015
1016 if (auto Result = costBenefitAnalysis()) {
1017 DecidedByCostBenefit = true;
1018 if (*Result)
1019 return InlineResult::success();
1020 else
1021 return InlineResult::failure("Cost over threshold.");
1022 }
1023
1024 if (IgnoreThreshold)
1025 return InlineResult::success();
1026
1027 DecidedByCostThreshold = true;
1028 return Cost < std::max(1, Threshold)
1029 ? InlineResult::success()
1030 : InlineResult::failure("Cost over threshold.");
1031 }
1032
1033 bool shouldStop() override {
1034 if (IgnoreThreshold || ComputeFullInlineCost)
1035 return false;
1036 // Bail out the moment we cross the threshold. This means we'll under-count
1037 // the cost, but only when undercounting doesn't matter.
1038 if (Cost < Threshold)
1039 return false;
1040 DecidedByCostThreshold = true;
1041 return true;
1042 }
1043
1044 void onLoadEliminationOpportunity() override {
1045 LoadEliminationCost += InstrCost;
1046 }
1047
1048 InlineResult onAnalysisStart() override {
1049 // Perform some tweaks to the cost and threshold based on the direct
1050 // callsite information.
1051
1052 // We want to more aggressively inline vector-dense kernels, so up the
1053 // threshold, and we'll lower it if the % of vector instructions gets too
1054 // low. Note that these bonuses are somewhat arbitrary and evolved over
1055 // time by accident as much as because they are principled bonuses.
1056 //
1057 // FIXME: It would be nice to remove all such bonuses. At least it would be
1058 // nice to base the bonus values on something more scientific.
1059 assert(NumInstructions == 0);
1060 assert(NumVectorInstructions == 0);
1061
1062 // Update the threshold based on callsite properties
1063 updateThreshold(CandidateCall, F);
1064
1065 // While Threshold depends on commandline options that can take negative
1066 // values, we want to enforce the invariant that the computed threshold and
1067 // bonuses are non-negative.
1068 assert(Threshold >= 0);
1069 assert(SingleBBBonus >= 0);
1070 assert(VectorBonus >= 0);
1071
1072 // Speculatively apply all possible bonuses to Threshold. If cost exceeds
1073 // this Threshold any time, and cost cannot decrease, we can stop processing
1074 // the rest of the function body.
1075 Threshold += (SingleBBBonus + VectorBonus);
1076
1077 // Give out bonuses for the callsite, as the instructions setting them up
1078 // will be gone after inlining.
1079 addCost(-getCallsiteCost(this->CandidateCall, DL));
1080
1081 // If this function uses the coldcc calling convention, prefer not to inline
1082 // it.
1083 if (F.getCallingConv() == CallingConv::Cold)
1084 Cost += InlineConstants::ColdccPenalty;
1085
1086 LLVM_DEBUG(dbgs() << " Initial cost: " << Cost << "\n");
1087
1088 // Check if we're done. This can happen due to bonuses and penalties.
1089 if (Cost >= Threshold && !ComputeFullInlineCost)
1090 return InlineResult::failure("high cost");
1091
1092 return InlineResult::success();
1093 }
1094
1095public:
1096 InlineCostCallAnalyzer(
1097 Function &Callee, CallBase &Call, const InlineParams &Params,
1098 const TargetTransformInfo &TTI,
1099 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
1100 function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
1101 ProfileSummaryInfo *PSI = nullptr,
1102 OptimizationRemarkEmitter *ORE = nullptr, bool BoostIndirect = true,
1103 bool IgnoreThreshold = false)
1104 : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, PSI, ORE),
1105 ComputeFullInlineCost(OptComputeFullInlineCost ||
1106 Params.ComputeFullInlineCost || ORE ||
1107 isCostBenefitAnalysisEnabled()),
1108 Params(Params), Threshold(Params.DefaultThreshold),
1109 BoostIndirectCalls(BoostIndirect), IgnoreThreshold(IgnoreThreshold),
1110 CostBenefitAnalysisEnabled(isCostBenefitAnalysisEnabled()),
1111 Writer(this) {
1112 AllowRecursiveCall = *Params.AllowRecursiveCall;
1113 }
1114
1115 /// Annotation Writer for instruction details
1116 InlineCostAnnotationWriter Writer;
1117
1118 void dump();
1119
1120 // Prints the same analysis as dump(), but its definition is not dependent
1121 // on the build.
1122 void print(raw_ostream &OS);
1123
1124 std::optional<InstructionCostDetail> getCostDetails(const Instruction *I) {
1125 if (InstructionCostDetailMap.contains(I))
1126 return InstructionCostDetailMap[I];
1127 return std::nullopt;
1128 }
1129
1130 virtual ~InlineCostCallAnalyzer() = default;
1131 int getThreshold() const { return Threshold; }
1132 int getCost() const { return Cost; }
1133 int getStaticBonusApplied() const { return StaticBonusApplied; }
1134 std::optional<CostBenefitPair> getCostBenefitPair() { return CostBenefit; }
1135 bool wasDecidedByCostBenefit() const { return DecidedByCostBenefit; }
1136 bool wasDecidedByCostThreshold() const { return DecidedByCostThreshold; }
1137};
1138
1139// Return true if CB is the sole call to local function Callee.
1140static bool isSoleCallToLocalFunction(const CallBase &CB,
1141 const Function &Callee) {
1142 return Callee.hasLocalLinkage() && Callee.hasOneLiveUse() &&
1143 &Callee == CB.getCalledFunction();
1144}
1145
1146class InlineCostFeaturesAnalyzer final : public CallAnalyzer {
1147private:
1148 InlineCostFeatures Cost = {};
1149
1150 // FIXME: These constants are taken from the heuristic-based cost visitor.
1151 // These should be removed entirely in a later revision to avoid reliance on
1152 // heuristics in the ML inliner.
1153 static constexpr int JTCostMultiplier = 4;
1154 static constexpr int CaseClusterCostMultiplier = 2;
1155 static constexpr int SwitchCostMultiplier = 2;
1156
1157 // FIXME: These are taken from the heuristic-based cost visitor: we should
1158 // eventually abstract these to the CallAnalyzer to avoid duplication.
1159 unsigned SROACostSavingOpportunities = 0;
1160 int VectorBonus = 0;
1161 int SingleBBBonus = 0;
1162 int Threshold = 5;
1163
1164 DenseMap<AllocaInst *, unsigned> SROACosts;
1165
1166 void increment(InlineCostFeatureIndex Feature, int64_t Delta = 1) {
1167 Cost[static_cast<size_t>(Feature)] += Delta;
1168 }
1169
1170 void set(InlineCostFeatureIndex Feature, int64_t Value) {
1171 Cost[static_cast<size_t>(Feature)] = Value;
1172 }
1173
1174 void onDisableSROA(AllocaInst *Arg) override {
1175 auto CostIt = SROACosts.find(Arg);
1176 if (CostIt == SROACosts.end())
1177 return;
1178
1179 increment(InlineCostFeatureIndex::sroa_losses, CostIt->second);
1180 SROACostSavingOpportunities -= CostIt->second;
1181 SROACosts.erase(CostIt);
1182 }
1183
1184 void onDisableLoadElimination() override {
1185 set(InlineCostFeatureIndex::load_elimination, 1);
1186 }
1187
1188 void onCallPenalty() override {
1189 increment(InlineCostFeatureIndex::call_penalty, CallPenalty);
1190 }
1191
1192 void onCallArgumentSetup(const CallBase &Call) override {
1193 increment(InlineCostFeatureIndex::call_argument_setup,
1194 Call.arg_size() * InstrCost);
1195 }
1196
1197 void onLoadRelativeIntrinsic() override {
1198 increment(InlineCostFeatureIndex::load_relative_intrinsic, 3 * InstrCost);
1199 }
1200
1201 void onLoweredCall(Function *F, CallBase &Call,
1202 bool IsIndirectCall) override {
1203 increment(InlineCostFeatureIndex::lowered_call_arg_setup,
1204 Call.arg_size() * InstrCost);
1205
1206 if (IsIndirectCall) {
1207 InlineParams IndirectCallParams = {/* DefaultThreshold*/ 0,
1208 /*HintThreshold*/ {},
1209 /*ColdThreshold*/ {},
1210 /*OptSizeThreshold*/ {},
1211 /*OptMinSizeThreshold*/ {},
1212 /*HotCallSiteThreshold*/ {},
1213 /*LocallyHotCallSiteThreshold*/ {},
1214 /*ColdCallSiteThreshold*/ {},
1215 /*ComputeFullInlineCost*/ true,
1216 /*EnableDeferral*/ true};
1217 IndirectCallParams.DefaultThreshold =
1218 InlineConstants::IndirectCallThreshold;
1219
1220 InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
1221 GetAssumptionCache, GetBFI, PSI, ORE, false,
1222 true);
1223 if (CA.analyze().isSuccess()) {
1224 increment(InlineCostFeatureIndex::nested_inline_cost_estimate,
1225 CA.getCost());
1226 increment(InlineCostFeatureIndex::nested_inlines, 1);
1227 }
1228 } else {
1229 onCallPenalty();
1230 }
1231 }
1232
1233 void onFinalizeSwitch(unsigned JumpTableSize,
1234 unsigned NumCaseCluster) override {
1235
1236 if (JumpTableSize) {
1237 int64_t JTCost = static_cast<int64_t>(JumpTableSize) * InstrCost +
1238 JTCostMultiplier * InstrCost;
1239 increment(InlineCostFeatureIndex::jump_table_penalty, JTCost);
1240 return;
1241 }
1242
1243 if (NumCaseCluster <= 3) {
1244 increment(InlineCostFeatureIndex::case_cluster_penalty,
1245 NumCaseCluster * CaseClusterCostMultiplier * InstrCost);
1246 return;
1247 }
1248
1249 int64_t ExpectedNumberOfCompare =
1250 getExpectedNumberOfCompare(NumCaseCluster);
1251
1252 int64_t SwitchCost =
1253 ExpectedNumberOfCompare * SwitchCostMultiplier * InstrCost;
1254 increment(InlineCostFeatureIndex::switch_penalty, SwitchCost);
1255 }
1256
1257 void onMissedSimplification() override {
1258 increment(InlineCostFeatureIndex::unsimplified_common_instructions,
1259 InstrCost);
1260 }
1261
1262 void onInitializeSROAArg(AllocaInst *Arg) override {
1263 auto SROAArgCost = TTI.getCallerAllocaCost(&CandidateCall, Arg);
1264 SROACosts[Arg] = SROAArgCost;
1265 SROACostSavingOpportunities += SROAArgCost;
1266 }
1267
1268 void onAggregateSROAUse(AllocaInst *Arg) override {
1269 SROACosts.find(Arg)->second += InstrCost;
1270 SROACostSavingOpportunities += InstrCost;
1271 }
1272
1273 void onBlockAnalyzed(const BasicBlock *BB) override {
1274 if (BB->getTerminator()->getNumSuccessors() > 1)
1275 set(InlineCostFeatureIndex::is_multiple_blocks, 1);
1276 Threshold -= SingleBBBonus;
1277 }
1278
1279 InlineResult finalizeAnalysis() override {
1280 auto *Caller = CandidateCall.getFunction();
1281 if (Caller->hasMinSize()) {
1282 DominatorTree DT(F);
1283 LoopInfo LI(DT);
1284 for (Loop *L : LI) {
1285 // Ignore loops that will not be executed
1286 if (DeadBlocks.count(L->getHeader()))
1287 continue;
1288 increment(InlineCostFeatureIndex::num_loops,
1289 InlineConstants::LoopPenalty);
1290 }
1291 }
1292 set(InlineCostFeatureIndex::dead_blocks, DeadBlocks.size());
1293 set(InlineCostFeatureIndex::simplified_instructions,
1294 NumInstructionsSimplified);
1295 set(InlineCostFeatureIndex::constant_args, NumConstantArgs);
1296 set(InlineCostFeatureIndex::constant_offset_ptr_args,
1297 NumConstantOffsetPtrArgs);
1298 set(InlineCostFeatureIndex::sroa_savings, SROACostSavingOpportunities);
1299
1300 if (NumVectorInstructions <= NumInstructions / 10)
1301 Threshold -= VectorBonus;
1302 else if (NumVectorInstructions <= NumInstructions / 2)
1303 Threshold -= VectorBonus / 2;
1304
1305 set(InlineCostFeatureIndex::threshold, Threshold);
1306
1307 return InlineResult::success();
1308 }
1309
1310 bool shouldStop() override { return false; }
1311
1312 void onLoadEliminationOpportunity() override {
1313 increment(InlineCostFeatureIndex::load_elimination, 1);
1314 }
1315
1316 InlineResult onAnalysisStart() override {
1317 increment(InlineCostFeatureIndex::callsite_cost,
1318 -1 * getCallsiteCost(this->CandidateCall, DL));
1319
1320 set(InlineCostFeatureIndex::cold_cc_penalty,
1321 (F.getCallingConv() == CallingConv::Cold));
1322
1323 set(InlineCostFeatureIndex::last_call_to_static_bonus,
1324 isSoleCallToLocalFunction(CandidateCall, F));
1325
1326 // FIXME: we shouldn't repeat this logic in both the Features and Cost
1327 // analyzer - instead, we should abstract it to a common method in the
1328 // CallAnalyzer
1329 int SingleBBBonusPercent = 50;
1330 int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
1331 Threshold += TTI.adjustInliningThreshold(&CandidateCall);
1332 Threshold *= TTI.getInliningThresholdMultiplier();
1333 SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
1334 VectorBonus = Threshold * VectorBonusPercent / 100;
1335 Threshold += (SingleBBBonus + VectorBonus);
1336
1337 return InlineResult::success();
1338 }
1339
1340public:
1341 InlineCostFeaturesAnalyzer(
1342 const TargetTransformInfo &TTI,
1343 function_ref<AssumptionCache &(Function &)> &GetAssumptionCache,
1344 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
1345 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, Function &Callee,
1346 CallBase &Call)
1347 : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, PSI) {}
1348
1349 const InlineCostFeatures &features() const { return Cost; }
1350};
1351
1352} // namespace
1353
1354/// Test whether the given value is an Alloca-derived function argument.
1355bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
1356 return SROAArgValues.count(V);
1357}
1358
1359void CallAnalyzer::disableSROAForArg(AllocaInst *SROAArg) {
1360 onDisableSROA(SROAArg);
1361 EnabledSROAAllocas.erase(SROAArg);
1362 disableLoadElimination();
1363}
1364
1365void InlineCostAnnotationWriter::emitInstructionAnnot(
1366 const Instruction *I, formatted_raw_ostream &OS) {
1367 // The cost of inlining of the given instruction is printed always.
1368 // The threshold delta is printed only when it is non-zero. It happens
1369 // when we decided to give a bonus at a particular instruction.
1370 std::optional<InstructionCostDetail> Record = ICCA->getCostDetails(I);
1371 if (!Record)
1372 OS << "; No analysis for the instruction";
1373 else {
1374 OS << "; cost before = " << Record->CostBefore
1375 << ", cost after = " << Record->CostAfter
1376 << ", threshold before = " << Record->ThresholdBefore
1377 << ", threshold after = " << Record->ThresholdAfter << ", ";
1378 OS << "cost delta = " << Record->getCostDelta();
1379 if (Record->hasThresholdChanged())
1380 OS << ", threshold delta = " << Record->getThresholdDelta();
1381 }
1382 auto C = ICCA->getSimplifiedValue(const_cast<Instruction *>(I));
1383 if (C) {
1384 OS << ", simplified to ";
1385 (*C)->print(OS, true);
1386 }
1387 OS << "\n";
1388}
1389
1390/// If 'V' maps to a SROA candidate, disable SROA for it.
1391void CallAnalyzer::disableSROA(Value *V) {
1392 if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
1393 disableSROAForArg(SROAArg);
1394 }
1395}
1396
1397void CallAnalyzer::disableLoadElimination() {
1398 if (EnableLoadElimination) {
1399 onDisableLoadElimination();
1400 EnableLoadElimination = false;
1401 }
1402}
1403
1404/// Accumulate a constant GEP offset into an APInt if possible.
1405///
1406/// Returns false if unable to compute the offset for any reason. Respects any
1407/// simplified values known during the analysis of this callsite.
1408bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
1409 unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(GEP.getType());
1410 assert(IntPtrWidth == Offset.getBitWidth());
1411
1412 for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
1413 GTI != GTE; ++GTI) {
1414 ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
1415 if (!OpC)
1416 if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
1417 OpC = dyn_cast<ConstantInt>(SimpleOp);
1418 if (!OpC)
1419 return false;
1420 if (OpC->isZero())
1421 continue;
1422
1423 // Handle a struct index, which adds its field offset to the pointer.
1424 if (StructType *STy = GTI.getStructTypeOrNull()) {
1425 unsigned ElementIdx = OpC->getZExtValue();
1426 const StructLayout *SL = DL.getStructLayout(STy);
1427 Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
1428 continue;
1429 }
1430
1431 APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
1432 Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
1433 }
1434 return true;
1435}
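// Example: for "getelementptr inbounds {i32, i64}, ptr %p, i32 0, i32 1" the
// struct branch above adds the field offset of element 1 (8 bytes under a
// typical layout that aligns the i64 member to 8 bytes), while sequential
// indices contribute index * allocation-size of the indexed type.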
1436
1437/// Use TTI to check whether a GEP is free.
1438///
1439/// Respects any simplified values known during the analysis of this callsite.
1440bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
1441 SmallVector<Value *, 4> Operands;
1442 Operands.push_back(GEP.getOperand(0));
1443 for (const Use &Op : GEP.indices())
1444 if (Constant *SimpleOp = SimplifiedValues.lookup(Op))
1445 Operands.push_back(SimpleOp);
1446 else
1447 Operands.push_back(Op);
1448 return TTI.getInstructionCost(&GEP, Operands,
1449 TargetTransformInfo::TCK_SizeAndLatency) ==
1450 TargetTransformInfo::TCC_Free;
1451}
1452
1453bool CallAnalyzer::visitAlloca(AllocaInst &I) {
1454 disableSROA(I.getOperand(0));
1455
1456 // Check whether inlining will turn a dynamic alloca into a static
1457 // alloca and handle that case.
1458 if (I.isArrayAllocation()) {
1459 Constant *Size = SimplifiedValues.lookup(I.getArraySize());
1460 if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
1461 // Sometimes a dynamic alloca could be converted into a static alloca
1462 // after this constant prop, and become a huge static alloca on an
1463 // unconditional CFG path. Avoid inlining if this is going to happen above
1464 // a threshold.
1465 // FIXME: If the threshold is removed or lowered too much, we could end up
1466 // being too pessimistic and prevent inlining non-problematic code. This
1467 // could result in unintended perf regressions. A better overall strategy
1468 // is needed to track stack usage during inlining.
1469 Type *Ty = I.getAllocatedType();
1470 AllocatedSize = SaturatingMultiplyAdd(
1471 AllocSize->getLimitedValue(),
1472 DL.getTypeAllocSize(Ty).getKnownMinValue(), AllocatedSize);
1473 if (AllocatedSize > InlineConstants::MaxSimplifiedDynamicAllocaToInline)
1474 HasDynamicAlloca = true;
1475 return false;
1476 }
1477 }
1478
1479 // Accumulate the allocated size.
1480 if (I.isStaticAlloca()) {
1481 Type *Ty = I.getAllocatedType();
1482 AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty).getKnownMinValue(),
1483 AllocatedSize);
1484 }
1485
1486 // FIXME: This is overly conservative. Dynamic allocas are inefficient for
1487 // a variety of reasons, and so we would like to not inline them into
1488 // functions which don't currently have a dynamic alloca. This simply
1489 // disables inlining altogether in the presence of a dynamic alloca.
1490 if (!I.isStaticAlloca())
1491 HasDynamicAlloca = true;
1492
1493 return false;
1494}
1495
1496bool CallAnalyzer::visitPHI(PHINode &I) {
1497 // FIXME: We need to propagate SROA *disabling* through phi nodes, even
1498 // though we don't want to propagate its bonuses. The idea is to disable
1499 // SROA if it *might* be used in an inappropriate manner.
1500
1501 // Phi nodes are always zero-cost.
1502 // FIXME: Pointer sizes may differ between different address spaces, so do we
1503 // need to use correct address space in the call to getPointerSizeInBits here?
1504 // Or could we skip the getPointerSizeInBits call completely? As far as I can
1505 // see the ZeroOffset is used as a dummy value, so we can probably use any
1506 // bit width for the ZeroOffset?
1507 APInt ZeroOffset = APInt::getZero(DL.getPointerSizeInBits(0));
1508 bool CheckSROA = I.getType()->isPointerTy();
1509
1510 // Track the constant or pointer with constant offset we've seen so far.
1511 Constant *FirstC = nullptr;
1512 std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
1513 Value *FirstV = nullptr;
1514
1515 for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
1516 BasicBlock *Pred = I.getIncomingBlock(i);
1517 // If the incoming block is dead, skip the incoming block.
1518 if (DeadBlocks.count(Pred))
1519 continue;
1520 // If the parent block of phi is not the known successor of the incoming
1521 // block, skip the incoming block.
1522 BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
1523 if (KnownSuccessor && KnownSuccessor != I.getParent())
1524 continue;
1525
1526 Value *V = I.getIncomingValue(i);
1527 // If the incoming value is this phi itself, skip the incoming value.
1528 if (&I == V)
1529 continue;
1530
1531 Constant *C = dyn_cast<Constant>(V);
1532 if (!C)
1533 C = SimplifiedValues.lookup(V);
1534
1535 std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
1536 if (!C && CheckSROA)
1537 BaseAndOffset = ConstantOffsetPtrs.lookup(V);
1538
1539 if (!C && !BaseAndOffset.first)
1540 // The incoming value is neither a constant nor a pointer with constant
1541 // offset, exit early.
1542 return true;
1543
1544 if (FirstC) {
1545 if (FirstC == C)
1546 // If we've seen a constant incoming value before and it is the same
1547 // constant we see this time, continue checking the next incoming value.
1548 continue;
1549 // Otherwise early exit because we either see a different constant or saw
1550 // a constant before but we have a pointer with constant offset this time.
1551 return true;
1552 }
1553
1554 if (FirstV) {
1555 // The same logic as above, but check pointer with constant offset here.
1556 if (FirstBaseAndOffset == BaseAndOffset)
1557 continue;
1558 return true;
1559 }
1560
1561 if (C) {
1562 // This is the 1st time we've seen a constant, record it.
1563 FirstC = C;
1564 continue;
1565 }
1566
1567 // The remaining case is that this is the 1st time we've seen a pointer with
1568 // constant offset, record it.
1569 FirstV = V;
1570 FirstBaseAndOffset = BaseAndOffset;
1571 }
1572
1573 // Check if we can map phi to a constant.
1574 if (FirstC) {
1575 SimplifiedValues[&I] = FirstC;
1576 return true;
1577 }
1578
1579 // Check if we can map phi to a pointer with constant offset.
1580 if (FirstBaseAndOffset.first) {
1581 ConstantOffsetPtrs[&I] = FirstBaseAndOffset;
1582
1583 if (auto *SROAArg = getSROAArgForValueOrNull(FirstV))
1584 SROAArgValues[&I] = SROAArg;
1585 }
1586
1587 return true;
1588}
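// For illustration (hypothetical IR): with the %else edge proven dead at this
// call site,
//     %p = phi i32 [ 4, %then ], [ 8, %else ]
// only the incoming value 4 is live, so %p is recorded in SimplifiedValues as
// the constant 4. A phi over pointers whose live incoming values share one
// base and offset is instead recorded in ConstantOffsetPtrs and keeps its
// SROA candidate alive.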
1589
1590/// Check we can fold GEPs of constant-offset call site argument pointers.
1591/// This requires target data and inbounds GEPs.
1592///
1593/// \return true if the specified GEP can be folded.
1594bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
1595 // Check if we have a base + offset for the pointer.
1596 std::pair<Value *, APInt> BaseAndOffset =
1597 ConstantOffsetPtrs.lookup(I.getPointerOperand());
1598 if (!BaseAndOffset.first)
1599 return false;
1600
1601 // Check if the offset of this GEP is constant, and if so accumulate it
1602 // into Offset.
1603 if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
1604 return false;
1605
1606 // Add the result as a new mapping to Base + Offset.
1607 ConstantOffsetPtrs[&I] = BaseAndOffset;
1608
1609 return true;
1610}
1611
1612bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
1613 auto *SROAArg = getSROAArgForValueOrNull(I.getPointerOperand());
1614
1615 // Lambda to check whether a GEP's indices are all constant.
1616 auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
1617 for (const Use &Op : GEP.indices())
1618 if (!isa<Constant>(Op) && !SimplifiedValues.lookup(Op))
1619 return false;
1620 return true;
1621 };
1622
1623 if (!DisableGEPConstOperand)
1624 if (simplifyInstruction(I))
1625 return true;
1626
1627 if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
1628 if (SROAArg)
1629 SROAArgValues[&I] = SROAArg;
1630
1631 // Constant GEPs are modeled as free.
1632 return true;
1633 }
1634
1635 // Variable GEPs will require math and will disable SROA.
1636 if (SROAArg)
1637 disableSROAForArg(SROAArg);
1638 return isGEPFree(I);
1639}
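// For example (hypothetical IR), with %a an alloca-derived pointer argument:
//     %f = getelementptr inbounds [8 x i32], ptr %a, i32 0, i32 3   ; constant indices
//     %v = getelementptr inbounds [8 x i32], ptr %a, i32 0, i32 %i  ; %i unknown
// The first GEP folds to a constant offset and is modeled as free while the
// SROA candidate for %a survives; the second needs real address arithmetic,
// so SROA is disabled for %a and isGEPFree() decides the remaining cost.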
1640
1641/// Simplify \p I if its operands are constants and update SimplifiedValues.
1642bool CallAnalyzer::simplifyInstruction(Instruction &I) {
1643 SmallVector<Constant *> COps;
1644 for (Value *Op : I.operands()) {
1645 Constant *COp = dyn_cast<Constant>(Op);
1646 if (!COp)
1647 COp = SimplifiedValues.lookup(Op);
1648 if (!COp)
1649 return false;
1650 COps.push_back(COp);
1651 }
1652 auto *C = ConstantFoldInstOperands(&I, COps, DL);
1653 if (!C)
1654 return false;
1655 SimplifiedValues[&I] = C;
1656 return true;
1657}
1658
1659/// Try to simplify a call to llvm.is.constant.
1660///
1661/// Duplicate the argument checking from CallAnalyzer::simplifyCallSite since
1662/// we expect calls of this specific intrinsic to be infrequent.
1663///
1664/// FIXME: Given that we know CB's parent (F) caller
1665/// (CandidateCall->getParent()->getParent()), we might be able to determine
1666/// whether inlining F into F's caller would change how the call to
1667/// llvm.is.constant would evaluate.
1668bool CallAnalyzer::simplifyIntrinsicCallIsConstant(CallBase &CB) {
1669 Value *Arg = CB.getArgOperand(0);
1670 auto *C = dyn_cast<Constant>(Arg);
1671
1672 if (!C)
1673 C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(Arg));
1674
1675 Type *RT = CB.getFunctionType()->getReturnType();
1676 SimplifiedValues[&CB] = ConstantInt::get(RT, C ? 1 : 0);
1677 return true;
1678}
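// Sketch of the effect: for a call such as
//     %c = call i1 @llvm.is.constant.i32(i32 %x)
// the analyzer records i1 1 in SimplifiedValues when %x is a constant (or has
// been simplified to one at this call site) and i1 0 otherwise.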
1679
1680bool CallAnalyzer::simplifyIntrinsicCallObjectSize(CallBase &CB) {
1681 // As per the langref, "The fourth argument to llvm.objectsize determines if
1682 // the value should be evaluated at runtime."
1683 if (cast<ConstantInt>(CB.getArgOperand(3))->isOne())
1684 return false;
1685
1686 Value *V = lowerObjectSizeCall(&cast<IntrinsicInst>(CB), DL, nullptr,
1687 /*MustSucceed=*/true);
1688 Constant *C = dyn_cast_or_null<Constant>(V);
1689 if (C)
1690 SimplifiedValues[&CB] = C;
1691 return C;
1692}
1693
1694bool CallAnalyzer::visitBitCast(BitCastInst &I) {
1695 // Propagate constants through bitcasts.
1696 if (simplifyInstruction(I))
1697 return true;
1698
1699 // Track base/offsets through casts
1700 std::pair<Value *, APInt> BaseAndOffset =
1701 ConstantOffsetPtrs.lookup(I.getOperand(0));
1702 // Casts don't change the offset, just wrap it up.
1703 if (BaseAndOffset.first)
1704 ConstantOffsetPtrs[&I] = BaseAndOffset;
1705
1706 // Also look for SROA candidates here.
1707 if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
1708 SROAArgValues[&I] = SROAArg;
1709
1710 // Bitcasts are always zero cost.
1711 return true;
1712}
1713
1714bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
1715 // Propagate constants through ptrtoint.
1716 if (simplifyInstruction(I))
1717 return true;
1718
1719 // Track base/offset pairs when converted to a plain integer provided the
1720 // integer is large enough to represent the pointer.
1721 unsigned IntegerSize = I.getType()->getScalarSizeInBits();
1722 unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace();
1723 if (IntegerSize == DL.getPointerSizeInBits(AS)) {
1724 std::pair<Value *, APInt> BaseAndOffset =
1725 ConstantOffsetPtrs.lookup(I.getOperand(0));
1726 if (BaseAndOffset.first)
1727 ConstantOffsetPtrs[&I] = BaseAndOffset;
1728 }
1729
1730 // This is really weird. Technically, ptrtoint will disable SROA. However,
1731 // unless that ptrtoint is *used* somewhere in the live basic blocks after
1732 // inlining, it will be nuked, and SROA should proceed. All of the uses which
1733 // would block SROA would also block SROA if applied directly to a pointer,
1734 // and so we can just add the integer in here. The only places where SROA is
1735 // preserved either cannot fire on an integer, or won't in-and-of themselves
1736 // disable SROA (ext) w/o some later use that we would see and disable.
1737 if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
1738 SROAArgValues[&I] = SROAArg;
1739
1740 return TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
1741 TargetTransformInfo::TCC_Free;
1742 }
1743
1744bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
1745 // Propagate constants through inttoptr.
1746 if (simplifyInstruction(I))
1747 return true;
1748
1749 // Track base/offset pairs when round-tripped through a pointer without
1750 // modifications provided the integer is not too large.
1751 Value *Op = I.getOperand(0);
1752 unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
1753 if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
1754 std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
1755 if (BaseAndOffset.first)
1756 ConstantOffsetPtrs[&I] = BaseAndOffset;
1757 }
1758
1759 // "Propagate" SROA here in the same manner as we do for ptrtoint above.
1760 if (auto *SROAArg = getSROAArgForValueOrNull(Op))
1761 SROAArgValues[&I] = SROAArg;
1762
1763 return TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
1764 TargetTransformInfo::TCC_Free;
1765 }
1766
1767bool CallAnalyzer::visitCastInst(CastInst &I) {
1768 // Propagate constants through casts.
1769 if (simplifyInstruction(I))
1770 return true;
1771
1772 // Disable SROA in the face of arbitrary casts we don't explicitly list
1773 // elsewhere.
1774 disableSROA(I.getOperand(0));
1775
1776 // If this is a floating-point cast, and the target says this operation
1777 // is expensive, this may eventually become a library call. Treat the cost
1778 // as such.
1779 switch (I.getOpcode()) {
1780 case Instruction::FPTrunc:
1781 case Instruction::FPExt:
1782 case Instruction::UIToFP:
1783 case Instruction::SIToFP:
1784 case Instruction::FPToUI:
1785 case Instruction::FPToSI:
1786 if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
1787 onCallPenalty();
1788 break;
1789 default:
1790 break;
1791 }
1792
1793 return TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
1794 TargetTransformInfo::TCC_Free;
1795 }
1796
1797bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
1798 return CandidateCall.paramHasAttr(A->getArgNo(), Attr);
1799}
1800
1801bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
1802 // Does the *call site* have the NonNull attribute set on an argument? We
1803 // use the attribute on the call site to memoize any analysis done in the
1804 // caller. This will also trip if the callee function has a non-null
1805 // parameter attribute, but that's a less interesting case because hopefully
1806 // the callee would already have been simplified based on that.
1807 if (Argument *A = dyn_cast<Argument>(V))
1808 if (paramHasAttr(A, Attribute::NonNull))
1809 return true;
1810
1811 // Is this an alloca in the caller? This is distinct from the attribute case
1812 // above because attributes aren't updated within the inliner itself and we
1813 // always want to catch the alloca derived case.
1814 if (isAllocaDerivedArg(V))
1815 // We can actually predict the result of comparisons between an
1816 // alloca-derived value and null. Note that this fires regardless of
1817 // SROA firing.
1818 return true;
1819
1820 return false;
1821}
1822
1823bool CallAnalyzer::allowSizeGrowth(CallBase &Call) {
1824 // If the normal destination of the invoke or the parent block of the call
1825 // site is unreachable-terminated, there is little point in inlining this
1826 // unless there is literally zero cost.
1827 // FIXME: Note that it is possible that an unreachable-terminated block has a
1828 // hot entry. For example, in the scenario below, inlining hot_call_X() may be
1829 // beneficial:
1830 // main() {
1831 // hot_call_1();
1832 // ...
1833 // hot_call_N()
1834 // exit(0);
1835 // }
1836 // For now, we are not handling this corner case here as it is rare in real
1837 // code. In future, we should elaborate this based on BPI and BFI in more
1838 // general threshold adjusting heuristics in updateThreshold().
1839 if (InvokeInst *II = dyn_cast<InvokeInst>(&Call)) {
1840 if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
1841 return false;
1842 } else if (isa<UnreachableInst>(Call.getParent()->getTerminator()))
1843 return false;
1844
1845 return true;
1846}
1847
1848bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call,
1849 BlockFrequencyInfo *CallerBFI) {
1850 // If global profile summary is available, then callsite's coldness is
1851 // determined based on that.
1852 if (PSI && PSI->hasProfileSummary())
1853 return PSI->isColdCallSite(Call, CallerBFI);
1854
1855 // Otherwise we need BFI to be available.
1856 if (!CallerBFI)
1857 return false;
1858
1859 // Determine if the callsite is cold relative to caller's entry. We could
1860 // potentially cache the computation of scaled entry frequency, but the added
1861 // complexity is not worth it unless this scaling shows up high in the
1862 // profiles.
1863 const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
1864 auto CallSiteBB = Call.getParent();
1865 auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
1866 auto CallerEntryFreq =
1867 CallerBFI->getBlockFreq(&(Call.getCaller()->getEntryBlock()));
1868 return CallSiteFreq < CallerEntryFreq * ColdProb;
1869}
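// Worked example (assuming ColdCallSiteRelFreq is 2, i.e. 2%): if the caller's
// entry block has frequency 1000 and the block containing the call site has
// frequency 15, then 15 < 1000 * 2 / 100 = 20, so the call site is considered
// cold and updateThreshold() will clamp the threshold to ColdCallSiteThreshold.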
1870
1871std::optional<int>
1872InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
1873 BlockFrequencyInfo *CallerBFI) {
1874
1875 // If global profile summary is available, then callsite's hotness is
1876 // determined based on that.
1877 if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(Call, CallerBFI))
1878 return Params.HotCallSiteThreshold;
1879
1880 // Otherwise we need BFI to be available and to have a locally hot callsite
1881 // threshold.
1882 if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
1883 return std::nullopt;
1884
1885 // Determine if the callsite is hot relative to caller's entry. We could
1886 // potentially cache the computation of scaled entry frequency, but the added
1887 // complexity is not worth it unless this scaling shows up high in the
1888 // profiles.
1889 const BasicBlock *CallSiteBB = Call.getParent();
1890 BlockFrequency CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
1891 BlockFrequency CallerEntryFreq = CallerBFI->getEntryFreq();
1892 std::optional<BlockFrequency> Limit = CallerEntryFreq.mul(HotCallSiteRelFreq);
1893 if (Limit && CallSiteFreq >= *Limit)
1894 return Params.LocallyHotCallSiteThreshold;
1895
1896 // Otherwise treat it normally.
1897 return std::nullopt;
1898}
1899
1900void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
1901 // If no size growth is allowed for this inlining, set Threshold to 0.
1902 if (!allowSizeGrowth(Call)) {
1903 Threshold = 0;
1904 return;
1905 }
1906
1907 Function *Caller = Call.getCaller();
1908
1909 // return min(A, B) if B is valid.
1910 auto MinIfValid = [](int A, std::optional<int> B) {
1911 return B ? std::min(A, *B) : A;
1912 };
1913
1914 // return max(A, B) if B is valid.
1915 auto MaxIfValid = [](int A, std::optional<int> B) {
1916 return B ? std::max(A, *B) : A;
1917 };
1918
1919 // Various bonus percentages. These are multiplied by Threshold to get the
1920 // bonus values.
1921 // SingleBBBonus: This bonus is applied if the callee has a single reachable
1922 // basic block at the given callsite context. This is speculatively applied
1923 // and withdrawn if more than one basic block is seen.
1924 //
1925 // LastCallToStaticBonus: This large bonus is applied to ensure the inlining
1926 // of the last call to a static function as inlining such functions is
1927 // guaranteed to reduce code size.
1928 //
1929 // These bonus percentages may be set to 0 based on properties of the caller
1930 // and the callsite.
1931 int SingleBBBonusPercent = 50;
1932 int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
1933 int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;
1934
1935 // Lambda to set all the above bonus and bonus percentages to 0.
1936 auto DisallowAllBonuses = [&]() {
1937 SingleBBBonusPercent = 0;
1938 VectorBonusPercent = 0;
1939 LastCallToStaticBonus = 0;
1940 };
1941
1942 // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
1943 // and reduce the threshold if the caller has the necessary attribute.
1944 if (Caller->hasMinSize()) {
1945 Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
1946 // For minsize, we want to disable the single BB bonus and the vector
1947 // bonuses, but not the last-call-to-static bonus. Inlining the last call to
1948 // a static function will, at the minimum, eliminate the parameter setup and
1949 // call/return instructions.
1950 SingleBBBonusPercent = 0;
1951 VectorBonusPercent = 0;
1952 } else if (Caller->hasOptSize())
1953 Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);
1954
1955 // Adjust the threshold based on inlinehint attribute and profile based
1956 // hotness information if the caller does not have MinSize attribute.
1957 if (!Caller->hasMinSize()) {
1958 if (Callee.hasFnAttribute(Attribute::InlineHint))
1959 Threshold = MaxIfValid(Threshold, Params.HintThreshold);
1960
1961 // FIXME: After switching to the new passmanager, simplify the logic below
1962 // by checking only the callsite hotness/coldness as we will reliably
1963 // have local profile information.
1964 //
1965 // Callsite hotness and coldness can be determined if sample profile is
1966 // used (which adds hotness metadata to calls) or if caller's
1967 // BlockFrequencyInfo is available.
1968 BlockFrequencyInfo *CallerBFI = GetBFI ? &(GetBFI(*Caller)) : nullptr;
1969 auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI);
1970 if (!Caller->hasOptSize() && HotCallSiteThreshold) {
1971 LLVM_DEBUG(dbgs() << "Hot callsite.\n");
1972 // FIXME: This should update the threshold only if it exceeds the
1973 // current threshold, but AutoFDO + ThinLTO currently relies on this
1974 // behavior to prevent inlining of hot callsites during ThinLTO
1975 // compile phase.
1976 Threshold = *HotCallSiteThreshold;
1977 } else if (isColdCallSite(Call, CallerBFI)) {
1978 LLVM_DEBUG(dbgs() << "Cold callsite.\n");
1979 // Do not apply bonuses for a cold callsite including the
1980 // LastCallToStatic bonus. While this bonus might result in code size
1981 // reduction, it can cause the size of a non-cold caller to increase
1982 // preventing it from being inlined.
1983 DisallowAllBonuses();
1984 Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
1985 } else if (PSI) {
1986 // Use callee's global profile information only if we have no way of
1987 // determining this via callsite information.
1988 if (PSI->isFunctionEntryHot(&Callee)) {
1989 LLVM_DEBUG(dbgs() << "Hot callee.\n");
1990 // If callsite hotness can not be determined, we may still know
1991 // that the callee is hot and treat it as a weaker hint for threshold
1992 // increase.
1993 Threshold = MaxIfValid(Threshold, Params.HintThreshold);
1994 } else if (PSI->isFunctionEntryCold(&Callee)) {
1995 LLVM_DEBUG(dbgs() << "Cold callee.\n");
1996 // Do not apply bonuses for a cold callee including the
1997 // LastCallToStatic bonus. While this bonus might result in code size
1998 // reduction, it can cause the size of a non-cold caller to increase
1999 // preventing it from being inlined.
2000 DisallowAllBonuses();
2001 Threshold = MinIfValid(Threshold, Params.ColdThreshold);
2002 }
2003 }
2004 }
2005
2006 Threshold += TTI.adjustInliningThreshold(&Call);
2007
2008 // Finally, take the target-specific inlining threshold multiplier into
2009 // account.
2010 Threshold *= TTI.getInliningThresholdMultiplier();
2011
2012 SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
2013 VectorBonus = Threshold * VectorBonusPercent / 100;
2014
2015 // If there is only one call of the function, and it has internal linkage,
2016 // the cost of inlining it drops dramatically. It may seem odd to update
2017 // Cost in updateThreshold, but the bonus depends on the logic in this method.
2018 if (isSoleCallToLocalFunction(Call, F)) {
2019 Cost -= LastCallToStaticBonus;
2020 StaticBonusApplied = LastCallToStaticBonus;
2021 }
2022}
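// Worked example (hypothetical values): starting from a threshold of 225 for a
// callee carrying the inlinehint attribute with Params.HintThreshold = 325,
// MaxIfValid raises Threshold to 325. With a TTI threshold multiplier of 1,
// the speculative bonuses become SingleBBBonus = 325 * 50 / 100 = 162 and, if
// the target reports a 150% vector bonus, VectorBonus = 325 * 150 / 100 = 487.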
2023
2024bool CallAnalyzer::visitCmpInst(CmpInst &I) {
2025 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
2026 // First try to handle simplified comparisons.
2027 if (simplifyInstruction(I))
2028 return true;
2029
2030 if (I.getOpcode() == Instruction::FCmp)
2031 return false;
2032
2033 // Otherwise look for a comparison between constant offset pointers with
2034 // a common base.
2035 Value *LHSBase, *RHSBase;
2036 APInt LHSOffset, RHSOffset;
2037 std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
2038 if (LHSBase) {
2039 std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
2040 if (RHSBase && LHSBase == RHSBase) {
2041 // We have common bases, fold the icmp to a constant based on the
2042 // offsets.
2043 Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
2044 Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
2045 if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
2046 SimplifiedValues[&I] = C;
2047 ++NumConstantPtrCmps;
2048 return true;
2049 }
2050 }
2051 }
2052
2053 auto isImplicitNullCheckCmp = [](const CmpInst &I) {
2054 for (auto *User : I.users())
2055 if (auto *Instr = dyn_cast<Instruction>(User))
2056 if (!Instr->getMetadata(LLVMContext::MD_make_implicit))
2057 return false;
2058 return true;
2059 };
2060
2061 // If the comparison is an equality comparison with null, we can simplify it
2062 // if we know the value (argument) can't be null
2063 if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1))) {
2064 if (isKnownNonNullInCallee(I.getOperand(0))) {
2065 bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
2066 SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
2067 : ConstantInt::getFalse(I.getType());
2068 return true;
2069 }
2070 // Implicit null checks act as unconditional branches and their comparisons
2071 // should be treated as simplified and free of cost.
2072 if (isImplicitNullCheckCmp(I))
2073 return true;
2074 }
2075 return handleSROA(I.getOperand(0), isa<ConstantPointerNull>(I.getOperand(1)));
2076}
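// For example (hypothetical IR), if both operands were traced back to the same
// base pointer %base with constant offsets 8 and 16:
//     %p = getelementptr inbounds i8, ptr %base, i64 8
//     %q = getelementptr inbounds i8, ptr %base, i64 16
//     %c = icmp ult ptr %p, %q
// the icmp folds to i1 1 from the offsets alone, without knowing %base itself.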
2077
2078bool CallAnalyzer::visitSub(BinaryOperator &I) {
2079 // Try to handle a special case: we can fold computing the difference of two
2080 // constant-related pointers.
2081 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
2082 Value *LHSBase, *RHSBase;
2083 APInt LHSOffset, RHSOffset;
2084 std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
2085 if (LHSBase) {
2086 std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
2087 if (RHSBase && LHSBase == RHSBase) {
2088 // We have common bases, fold the subtract to a constant based on the
2089 // offsets.
2090 Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
2091 Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
2092 if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
2093 SimplifiedValues[&I] = C;
2094 ++NumConstantPtrDiffs;
2095 return true;
2096 }
2097 }
2098 }
2099
2100 // Otherwise, fall back to the generic logic for simplifying and handling
2101 // instructions.
2102 return Base::visitSub(I);
2103}
2104
2105bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
2106 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
2107 Constant *CLHS = dyn_cast<Constant>(LHS);
2108 if (!CLHS)
2109 CLHS = SimplifiedValues.lookup(LHS);
2110 Constant *CRHS = dyn_cast<Constant>(RHS);
2111 if (!CRHS)
2112 CRHS = SimplifiedValues.lookup(RHS);
2113
2114 Value *SimpleV = nullptr;
2115 if (auto FI = dyn_cast<FPMathOperator>(&I))
2116 SimpleV = simplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS,
2117 FI->getFastMathFlags(), DL);
2118 else
2119 SimpleV =
2120 simplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);
2121
2122 if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
2123 SimplifiedValues[&I] = C;
2124
2125 if (SimpleV)
2126 return true;
2127
2128 // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
2129 disableSROA(LHS);
2130 disableSROA(RHS);
2131
2132 // If the instruction is floating point, and the target says this operation
2133 // is expensive, this may eventually become a library call. Treat the cost
2134 // as such. Unless it's fneg which can be implemented with an xor.
2135 using namespace llvm::PatternMatch;
2136 if (I.getType()->isFloatingPointTy() &&
2137 TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive &&
2138 !match(&I, m_FNeg(m_Value())))
2139 onCallPenalty();
2140
2141 return false;
2142}
2143
2144bool CallAnalyzer::visitFNeg(UnaryOperator &I) {
2145 Value *Op = I.getOperand(0);
2146 Constant *COp = dyn_cast<Constant>(Op);
2147 if (!COp)
2148 COp = SimplifiedValues.lookup(Op);
2149
2150 Value *SimpleV = simplifyFNegInst(
2151 COp ? COp : Op, cast<FPMathOperator>(I).getFastMathFlags(), DL);
2152
2153 if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
2154 SimplifiedValues[&I] = C;
2155
2156 if (SimpleV)
2157 return true;
2158
2159 // Disable any SROA on arguments to arbitrary, unsimplified fneg.
2160 disableSROA(Op);
2161
2162 return false;
2163}
2164
2165bool CallAnalyzer::visitLoad(LoadInst &I) {
2166 if (handleSROA(I.getPointerOperand(), I.isSimple()))
2167 return true;
2168
2169 // If the data is already loaded from this address and hasn't been clobbered
2170 // by any stores or calls, this load is likely to be redundant and can be
2171 // eliminated.
2172 if (EnableLoadElimination &&
2173 !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
2174 onLoadEliminationOpportunity();
2175 return true;
2176 }
2177
2178 onMemAccess();
2179 return false;
2180}
2181
2182bool CallAnalyzer::visitStore(StoreInst &I) {
2183 if (handleSROA(I.getPointerOperand(), I.isSimple()))
2184 return true;
2185
2186 // The store can potentially clobber loads and prevent repeated loads from
2187 // being eliminated.
2188 // FIXME:
2189 // 1. We can probably keep an initial set of eliminatable loads subtracted
2190 // from the cost even when we finally see a store. We just need to disable
2191 // *further* accumulation of elimination savings.
2192 // 2. We should probably at some point thread MemorySSA for the callee into
2193 // this and then use that to actually compute *really* precise savings.
2194 disableLoadElimination();
2195
2196 onMemAccess();
2197 return false;
2198}
2199
2200bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
2201 // Constant folding for extract value is trivial.
2202 if (simplifyInstruction(I))
2203 return true;
2204
2205 // SROA can't look through these, but they may be free.
2206 return Base::visitExtractValue(I);
2207}
2208
2209bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
2210 // Constant folding for insert value is trivial.
2211 if (simplifyInstruction(I))
2212 return true;
2213
2214 // SROA can't look through these, but they may be free.
2215 return Base::visitInsertValue(I);
2216}
2217
2218/// Try to simplify a call site.
2219///
2220/// Takes a concrete function and callsite and tries to actually simplify it by
2221/// analyzing the arguments and call itself with instsimplify. Returns true if
2222/// it has simplified the callsite to some other entity (a constant), making it
2223/// free.
2224bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) {
2225 // FIXME: Using the instsimplify logic directly for this is inefficient
2226 // because we have to continually rebuild the argument list even when no
2227 // simplifications can be performed. Until that is fixed with remapping
2228 // inside of instsimplify, directly constant fold calls here.
2229 if (!canConstantFoldCallTo(&Call, F))
2230 return false;
2231
2232 // Try to re-map the arguments to constants.
2233 SmallVector<Constant *, 4> ConstantArgs;
2234 ConstantArgs.reserve(Call.arg_size());
2235 for (Value *I : Call.args()) {
2236 Constant *C = dyn_cast<Constant>(I);
2237 if (!C)
2238 C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(I));
2239 if (!C)
2240 return false; // This argument doesn't map to a constant.
2241
2242 ConstantArgs.push_back(C);
2243 }
2244 if (Constant *C = ConstantFoldCall(&Call, F, ConstantArgs)) {
2245 SimplifiedValues[&Call] = C;
2246 return true;
2247 }
2248
2249 return false;
2250}
2251
2252bool CallAnalyzer::visitCallBase(CallBase &Call) {
2253 if (!onCallBaseVisitStart(Call))
2254 return true;
2255
2256 if (Call.hasFnAttr(Attribute::ReturnsTwice) &&
2257 !F.hasFnAttribute(Attribute::ReturnsTwice)) {
2258 // This aborts the entire analysis.
2259 ExposesReturnsTwice = true;
2260 return false;
2261 }
2262 if (isa<CallInst>(Call) && cast<CallInst>(Call).cannotDuplicate())
2263 ContainsNoDuplicateCall = true;
2264
2265 Function *F = Call.getCalledFunction();
2266 bool IsIndirectCall = !F;
2267 if (IsIndirectCall) {
2268 // Check if this happens to be an indirect function call to a known function
2269 // in this inline context. If not, we've done all we can.
2270 Value *Callee = Call.getCalledOperand();
2271 F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
2272 if (!F || F->getFunctionType() != Call.getFunctionType()) {
2273 onCallArgumentSetup(Call);
2274
2275 if (!Call.onlyReadsMemory())
2276 disableLoadElimination();
2277 return Base::visitCallBase(Call);
2278 }
2279 }
2280
2281 assert(F && "Expected a call to a known function");
2282
2283 // When we have a concrete function, first try to simplify it directly.
2284 if (simplifyCallSite(F, Call))
2285 return true;
2286
2287 // Next check if it is an intrinsic we know about.
2288 // FIXME: Lift this into part of the InstVisitor.
2289 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Call)) {
2290 switch (II->getIntrinsicID()) {
2291 default:
2292 if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
2293 disableLoadElimination();
2294 return Base::visitCallBase(Call);
2295
2296 case Intrinsic::load_relative:
2297 onLoadRelativeIntrinsic();
2298 return false;
2299
2300 case Intrinsic::memset:
2301 case Intrinsic::memcpy:
2302 case Intrinsic::memmove:
2303 disableLoadElimination();
2304 // SROA can usually chew through these intrinsics, but they aren't free.
2305 return false;
2306 case Intrinsic::icall_branch_funnel:
2307 case Intrinsic::localescape:
2308 HasUninlineableIntrinsic = true;
2309 return false;
2310 case Intrinsic::vastart:
2311 InitsVargArgs = true;
2312 return false;
2313 case Intrinsic::launder_invariant_group:
2314 case Intrinsic::strip_invariant_group:
2315 if (auto *SROAArg = getSROAArgForValueOrNull(II->getOperand(0)))
2316 SROAArgValues[II] = SROAArg;
2317 return true;
2318 case Intrinsic::is_constant:
2319 return simplifyIntrinsicCallIsConstant(Call);
2320 case Intrinsic::objectsize:
2321 return simplifyIntrinsicCallObjectSize(Call);
2322 }
2323 }
2324
2325 if (F == Call.getFunction()) {
2326 // This flag will fully abort the analysis, so don't bother with anything
2327 // else.
2328 IsRecursiveCall = true;
2329 if (!AllowRecursiveCall)
2330 return false;
2331 }
2332
2333 if (TTI.isLoweredToCall(F)) {
2334 onLoweredCall(F, Call, IsIndirectCall);
2335 }
2336
2337 if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory())))
2338 disableLoadElimination();
2339 return Base::visitCallBase(Call);
2340}
2341
2342bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
2343 // At least one return instruction will be free after inlining.
2344 bool Free = !HasReturn;
2345 HasReturn = true;
2346 return Free;
2347}
2348
2349bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
2350 // We model unconditional branches as essentially free -- they really
2351 // shouldn't exist at all, but handling them makes the behavior of the
2352 // inliner more regular and predictable. Interestingly, conditional branches
2353 // which will fold away are also free.
2354 return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
2355 BI.getMetadata(LLVMContext::MD_make_implicit) ||
2356 isa_and_nonnull<ConstantInt>(
2357 SimplifiedValues.lookup(BI.getCondition()));
2358}
2359
2360bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
2361 bool CheckSROA = SI.getType()->isPointerTy();
2362 Value *TrueVal = SI.getTrueValue();
2363 Value *FalseVal = SI.getFalseValue();
2364
2365 Constant *TrueC = dyn_cast<Constant>(TrueVal);
2366 if (!TrueC)
2367 TrueC = SimplifiedValues.lookup(TrueVal);
2368 Constant *FalseC = dyn_cast<Constant>(FalseVal);
2369 if (!FalseC)
2370 FalseC = SimplifiedValues.lookup(FalseVal);
2371 Constant *CondC =
2372 dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));
2373
2374 if (!CondC) {
2375 // Select C, X, X => X
2376 if (TrueC == FalseC && TrueC) {
2377 SimplifiedValues[&SI] = TrueC;
2378 return true;
2379 }
2380
2381 if (!CheckSROA)
2382 return Base::visitSelectInst(SI);
2383
2384 std::pair<Value *, APInt> TrueBaseAndOffset =
2385 ConstantOffsetPtrs.lookup(TrueVal);
2386 std::pair<Value *, APInt> FalseBaseAndOffset =
2387 ConstantOffsetPtrs.lookup(FalseVal);
2388 if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
2389 ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;
2390
2391 if (auto *SROAArg = getSROAArgForValueOrNull(TrueVal))
2392 SROAArgValues[&SI] = SROAArg;
2393 return true;
2394 }
2395
2396 return Base::visitSelectInst(SI);
2397 }
2398
2399 // Select condition is a constant.
2400 Value *SelectedV = CondC->isAllOnesValue() ? TrueVal
2401 : (CondC->isNullValue()) ? FalseVal
2402 : nullptr;
2403 if (!SelectedV) {
2404 // Condition is a vector constant that is not all 1s or all 0s. If all
2405 // operands are constants, ConstantFoldSelectInstruction() can handle the
2406 // cases such as select vectors.
2407 if (TrueC && FalseC) {
2408 if (auto *C = ConstantFoldSelectInstruction(CondC, TrueC, FalseC)) {
2409 SimplifiedValues[&SI] = C;
2410 return true;
2411 }
2412 }
2413 return Base::visitSelectInst(SI);
2414 }
2415
2416 // Condition is either all 1s or all 0s. SI can be simplified.
2417 if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
2418 SimplifiedValues[&SI] = SelectedC;
2419 return true;
2420 }
2421
2422 if (!CheckSROA)
2423 return true;
2424
2425 std::pair<Value *, APInt> BaseAndOffset =
2426 ConstantOffsetPtrs.lookup(SelectedV);
2427 if (BaseAndOffset.first) {
2428 ConstantOffsetPtrs[&SI] = BaseAndOffset;
2429
2430 if (auto *SROAArg = getSROAArgForValueOrNull(SelectedV))
2431 SROAArgValues[&SI] = SROAArg;
2432 }
2433
2434 return true;
2435}
2436
2437bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
2438 // We model unconditional switches as free, see the comments on handling
2439 // branches.
2440 if (isa<ConstantInt>(SI.getCondition()))
2441 return true;
2442 if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
2443 if (isa<ConstantInt>(V))
2444 return true;
2445
2446 // Assume the most general case where the switch is lowered into
2447 // either a jump table, bit test, or a balanced binary tree consisting of
2448 // case clusters without merging adjacent clusters with the same
2449 // destination. We do not consider the switches that are lowered with a mix
2450 // of jump table/bit test/binary search tree. The cost of the switch is
2451 // proportional to the size of the tree or the size of jump table range.
2452 //
2453 // NB: We convert large switches which are just used to initialize large phi
2454 // nodes to lookup tables instead in simplifycfg, so this shouldn't prevent
2455 // inlining those. It will prevent inlining in cases where the optimization
2456 // does not (yet) fire.
2457
2458 unsigned JumpTableSize = 0;
2459 BlockFrequencyInfo *BFI = GetBFI ? &(GetBFI(F)) : nullptr;
2460 unsigned NumCaseCluster =
2461 TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize, PSI, BFI);
2462
2463 onFinalizeSwitch(JumpTableSize, NumCaseCluster);
2464 return false;
2465}
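// For instance, a switch over i32 whose cases form a dense range 0..9 is
// likely lowered as a jump table, so JumpTableSize reflects that range, while
// a sparse switch with cases {1, 100, 10000} is costed as a balanced binary
// tree over its case clusters; onFinalizeSwitch() charges the call site based
// on whichever shape applies.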
2466
2467bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
2468 // We never want to inline functions that contain an indirectbr. This is
2469 // incorrect because all the blockaddress's (in static global initializers
2470 // for example) would be referring to the original function, and this
2471 // indirect jump would jump from the inlined copy of the function into the
2472 // original function which is extremely undefined behavior.
2473 // FIXME: This logic isn't really right; we can safely inline functions with
2474 // indirectbr's as long as no other function or global references the
2475 // blockaddress of a block within the current function.
2476 HasIndirectBr = true;
2477 return false;
2478}
2479
2480bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
2481 // FIXME: It's not clear that a single instruction is an accurate model for
2482 // the inline cost of a resume instruction.
2483 return false;
2484}
2485
2486bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
2487 // FIXME: It's not clear that a single instruction is an accurate model for
2488 // the inline cost of a cleanupret instruction.
2489 return false;
2490}
2491
2492bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
2493 // FIXME: It's not clear that a single instruction is an accurate model for
2494 // the inline cost of a catchret instruction.
2495 return false;
2496}
2497
2498bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
2499 // FIXME: It might be reasonable to discount the cost of instructions leading
2500 // to unreachable as they have the lowest possible impact on both runtime and
2501 // code size.
2502 return true; // No actual code is needed for unreachable.
2503}
2504
2505bool CallAnalyzer::visitInstruction(Instruction &I) {
2506 // Some instructions are free. All of the free intrinsics can also be
2507 // handled by SROA, etc.
2508 if (TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
2509 TargetTransformInfo::TCC_Free)
2510 return true;
2511
2512 // We found something we don't understand or can't handle. Mark any SROA-able
2513 // values in the operand list as no longer viable.
2514 for (const Use &Op : I.operands())
2515 disableSROA(Op);
2516
2517 return false;
2518}
2519
2520/// Analyze a basic block for its contribution to the inline cost.
2521///
2522/// This method walks the analyzer over every instruction in the given basic
2523/// block and accounts for their cost during inlining at this callsite. It
2524/// aborts early if the threshold has been exceeded or an impossible to inline
2525/// construct has been detected. It returns false if inlining is no longer
2526/// viable, and true if inlining remains viable.
2527 InlineResult
2528 CallAnalyzer::analyzeBlock(BasicBlock *BB,
2529 SmallPtrSetImpl<const Value *> &EphValues) {
2530 for (Instruction &I : *BB) {
2531 // FIXME: Currently, the number of instructions in a function, regardless of
2532 // our ability to simplify them during inlining to constants or dead code,
2533 // is actually used by the vector bonus heuristic. As long as that's true,
2534 // we have to special case debug intrinsics here to prevent differences in
2535 // inlining due to debug symbols. Eventually, the number of unsimplified
2536 // instructions shouldn't factor into the cost computation, but until then,
2537 // hack around it here.
2538 // Similarly, skip pseudo-probes.
2539 if (I.isDebugOrPseudoInst())
2540 continue;
2541
2542 // Skip ephemeral values.
2543 if (EphValues.count(&I))
2544 continue;
2545
2546 ++NumInstructions;
2547 if (isa<ExtractElementInst>(I) || I.getType()->isVectorTy())
2548 ++NumVectorInstructions;
2549
2550 // If the instruction simplified to a constant, there is no cost to this
2551 // instruction. Visit the instructions using our InstVisitor to account for
2552 // all of the per-instruction logic. The visit tree returns true if we
2553 // consumed the instruction in any way, and false if the instruction's base
2554 // cost should count against inlining.
2555 onInstructionAnalysisStart(&I);
2556
2557 if (Base::visit(&I))
2558 ++NumInstructionsSimplified;
2559 else
2560 onMissedSimplification();
2561
2562 onInstructionAnalysisFinish(&I);
2563 using namespace ore;
2564 // If visiting this instruction detected an uninlinable pattern, abort.
2565 InlineResult IR = InlineResult::success();
2566 if (IsRecursiveCall && !AllowRecursiveCall)
2567 IR = InlineResult::failure("recursive");
2568 else if (ExposesReturnsTwice)
2569 IR = InlineResult::failure("exposes returns twice");
2570 else if (HasDynamicAlloca)
2571 IR = InlineResult::failure("dynamic alloca");
2572 else if (HasIndirectBr)
2573 IR = InlineResult::failure("indirect branch");
2574 else if (HasUninlineableIntrinsic)
2575 IR = InlineResult::failure("uninlinable intrinsic");
2576 else if (InitsVargArgs)
2577 IR = InlineResult::failure("varargs");
2578 if (!IR.isSuccess()) {
2579 if (ORE)
2580 ORE->emit([&]() {
2581 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
2582 &CandidateCall)
2583 << NV("Callee", &F) << " has uninlinable pattern ("
2584 << NV("InlineResult", IR.getFailureReason())
2585 << ") and cost is not fully computed";
2586 });
2587 return IR;
2588 }
2589
2590 // If the caller is a recursive function then we don't want to inline
2591 // functions which allocate a lot of stack space because it would increase
2592 // the caller stack usage dramatically.
2593 if (IsCallerRecursive && AllocatedSize > RecurStackSizeThreshold) {
2594 auto IR =
2595 InlineResult::failure("recursive and allocates too much stack space");
2596 if (ORE)
2597 ORE->emit([&]() {
2598 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
2599 &CandidateCall)
2600 << NV("Callee", &F) << " is "
2601 << NV("InlineResult", IR.getFailureReason())
2602 << ". Cost is not fully computed";
2603 });
2604 return IR;
2605 }
2606
2607 if (shouldStop())
2608 return InlineResult::failure(
2609 "Call site analysis is not favorable to inlining.");
2610 }
2611
2612 return InlineResult::success();
2613}
2614
2615/// Compute the base pointer and cumulative constant offsets for V.
2616///
2617/// This strips all constant offsets off of V, leaving it the base pointer, and
2618/// accumulates the total constant offset applied in the returned constant. It
2619 // returns nullptr if V is not a pointer, and returns the constant '0' if there are
2620/// no constant offsets applied.
2621ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
2622 if (!V->getType()->isPointerTy())
2623 return nullptr;
2624
2625 unsigned AS = V->getType()->getPointerAddressSpace();
2626 unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
2627 APInt Offset = APInt::getZero(IntPtrWidth);
2628
2629 // Even though we don't look through PHI nodes, we could be called on an
2630 // instruction in an unreachable block, which may be on a cycle.
2631 SmallPtrSet<Value *, 4> Visited;
2632 Visited.insert(V);
2633 do {
2634 if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
2635 if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
2636 return nullptr;
2637 V = GEP->getPointerOperand();
2638 } else if (Operator::getOpcode(V) == Instruction::BitCast) {
2639 V = cast<Operator>(V)->getOperand(0);
2640 } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
2641 if (GA->isInterposable())
2642 break;
2643 V = GA->getAliasee();
2644 } else {
2645 break;
2646 }
2647 assert(V->getType()->isPointerTy() && "Unexpected operand type!");
2648 } while (Visited.insert(V).second);
2649
2650 Type *IdxPtrTy = DL.getIndexType(V->getType());
2651 return cast<ConstantInt>(ConstantInt::get(IdxPtrTy, Offset));
2652}
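// For example (hypothetical IR), passing the constant expression
//     getelementptr inbounds ([4 x i32], ptr @g, i64 0, i64 2)
// as a call argument strips down to the base @g with an accumulated offset of
// 8 bytes, so analyze() records the pair (@g, 8) in ConstantOffsetPtrs for the
// corresponding formal argument.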
2653
2654/// Find dead blocks due to deleted CFG edges during inlining.
2655///
2656/// If we know the successor of the current block, \p CurrBB, has to be \p
2657/// NextBB, the other successors of \p CurrBB are dead if these successors have
2658/// no live incoming CFG edges. If one block is found to be dead, we can
2659/// continue growing the dead block list by checking the successors of the dead
2660/// blocks to see if all their incoming edges are dead or not.
2661void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
2662 auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
2663 // A CFG edge is dead if the predecessor is dead or the predecessor has a
2664 // known successor which is not the one under exam.
2665 return (DeadBlocks.count(Pred) ||
2666 (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ));
2667 };
2668
2669 auto IsNewlyDead = [&](BasicBlock *BB) {
2670 // If all the edges to a block are dead, the block is also dead.
2671 return (!DeadBlocks.count(BB) &&
2672 llvm::all_of(predecessors(BB),
2673 [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
2674 };
2675
2676 for (BasicBlock *Succ : successors(CurrBB)) {
2677 if (Succ == NextBB || !IsNewlyDead(Succ))
2678 continue;
2679 SmallVector<BasicBlock *, 4> NewDead;
2680 NewDead.push_back(Succ);
2681 while (!NewDead.empty()) {
2682 BasicBlock *Dead = NewDead.pop_back_val();
2683 if (DeadBlocks.insert(Dead).second)
2684 // Continue growing the dead block lists.
2685 for (BasicBlock *S : successors(Dead))
2686 if (IsNewlyDead(S))
2687 NewDead.push_back(S);
2688 }
2689 }
2690}
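// Small CFG sketch: if block A ends in "br i1 %c, label %B, label %C" and %c
// simplifies to true at this call site, analyze() records KnownSuccessors[A] =
// B and calls findDeadBlocks(A, B). The edge A->C is then dead, so C joins
// DeadBlocks provided it has no other live predecessor, and C's own successors
// are re-examined the same way.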
2691
2692/// Analyze a call site for potential inlining.
2693///
2694/// Returns true if inlining this call is viable, and false if it is not
2695/// viable. It computes the cost and adjusts the threshold based on numerous
2696/// factors and heuristics. If this method returns false but the computed cost
2697/// is below the computed threshold, then inlining was forcibly disabled by
2698/// some artifact of the routine.
2699InlineResult CallAnalyzer::analyze() {
2700 ++NumCallsAnalyzed;
2701
2702 auto Result = onAnalysisStart();
2703 if (!Result.isSuccess())
2704 return Result;
2705
2706 if (F.empty())
2707 return InlineResult::success();
2708
2709 Function *Caller = CandidateCall.getFunction();
2710 // Check if the caller function is recursive itself.
2711 for (User *U : Caller->users()) {
2712 CallBase *Call = dyn_cast<CallBase>(U);
2713 if (Call && Call->getFunction() == Caller) {
2714 IsCallerRecursive = true;
2715 break;
2716 }
2717 }
2718
2719 // Populate our simplified values by mapping from function arguments to call
2720 // arguments with known important simplifications.
2721 auto CAI = CandidateCall.arg_begin();
2722 for (Argument &FAI : F.args()) {
2723 assert(CAI != CandidateCall.arg_end());
2724 if (Constant *C = dyn_cast<Constant>(CAI))
2725 SimplifiedValues[&FAI] = C;
2726
2727 Value *PtrArg = *CAI;
2728 if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
2729 ConstantOffsetPtrs[&FAI] = std::make_pair(PtrArg, C->getValue());
2730
2731 // We can SROA any pointer arguments derived from alloca instructions.
2732 if (auto *SROAArg = dyn_cast<AllocaInst>(PtrArg)) {
2733 SROAArgValues[&FAI] = SROAArg;
2734 onInitializeSROAArg(SROAArg);
2735 EnabledSROAAllocas.insert(SROAArg);
2736 }
2737 }
2738 ++CAI;
2739 }
2740 NumConstantArgs = SimplifiedValues.size();
2741 NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
2742 NumAllocaArgs = SROAArgValues.size();
2743
2744 // FIXME: If a caller has multiple calls to a callee, we end up recomputing
2745 // the ephemeral values multiple times (and they're completely determined by
2746 // the callee, so this is purely duplicate work).
2747 SmallPtrSet<const Value *, 32> EphValues;
2748 CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues);
2749
2750 // The worklist of live basic blocks in the callee *after* inlining. We avoid
2751 // adding basic blocks of the callee which can be proven to be dead for this
2752 // particular call site in order to get more accurate cost estimates. This
2753 // requires a somewhat heavyweight iteration pattern: we need to walk the
2754 // basic blocks in a breadth-first order as we insert live successors. To
2755 // accomplish this, prioritizing for small iterations because we exit after
2756 // crossing our threshold, we use a small-size optimized SetVector.
2757 typedef SmallSetVector<BasicBlock *, 16> BBSetVector;
2758 BBSetVector BBWorklist;
2759 BBWorklist.insert(&F.getEntryBlock());
2760
2761 // Note that we *must not* cache the size, this loop grows the worklist.
2762 for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
2763 if (shouldStop())
2764 break;
2765
2766 BasicBlock *BB = BBWorklist[Idx];
2767 if (BB->empty())
2768 continue;
2769
2770 onBlockStart(BB);
2771
2772 // Disallow inlining a blockaddress with uses other than strictly callbr.
2773 // A blockaddress only has defined behavior for an indirect branch in the
2774 // same function, and we do not currently support inlining indirect
2775 // branches. But, the inliner may not see an indirect branch that ends up
2776 // being dead code at a particular call site. If the blockaddress escapes
2777 // the function, e.g., via a global variable, inlining may lead to an
2778 // invalid cross-function reference.
2779 // FIXME: pr/39560: continue relaxing this overt restriction.
2780 if (BB->hasAddressTaken())
2781 for (User *U : BlockAddress::get(&*BB)->users())
2782 if (!isa<CallBrInst>(*U))
2783 return InlineResult::failure("blockaddress used outside of callbr");
2784
2785 // Analyze the cost of this block. If we blow through the threshold, this
2786 // returns false, and we can bail on out.
2787 InlineResult IR = analyzeBlock(BB, EphValues);
2788 if (!IR.isSuccess())
2789 return IR;
2790
2791 Instruction *TI = BB->getTerminator();
2792
2793 // Add in the live successors by first checking whether we have terminator
2794 // that may be simplified based on the values simplified by this call.
2795 if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
2796 if (BI->isConditional()) {
2797 Value *Cond = BI->getCondition();
2798 if (ConstantInt *SimpleCond =
2799 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
2800 BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
2801 BBWorklist.insert(NextBB);
2802 KnownSuccessors[BB] = NextBB;
2803 findDeadBlocks(BB, NextBB);
2804 continue;
2805 }
2806 }
2807 } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
2808 Value *Cond = SI->getCondition();
2809 if (ConstantInt *SimpleCond =
2810 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
2811 BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
2812 BBWorklist.insert(NextBB);
2813 KnownSuccessors[BB] = NextBB;
2814 findDeadBlocks(BB, NextBB);
2815 continue;
2816 }
2817 }
2818
2819 // If we're unable to select a particular successor, just count all of
2820 // them.
2821 for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
2822 ++TIdx)
2823 BBWorklist.insert(TI->getSuccessor(TIdx));
2824
2825 onBlockAnalyzed(BB);
2826 }
2827
2828 // If this is a noduplicate call, we can still inline as long as
2829 // inlining this would cause the removal of the caller (so the instruction
2830 // is not actually duplicated, just moved).
2831 if (!isSoleCallToLocalFunction(CandidateCall, F) && ContainsNoDuplicateCall)
2832 return InlineResult::failure("noduplicate");
2833
2834 // If the callee's stack size exceeds the user-specified threshold,
2835 // do not let it be inlined.
2836 // The command line option overrides a limit set in the function attributes.
2837 size_t FinalStackSizeThreshold = StackSizeThreshold;
2838 if (!StackSizeThreshold.getNumOccurrences())
2839 if (std::optional<int> AttrMaxStackSize = getStringFnAttrAsInt(
2840 Caller, InlineConstants::MaxInlineStackSizeAttributeName))
2841 FinalStackSizeThreshold = *AttrMaxStackSize;
2842 if (AllocatedSize > FinalStackSizeThreshold)
2843 return InlineResult::failure("stacksize");
2844
2845 return finalizeAnalysis();
2846}
2847
2848void InlineCostCallAnalyzer::print(raw_ostream &OS) {
2849#define DEBUG_PRINT_STAT(x) OS << " " #x ": " << x << "\n"
2850 if (PrintInstructionComments)
2851 F.print(OS, &Writer);
2852 DEBUG_PRINT_STAT(NumConstantArgs);
2853 DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
2854 DEBUG_PRINT_STAT(NumAllocaArgs);
2855 DEBUG_PRINT_STAT(NumConstantPtrCmps);
2856 DEBUG_PRINT_STAT(NumConstantPtrDiffs);
2857 DEBUG_PRINT_STAT(NumInstructionsSimplified);
2858 DEBUG_PRINT_STAT(NumInstructions);
2859 DEBUG_PRINT_STAT(SROACostSavings);
2860 DEBUG_PRINT_STAT(SROACostSavingsLost);
2861 DEBUG_PRINT_STAT(LoadEliminationCost);
2862 DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
2863 DEBUG_PRINT_STAT(Cost);
2864 DEBUG_PRINT_STAT(Threshold);
2865#undef DEBUG_PRINT_STAT
2866}
2867
2868#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2869/// Dump stats about this call's analysis.
2870LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() { print(dbgs()); }
2871#endif
2872
2873/// Test that there are no attribute conflicts between Caller and Callee
2874/// that prevent inlining.
2875 static bool functionsHaveCompatibleAttributes(
2876 Function *Caller, Function *Callee, TargetTransformInfo &TTI,
2877 function_ref<const TargetLibraryInfo &(Function &)> &GetTLI) {
2878 // Note that CalleeTLI must be a copy not a reference. The legacy pass manager
2879 // caches the most recently created TLI in the TargetLibraryInfoWrapperPass
2880 // object, and always returns the same object (which is overwritten on each
2881 // GetTLI call). Therefore we copy the first result.
2882 auto CalleeTLI = GetTLI(*Callee);
2883 return (IgnoreTTIInlineCompatible ||
2884 TTI.areInlineCompatible(Caller, Callee)) &&
2885 GetTLI(*Caller).areInlineCompatible(CalleeTLI,
2886 InlineCallerSupersetNoBuiltin) &&
2887 AttributeFuncs::areInlineCompatible(*Caller, *Callee);
2888}
2889
2890 int llvm::getCallsiteCost(const CallBase &Call, const DataLayout &DL) {
2891 int64_t Cost = 0;
2892 for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) {
2893 if (Call.isByValArgument(I)) {
2894 // We approximate the number of loads and stores needed by dividing the
2895 // size of the byval type by the target's pointer size.
2896 PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
2897 unsigned TypeSize = DL.getTypeSizeInBits(Call.getParamByValType(I));
2898 unsigned AS = PTy->getAddressSpace();
2899 unsigned PointerSize = DL.getPointerSizeInBits(AS);
2900 // Ceiling division.
2901 unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
2902
2903 // If it generates more than 8 stores it is likely to be expanded as an
2904 // inline memcpy so we take that as an upper bound. Otherwise we assume
2905 // one load and one store per word copied.
2906 // FIXME: The maxStoresPerMemcpy setting from the target should be used
2907 // here instead of a magic number of 8, but it's not available via
2908 // DataLayout.
2909 NumStores = std::min(NumStores, 8U);
2910
2911 Cost += 2 * NumStores * InstrCost;
2912 } else {
2913 // For non-byval arguments subtract off one instruction per call
2914 // argument.
2915 Cost += InstrCost;
2916 }
2917 }
2918 // The call instruction also disappears after inlining.
2919 Cost += InstrCost;
2920 Cost += CallPenalty;
2921 return std::min<int64_t>(Cost, INT_MAX);
2922}
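// Worked example: a byval argument of 256 bits on a target with 64-bit
// pointers gives NumStores = ceil(256 / 64) = 4, contributing 2 * 4 * InstrCost
// (a load and a store per word); every other argument and the call itself add
// InstrCost each, plus CallPenalty for the vanished call.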
2923
2924 InlineCost llvm::getInlineCost(
2925 CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
2926 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
2927 function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
2928 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
2929 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
2930 return getInlineCost(Call, Call.getCalledFunction(), Params, CalleeTTI,
2931 GetAssumptionCache, GetTLI, GetBFI, PSI, ORE);
2932}
2933
2934 std::optional<int> llvm::getInliningCostEstimate(
2935 CallBase &Call, TargetTransformInfo &CalleeTTI,
2936 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
2937 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
2938 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
2939 const InlineParams Params = {/* DefaultThreshold*/ 0,
2940 /*HintThreshold*/ {},
2941 /*ColdThreshold*/ {},
2942 /*OptSizeThreshold*/ {},
2943 /*OptMinSizeThreshold*/ {},
2944 /*HotCallSiteThreshold*/ {},
2945 /*LocallyHotCallSiteThreshold*/ {},
2946 /*ColdCallSiteThreshold*/ {},
2947 /*ComputeFullInlineCost*/ true,
2948 /*EnableDeferral*/ true};
2949
2950 InlineCostCallAnalyzer CA(*Call.getCalledFunction(), Call, Params, CalleeTTI,
2951 GetAssumptionCache, GetBFI, PSI, ORE, true,
2952 /*IgnoreThreshold*/ true);
2953 auto R = CA.analyze();
2954 if (!R.isSuccess())
2955 return std::nullopt;
2956 return CA.getCost();
2957}
2958
2959std::optional<InlineCostFeatures> llvm::getInliningCostFeatures(
2960 CallBase &Call, TargetTransformInfo &CalleeTTI,
2961 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
2962 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
2963 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
2964 InlineCostFeaturesAnalyzer CFA(CalleeTTI, GetAssumptionCache, GetBFI, PSI,
2965 ORE, *Call.getCalledFunction(), Call);
2966 auto R = CFA.analyze();
2967 if (!R.isSuccess())
2968 return std::nullopt;
2969 return CFA.features();
2970}
2971
2972 std::optional<InlineResult> llvm::getAttributeBasedInliningDecision(
2973 CallBase &Call, Function *Callee, TargetTransformInfo &CalleeTTI,
2974 function_ref<const TargetLibraryInfo &(Function &)> GetTLI) {
2975
2976 // Cannot inline indirect calls.
2977 if (!Callee)
2978 return InlineResult::failure("indirect call");
2979
2980 // When a callee coroutine function is inlined into a caller coroutine
2981 // function before the coro-split pass, the coro-early pass cannot handle
2982 // it well.
2983 // So we do not inline a coroutine function that has not been split yet.
2984 if (Callee->isPresplitCoroutine())
2985 return InlineResult::failure("unsplited coroutine call");
2986
2987 // Never inline calls with byval arguments that do not have the alloca
2988 // address space. Since byval arguments can be replaced with a copy to an
2989 // alloca, the inlined code would need to be adjusted to handle that the
2990 // argument is in the alloca address space (so it is a little bit complicated
2991 // to solve).
2992 unsigned AllocaAS = Callee->getParent()->getDataLayout().getAllocaAddrSpace();
2993 for (unsigned I = 0, E = Call.arg_size(); I != E; ++I)
2994 if (Call.isByValArgument(I)) {
2995 PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
2996 if (PTy->getAddressSpace() != AllocaAS)
2997 return InlineResult::failure("byval arguments without alloca"
2998 " address space");
2999 }
3000
3001 // Calls to functions with always-inline attributes should be inlined
3002 // whenever possible.
3003 if (Call.hasFnAttr(Attribute::AlwaysInline)) {
3004 if (Call.getAttributes().hasFnAttr(Attribute::NoInline))
3005 return InlineResult::failure("noinline call site attribute");
3006
3007 auto IsViable = isInlineViable(*Callee);
3008 if (IsViable.isSuccess())
3009 return InlineResult::success();
3010 return InlineResult::failure(IsViable.getFailureReason());
3011 }
3012
3013 // Never inline functions with conflicting attributes (unless callee has
3014 // always-inline attribute).
3015 Function *Caller = Call.getCaller();
3016 if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI, GetTLI))
3017 return InlineResult::failure("conflicting attributes");
3018
3019 // Don't inline this call if the caller has the optnone attribute.
3020 if (Caller->hasOptNone())
3021 return InlineResult::failure("optnone attribute");
3022
3023 // Don't inline a function that treats null pointer as valid into a caller
3024 // that does not have this attribute.
3025 if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined())
3026 return InlineResult::failure("nullptr definitions incompatible");
3027
3028 // Don't inline functions which can be interposed at link-time.
3029 if (Callee->isInterposable())
3030 return InlineResult::failure("interposable");
3031
3032 // Don't inline functions marked noinline.
3033 if (Callee->hasFnAttribute(Attribute::NoInline))
3034 return InlineResult::failure("noinline function attribute");
3035
3036 // Don't inline call sites marked noinline.
3037 if (Call.isNoInline())
3038 return InlineResult::failure("noinline call site attribute");
3039
3040 return std::nullopt;
3041}
3042
3043 InlineCost llvm::getInlineCost(
3044 CallBase &Call, Function *Callee, const InlineParams &Params,
3045 TargetTransformInfo &CalleeTTI,
3046 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
3047 function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
3048 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
3049 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
3050
3051 auto UserDecision =
3052 llvm::getAttributeBasedInliningDecision(Call, Callee, CalleeTTI, GetTLI);
3053
3054 if (UserDecision) {
3055 if (UserDecision->isSuccess())
3056 return llvm::InlineCost::getAlways("always inline attribute");
3057 return llvm::InlineCost::getNever(UserDecision->getFailureReason());
3058 }
3059
3060 LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
3061 << "... (caller:" << Call.getCaller()->getName()
3062 << ")\n");
3063
3064 InlineCostCallAnalyzer CA(*Callee, Call, Params, CalleeTTI,
3065 GetAssumptionCache, GetBFI, PSI, ORE);
3066 InlineResult ShouldInline = CA.analyze();
3067
3068 LLVM_DEBUG(CA.dump());
3069
3070 // Always make the cost-benefit-based decision explicit.
3071 // We use always/never here since the threshold is not meaningful,
3072 // as it is not what drives the cost-benefit analysis.
3073 if (CA.wasDecidedByCostBenefit()) {
3074 if (ShouldInline.isSuccess())
3075 return InlineCost::getAlways("benefit over cost",
3076 CA.getCostBenefitPair());
3077 else
3078 return InlineCost::getNever("cost over benefit", CA.getCostBenefitPair());
3079 }
3080
3081 if (CA.wasDecidedByCostThreshold())
3082 return InlineCost::get(CA.getCost(), CA.getThreshold(),
3083 CA.getStaticBonusApplied());
3084
3085 // No details on how the decision was made; simply return always or never.
3086 return ShouldInline.isSuccess()
3087 ? InlineCost::getAlways("empty function")
3088 : InlineCost::getNever(ShouldInline.getFailureReason());
3089}
3090
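A hedged sketch of how the three result shapes produced above (always, never, and cost/threshold) are typically consumed, assuming Call, Params, CalleeTTI, GetAssumptionCache, GetTLI, GetBFI, PSI, and ORE are in scope as in this file:

  // Sketch only.
  InlineCost IC = getInlineCost(Call, Params, CalleeTTI, GetAssumptionCache,
                                GetTLI, GetBFI, PSI, ORE);
  if (IC.isAlways()) {
    // Forced by attributes or decided by the cost-benefit analysis.
  } else if (IC.isNever()) {
    // Forced against inlining.
  } else if (IC.getCost() < IC.getThreshold()) {
    // Threshold-based decision: inlining looks profitable.
  }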
3091 InlineResult llvm::isInlineViable(Function &F) {
3092 bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
3093 for (BasicBlock &BB : F) {
3094 // Disallow inlining of functions which contain indirect branches.
3095 if (isa<IndirectBrInst>(BB.getTerminator()))
3096 return InlineResult::failure("contains indirect branches");
3097
3098 // Disallow inlining of blockaddresses which are used by non-callbr
3099 // instructions.
3100 if (BB.hasAddressTaken())
3101 for (User *U : BlockAddress::get(&BB)->users())
3102 if (!isa<CallBrInst>(*U))
3103 return InlineResult::failure("blockaddress used outside of callbr");
3104
3105 for (auto &II : BB) {
3106 CallBase *Call = dyn_cast<CallBase>(&II);
3107 if (!Call)
3108 continue;
3109
3110 // Disallow recursive calls.
3111 Function *Callee = Call->getCalledFunction();
3112 if (&F == Callee)
3113 return InlineResult::failure("recursive call");
3114
3115 // Disallow calls which expose returns-twice to a function not previously
3116 // attributed as such.
3117 if (!ReturnsTwice && isa<CallInst>(Call) &&
3118 cast<CallInst>(Call)->canReturnTwice())
3119 return InlineResult::failure("exposes returns-twice attribute");
3120
3121 if (Callee)
3122 switch (Callee->getIntrinsicID()) {
3123 default:
3124 break;
3125 case llvm::Intrinsic::icall_branch_funnel:
3126 // Disallow inlining of @llvm.icall.branch.funnel because the current
3127 // backend can't separate call targets from call arguments.
3128 return InlineResult::failure(
3129 "disallowed inlining of @llvm.icall.branch.funnel");
3130 case llvm::Intrinsic::localescape:
3131 // Disallow inlining functions that call @llvm.localescape. Doing this
3132 // correctly would require major changes to the inliner.
3133 return InlineResult::failure(
3134 "disallowed inlining of @llvm.localescape");
3135 case llvm::Intrinsic::vastart:
3136 // Disallow inlining of functions that initialize VarArgs with
3137 // va_start.
3138 return InlineResult::failure(
3139 "contains VarArgs initialized with va_start");
3140 }
3141 }
3142 }
3143
3144 return InlineResult::success();
3145}
3146
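isInlineViable is a cheap structural filter, independent of any cost model. A minimal sketch of using it as a pre-filter, assuming a CallBase &Call is in scope:

  // Sketch only: reject obviously non-inlinable callees (indirectbr,
  // va_start, @llvm.localescape, ...) before spending time on cost analysis.
  if (Function *Callee = Call.getCalledFunction()) {
    InlineResult Viable = isInlineViable(*Callee);
    if (!Viable.isSuccess())
      LLVM_DEBUG(dbgs() << "not viable: " << Viable.getFailureReason() << "\n");
  }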
3147// APIs to create InlineParams based on command line flags and/or other
3148// parameters.
3149
3150 InlineParams llvm::getInlineParams(int Threshold) {
3151 InlineParams Params;
3152
3153 // This field is the threshold to use for a callee by default. This is
3154 // derived from one or more of:
3155 // * optimization or size-optimization levels,
3156 // * a value passed to createFunctionInliningPass function, or
3157 // * the -inline-threshold flag.
3158 // If the -inline-threshold flag is explicitly specified, that is used
3159 // irrespective of anything else.
3160 if (InlineThreshold.getNumOccurrences() > 0)
3161 Params.DefaultThreshold = InlineThreshold;
3162 else
3163 Params.DefaultThreshold = Threshold;
3164
3165 // Set the HintThreshold knob from the -inlinehint-threshold.
3166 Params.HintThreshold = HintThreshold;
3167
3168 // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
3169 Params.HotCallSiteThreshold = HotCallSiteThreshold;
3170
3171 // If the -locally-hot-callsite-threshold is explicitly specified, use it to
3172 // populate LocallyHotCallSiteThreshold. Later, we populate
3173 // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
3174 // we know that optimization level is O3 (in the getInlineParams variant that
3175 // takes the opt and size levels).
3176 // FIXME: Remove this check (and make the assignment unconditional) after
3177 // addressing size regression issues at O2.
3178 if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
3179 Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
3180
3181 // Set the ColdCallSiteThreshold knob from the
3182 // -inline-cold-callsite-threshold.
3183 Params.ColdCallSiteThreshold = ColdCallSiteThreshold;
3184
3185 // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
3186 // -inline-threshold command-line option is not explicitly given. If that
3187 // option is present, then its value applies even for callees with size and
3188 // minsize attributes.
3189 // If -inline-threshold is not specified, set the ColdThreshold from the
3190 // -inlinecold-threshold even if it is not explicitly passed. If
3191 // -inline-threshold is specified, then -inlinecold-threshold needs to be
3192 // explicitly specified to set the ColdThreshold knob.
3193 if (InlineThreshold.getNumOccurrences() == 0) {
3194 Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
3195 Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
3196 Params.ColdThreshold = ColdThreshold;
3197 } else if (ColdThreshold.getNumOccurrences() > 0) {
3198 Params.ColdThreshold = ColdThreshold;
3199 }
3200 return Params;
3201}
3202
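A hedged illustration of the precedence implemented above: an explicit -inline-threshold on the command line wins over the Threshold argument; otherwise the argument becomes the default threshold.

  // Sketch only.
  InlineParams P = getInlineParams(/*Threshold=*/500);
  // With -inline-threshold=100 on the command line: P.DefaultThreshold == 100
  // Without that flag:                               P.DefaultThreshold == 500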
3203 InlineParams llvm::getInlineParams() {
3204 return getInlineParams(DefaultThreshold);
3205}
3206
3207// Compute the default threshold for inlining based on the opt level and the
3208// size opt level.
3209static int computeThresholdFromOptLevels(unsigned OptLevel,
3210 unsigned SizeOptLevel) {
3211 if (OptLevel > 2)
3212 return InlineConstants::OptAggressiveThreshold;
3213 if (SizeOptLevel == 1) // -Os
3214 return InlineConstants::OptSizeThreshold;
3215 if (SizeOptLevel == 2) // -Oz
3216 return InlineConstants::OptMinSizeThreshold;
3217 return DefaultThreshold;
3218}
3219
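For orientation, a hedged reading of the mapping above (the checks are ordered, so OptLevel > 2 wins regardless of the size level; 225 is the current default of -inlinedefault-threshold):

  computeThresholdFromOptLevels(3, 0) -> InlineConstants::OptAggressiveThreshold  // -O3
  computeThresholdFromOptLevels(2, 1) -> InlineConstants::OptSizeThreshold        // -Os
  computeThresholdFromOptLevels(2, 2) -> InlineConstants::OptMinSizeThreshold     // -Oz
  computeThresholdFromOptLevels(2, 0) -> DefaultThreshold (225 unless overridden)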
3220InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
3221 auto Params =
3222 getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
3223 // At O3, use the value of -locally-hot-callsite-threshold option to populate
3224 // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only
3225 // when it is specified explicitly.
3226 if (OptLevel > 2)
3227 Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
3228 return Params;
3229}
3230
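A hedged sketch of building parameters from optimization levels; only above -O2 is the locally hot call-site threshold populated unconditionally, matching the comment above:

  // Sketch only.
  InlineParams O3Params = getInlineParams(/*OptLevel=*/3, /*SizeOptLevel=*/0);
  // O3Params.LocallyHotCallSiteThreshold is taken from
  // -locally-hot-callsite-threshold; below -O3 it is set only when that flag
  // is passed explicitly.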
3231 PreservedAnalyses
3232 InlineCostAnnotationPrinterPass::run(Function &F,
3233 FunctionAnalysisManager &FAM) {
3234 PrintInstructionComments = true;
3235 std::function<AssumptionCache &(Function &)> GetAssumptionCache =
3236 [&](Function &F) -> AssumptionCache & {
3237 return FAM.getResult<AssumptionAnalysis>(F);
3238 };
3239 Module *M = F.getParent();
3240 ProfileSummaryInfo PSI(*M);
3241 DataLayout DL(M);
3242 TargetTransformInfo TTI(DL);
3243 // FIXME: Redesign the usage of InlineParams to expand the scope of this pass.
3244 // In the current implementation, the type of InlineParams doesn't matter as
3245 // the pass serves only for verification of the inliner's decisions.
3246 // We can add a flag that determines the InlineParams for this run. Right
3247 // now, the default InlineParams are used.
3248 const InlineParams Params = llvm::getInlineParams();
3249 for (BasicBlock &BB : F) {
3250 for (Instruction &I : BB) {
3251 if (CallInst *CI = dyn_cast<CallInst>(&I)) {
3252 Function *CalledFunction = CI->getCalledFunction();
3253 if (!CalledFunction || CalledFunction->isDeclaration())
3254 continue;
3255 OptimizationRemarkEmitter ORE(CalledFunction);
3256 InlineCostCallAnalyzer ICCA(*CalledFunction, *CI, Params, TTI,
3257 GetAssumptionCache, nullptr, &PSI, &ORE);
3258 ICCA.analyze();
3259 OS << " Analyzing call of " << CalledFunction->getName()
3260 << "... (caller:" << CI->getCaller()->getName() << ")\n";
3261 ICCA.print(OS);
3262 OS << "\n";
3263 }
3264 }
3265 }
3266 return PreservedAnalyses::all();
3267}
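A hedged sketch of driving the annotation printer programmatically; it assumes a FunctionAnalysisManager FAM that has already been registered with the standard function analyses (setup not shown). With the new pass manager the same output is normally reached through a printer pass name such as print<inline-cost>, though that registration lives outside this file.

  // Sketch only.
  InlineCostAnnotationPrinterPass Printer(errs());
  PreservedAnalyses PA = Printer.run(F, FAM); // per-call cost analyses go to stderr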