LLVM 17.0.0git
InlineCost.cpp
Go to the documentation of this file.
1//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements inline cost analysis.
10//
11//===----------------------------------------------------------------------===//
12
14#include "llvm/ADT/STLExtras.h"
15#include "llvm/ADT/SetVector.h"
18#include "llvm/ADT/Statistic.h"
31#include "llvm/Config/llvm-config.h"
33#include "llvm/IR/CallingConv.h"
34#include "llvm/IR/DataLayout.h"
35#include "llvm/IR/Dominators.h"
37#include "llvm/IR/GlobalAlias.h"
38#include "llvm/IR/InstVisitor.h"
40#include "llvm/IR/Operator.h"
43#include "llvm/Support/Debug.h"
46#include <climits>
47#include <limits>
48#include <optional>
49
50using namespace llvm;
51
52#define DEBUG_TYPE "inline-cost"
53
54STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");
55
56static cl::opt<int>
57 DefaultThreshold("inlinedefault-threshold", cl::Hidden, cl::init(225),
58 cl::desc("Default amount of inlining to perform"));
59
60// We introduce this option since there is a minor compile-time win by avoiding
61// addition of TTI attributes (target-features in particular) to inline
62// candidates when they are guaranteed to be the same as top level methods in
63// some use cases. If we avoid adding the attribute, we need an option to avoid
64// checking these attributes.
66 "ignore-tti-inline-compatible", cl::Hidden, cl::init(false),
67 cl::desc("Ignore TTI attributes compatibility check between callee/caller "
68 "during inline cost calculation"));
69
71 "print-instruction-comments", cl::Hidden, cl::init(false),
72 cl::desc("Prints comments for instruction based on inline cost analysis"));
73
75 "inline-threshold", cl::Hidden, cl::init(225),
76 cl::desc("Control the amount of inlining to perform (default = 225)"));
77
79 "inlinehint-threshold", cl::Hidden, cl::init(325),
80 cl::desc("Threshold for inlining functions with inline hint"));
81
82static cl::opt<int>
83 ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
84 cl::init(45),
85 cl::desc("Threshold for inlining cold callsites"));
86
88 "inline-enable-cost-benefit-analysis", cl::Hidden, cl::init(false),
89 cl::desc("Enable the cost-benefit analysis for the inliner"));
90
92 "inline-savings-multiplier", cl::Hidden, cl::init(8),
93 cl::desc("Multiplier to multiply cycle savings by during inlining"));
94
95static cl::opt<int>
96 InlineSizeAllowance("inline-size-allowance", cl::Hidden, cl::init(100),
97 cl::desc("The maximum size of a callee that get's "
98 "inlined without sufficient cycle savings"));
99
100// We introduce this threshold to help performance of instrumentation based
101// PGO before we actually hook up inliner with analysis passes such as BPI and
102// BFI.
104 "inlinecold-threshold", cl::Hidden, cl::init(45),
105 cl::desc("Threshold for inlining functions with cold attribute"));
106
107static cl::opt<int>
108 HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
109 cl::desc("Threshold for hot callsites "));
110
112 "locally-hot-callsite-threshold", cl::Hidden, cl::init(525),
113 cl::desc("Threshold for locally hot callsites "));
114
116 "cold-callsite-rel-freq", cl::Hidden, cl::init(2),
117 cl::desc("Maximum block frequency, expressed as a percentage of caller's "
118 "entry frequency, for a callsite to be cold in the absence of "
119 "profile information."));
120
122 "hot-callsite-rel-freq", cl::Hidden, cl::init(60),
123 cl::desc("Minimum block frequency, expressed as a multiple of caller's "
124 "entry frequency, for a callsite to be hot in the absence of "
125 "profile information."));
126
127static cl::opt<int>
128 InstrCost("inline-instr-cost", cl::Hidden, cl::init(5),
129 cl::desc("Cost of a single instruction when inlining"));
130
131static cl::opt<int>
132 MemAccessCost("inline-memaccess-cost", cl::Hidden, cl::init(0),
133 cl::desc("Cost of load/store instruction when inlining"));
134
136 "inline-call-penalty", cl::Hidden, cl::init(25),
137 cl::desc("Call penalty that is applied per callsite when inlining"));
138
139static cl::opt<size_t>
140 StackSizeThreshold("inline-max-stacksize", cl::Hidden,
141 cl::init(std::numeric_limits<size_t>::max()),
142 cl::desc("Do not inline functions with a stack size "
143 "that exceeds the specified limit"));
144
146 "recursive-inline-max-stacksize", cl::Hidden,
148 cl::desc("Do not inline recursive functions with a stack "
149 "size that exceeds the specified limit"));
150
152 "inline-cost-full", cl::Hidden,
153 cl::desc("Compute the full inline cost of a call site even when the cost "
154 "exceeds the threshold."));
155
157 "inline-caller-superset-nobuiltin", cl::Hidden, cl::init(true),
158 cl::desc("Allow inlining when caller has a superset of callee's nobuiltin "
159 "attributes."));
160
162 "disable-gep-const-evaluation", cl::Hidden, cl::init(false),
163 cl::desc("Disables evaluation of GetElementPtr with constant operands"));
164
165namespace llvm {
166std::optional<int> getStringFnAttrAsInt(const Attribute &Attr) {
167 if (Attr.isValid()) {
168 int AttrValue = 0;
169 if (!Attr.getValueAsString().getAsInteger(10, AttrValue))
170 return AttrValue;
171 }
172 return std::nullopt;
173}
174
175std::optional<int> getStringFnAttrAsInt(CallBase &CB, StringRef AttrKind) {
176 return getStringFnAttrAsInt(CB.getFnAttr(AttrKind));
177}
178
179std::optional<int> getStringFnAttrAsInt(Function *F, StringRef AttrKind) {
180 return getStringFnAttrAsInt(F->getFnAttribute(AttrKind));
181}
182
183namespace InlineConstants {
184int getInstrCost() { return InstrCost; }
185
186} // namespace InlineConstants
187
188} // namespace llvm
189
190namespace {
191class InlineCostCallAnalyzer;
192
193// This struct is used to store information about inline cost of a
194// particular instruction
195struct InstructionCostDetail {
196 int CostBefore = 0;
197 int CostAfter = 0;
198 int ThresholdBefore = 0;
199 int ThresholdAfter = 0;
200
201 int getThresholdDelta() const { return ThresholdAfter - ThresholdBefore; }
202
203 int getCostDelta() const { return CostAfter - CostBefore; }
204
205 bool hasThresholdChanged() const { return ThresholdAfter != ThresholdBefore; }
206};
207
208class InlineCostAnnotationWriter : public AssemblyAnnotationWriter {
209private:
210 InlineCostCallAnalyzer *const ICCA;
211
212public:
213 InlineCostAnnotationWriter(InlineCostCallAnalyzer *ICCA) : ICCA(ICCA) {}
215 formatted_raw_ostream &OS) override;
216};
217
218/// Carry out call site analysis, in order to evaluate inlinability.
219/// NOTE: the type is currently used as implementation detail of functions such
220/// as llvm::getInlineCost. Note the function_ref constructor parameters - the
221/// expectation is that they come from the outer scope, from the wrapper
222/// functions. If we want to support constructing CallAnalyzer objects where
223/// lambdas are provided inline at construction, or where the object needs to
224/// otherwise survive past the scope of the provided functions, we need to
225/// revisit the argument types.
226class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
228 friend class InstVisitor<CallAnalyzer, bool>;
229
230protected:
231 virtual ~CallAnalyzer() = default;
232 /// The TargetTransformInfo available for this compilation.
234
235 /// Getter for the cache of @llvm.assume intrinsics.
236 function_ref<AssumptionCache &(Function &)> GetAssumptionCache;
237
238 /// Getter for BlockFrequencyInfo
240
241 /// Profile summary information.
243
244 /// The called function.
245 Function &F;
246
247 // Cache the DataLayout since we use it a lot.
248 const DataLayout &DL;
249
250 /// The OptimizationRemarkEmitter available for this compilation.
252
253 /// The candidate callsite being analyzed. Please do not use this to do
254 /// analysis in the caller function; we want the inline cost query to be
255 /// easily cacheable. Instead, use the cover function paramHasAttr.
256 CallBase &CandidateCall;
257
258 /// Extension points for handling callsite features.
259 // Called before a basic block was analyzed.
260 virtual void onBlockStart(const BasicBlock *BB) {}
261
262 /// Called after a basic block was analyzed.
263 virtual void onBlockAnalyzed(const BasicBlock *BB) {}
264
265 /// Called before an instruction was analyzed
266 virtual void onInstructionAnalysisStart(const Instruction *I) {}
267
268 /// Called after an instruction was analyzed
269 virtual void onInstructionAnalysisFinish(const Instruction *I) {}
270
271 /// Called at the end of the analysis of the callsite. Return the outcome of
272 /// the analysis, i.e. 'InlineResult(true)' if the inlining may happen, or
273 /// the reason it can't.
274 virtual InlineResult finalizeAnalysis() { return InlineResult::success(); }
275 /// Called when we're about to start processing a basic block, and every time
276 /// we are done processing an instruction. Return true if there is no point in
277 /// continuing the analysis (e.g. we've determined already the call site is
278 /// too expensive to inline)
279 virtual bool shouldStop() { return false; }
280
281 /// Called before the analysis of the callee body starts (with callsite
282 /// contexts propagated). It checks callsite-specific information. Return a
283 /// reason analysis can't continue if that's the case, or 'true' if it may
284 /// continue.
285 virtual InlineResult onAnalysisStart() { return InlineResult::success(); }
286 /// Called if the analysis engine decides SROA cannot be done for the given
287 /// alloca.
288 virtual void onDisableSROA(AllocaInst *Arg) {}
289
290 /// Called the analysis engine determines load elimination won't happen.
291 virtual void onDisableLoadElimination() {}
292
293 /// Called when we visit a CallBase, before the analysis starts. Return false
294 /// to stop further processing of the instruction.
295 virtual bool onCallBaseVisitStart(CallBase &Call) { return true; }
296
297 /// Called to account for a call.
298 virtual void onCallPenalty() {}
299
300 /// Called to account for a load or store.
301 virtual void onMemAccess(){};
302
303 /// Called to account for the expectation the inlining would result in a load
304 /// elimination.
305 virtual void onLoadEliminationOpportunity() {}
306
307 /// Called to account for the cost of argument setup for the Call in the
308 /// callee's body (not the callsite currently under analysis).
309 virtual void onCallArgumentSetup(const CallBase &Call) {}
310
311 /// Called to account for a load relative intrinsic.
312 virtual void onLoadRelativeIntrinsic() {}
313
314 /// Called to account for a lowered call.
315 virtual void onLoweredCall(Function *F, CallBase &Call, bool IsIndirectCall) {
316 }
317
318 /// Account for a jump table of given size. Return false to stop further
319 /// processing the switch instruction
320 virtual bool onJumpTable(unsigned JumpTableSize) { return true; }
321
322 /// Account for a case cluster of given size. Return false to stop further
323 /// processing of the instruction.
324 virtual bool onCaseCluster(unsigned NumCaseCluster) { return true; }
325
326 /// Called at the end of processing a switch instruction, with the given
327 /// number of case clusters.
328 virtual void onFinalizeSwitch(unsigned JumpTableSize,
329 unsigned NumCaseCluster) {}
330
331 /// Called to account for any other instruction not specifically accounted
332 /// for.
333 virtual void onMissedSimplification() {}
334
335 /// Start accounting potential benefits due to SROA for the given alloca.
336 virtual void onInitializeSROAArg(AllocaInst *Arg) {}
337
338 /// Account SROA savings for the AllocaInst value.
339 virtual void onAggregateSROAUse(AllocaInst *V) {}
340
341 bool handleSROA(Value *V, bool DoNotDisable) {
342 // Check for SROA candidates in comparisons.
343 if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
344 if (DoNotDisable) {
345 onAggregateSROAUse(SROAArg);
346 return true;
347 }
348 disableSROAForArg(SROAArg);
349 }
350 return false;
351 }
352
353 bool IsCallerRecursive = false;
354 bool IsRecursiveCall = false;
355 bool ExposesReturnsTwice = false;
356 bool HasDynamicAlloca = false;
357 bool ContainsNoDuplicateCall = false;
358 bool HasReturn = false;
359 bool HasIndirectBr = false;
360 bool HasUninlineableIntrinsic = false;
361 bool InitsVargArgs = false;
362
363 /// Number of bytes allocated statically by the callee.
364 uint64_t AllocatedSize = 0;
365 unsigned NumInstructions = 0;
366 unsigned NumVectorInstructions = 0;
367
368 /// While we walk the potentially-inlined instructions, we build up and
369 /// maintain a mapping of simplified values specific to this callsite. The
370 /// idea is to propagate any special information we have about arguments to
371 /// this call through the inlinable section of the function, and account for
372 /// likely simplifications post-inlining. The most important aspect we track
373 /// is CFG altering simplifications -- when we prove a basic block dead, that
374 /// can cause dramatic shifts in the cost of inlining a function.
375 DenseMap<Value *, Constant *> SimplifiedValues;
376
377 /// Keep track of the values which map back (through function arguments) to
378 /// allocas on the caller stack which could be simplified through SROA.
380
381 /// Keep track of Allocas for which we believe we may get SROA optimization.
382 DenseSet<AllocaInst *> EnabledSROAAllocas;
383
384 /// Keep track of values which map to a pointer base and constant offset.
386
387 /// Keep track of dead blocks due to the constant arguments.
389
390 /// The mapping of the blocks to their known unique successors due to the
391 /// constant arguments.
393
394 /// Model the elimination of repeated loads that is expected to happen
395 /// whenever we simplify away the stores that would otherwise cause them to be
396 /// loads.
397 bool EnableLoadElimination = true;
398
399 /// Whether we allow inlining for recursive call.
400 bool AllowRecursiveCall = false;
401
402 SmallPtrSet<Value *, 16> LoadAddrSet;
403
404 AllocaInst *getSROAArgForValueOrNull(Value *V) const {
405 auto It = SROAArgValues.find(V);
406 if (It == SROAArgValues.end() || EnabledSROAAllocas.count(It->second) == 0)
407 return nullptr;
408 return It->second;
409 }
410
411 // Custom simplification helper routines.
412 bool isAllocaDerivedArg(Value *V);
413 void disableSROAForArg(AllocaInst *SROAArg);
414 void disableSROA(Value *V);
415 void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
416 void disableLoadElimination();
417 bool isGEPFree(GetElementPtrInst &GEP);
418 bool canFoldInboundsGEP(GetElementPtrInst &I);
419 bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
420 bool simplifyCallSite(Function *F, CallBase &Call);
422 bool simplifyIntrinsicCallIsConstant(CallBase &CB);
423 bool simplifyIntrinsicCallObjectSize(CallBase &CB);
424 ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
425
426 /// Return true if the given argument to the function being considered for
427 /// inlining has the given attribute set either at the call site or the
428 /// function declaration. Primarily used to inspect call site specific
429 /// attributes since these can be more precise than the ones on the callee
430 /// itself.
431 bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);
432
433 /// Return true if the given value is known non null within the callee if
434 /// inlined through this particular callsite.
435 bool isKnownNonNullInCallee(Value *V);
436
437 /// Return true if size growth is allowed when inlining the callee at \p Call.
438 bool allowSizeGrowth(CallBase &Call);
439
440 // Custom analysis routines.
441 InlineResult analyzeBlock(BasicBlock *BB,
443
444 // Disable several entry points to the visitor so we don't accidentally use
445 // them by declaring but not defining them here.
446 void visit(Module *);
447 void visit(Module &);
448 void visit(Function *);
449 void visit(Function &);
450 void visit(BasicBlock *);
451 void visit(BasicBlock &);
452
453 // Provide base case for our instruction visit.
455
456 // Our visit overrides.
457 bool visitAlloca(AllocaInst &I);
458 bool visitPHI(PHINode &I);
459 bool visitGetElementPtr(GetElementPtrInst &I);
460 bool visitBitCast(BitCastInst &I);
461 bool visitPtrToInt(PtrToIntInst &I);
462 bool visitIntToPtr(IntToPtrInst &I);
463 bool visitCastInst(CastInst &I);
464 bool visitCmpInst(CmpInst &I);
465 bool visitSub(BinaryOperator &I);
467 bool visitFNeg(UnaryOperator &I);
468 bool visitLoad(LoadInst &I);
469 bool visitStore(StoreInst &I);
470 bool visitExtractValue(ExtractValueInst &I);
471 bool visitInsertValue(InsertValueInst &I);
472 bool visitCallBase(CallBase &Call);
473 bool visitReturnInst(ReturnInst &RI);
474 bool visitBranchInst(BranchInst &BI);
475 bool visitSelectInst(SelectInst &SI);
476 bool visitSwitchInst(SwitchInst &SI);
478 bool visitResumeInst(ResumeInst &RI);
482
483public:
484 CallAnalyzer(Function &Callee, CallBase &Call, const TargetTransformInfo &TTI,
485 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
486 function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
487 ProfileSummaryInfo *PSI = nullptr,
488 OptimizationRemarkEmitter *ORE = nullptr)
489 : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
490 PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
491 CandidateCall(Call) {}
492
493 InlineResult analyze();
494
495 std::optional<Constant *> getSimplifiedValue(Instruction *I) {
496 if (SimplifiedValues.contains(I))
497 return SimplifiedValues[I];
498 return std::nullopt;
499 }
500
501 // Keep a bunch of stats about the cost savings found so we can print them
502 // out when debugging.
503 unsigned NumConstantArgs = 0;
504 unsigned NumConstantOffsetPtrArgs = 0;
505 unsigned NumAllocaArgs = 0;
506 unsigned NumConstantPtrCmps = 0;
507 unsigned NumConstantPtrDiffs = 0;
508 unsigned NumInstructionsSimplified = 0;
509
510 void dump();
511};
512
513// Considering forming a binary search, we should find the number of nodes
514// which is same as the number of comparisons when lowered. For a given
515// number of clusters, n, we can define a recursive function, f(n), to find
516// the number of nodes in the tree. The recursion is :
517// f(n) = 1 + f(n/2) + f (n - n/2), when n > 3,
518// and f(n) = n, when n <= 3.
519// This will lead a binary tree where the leaf should be either f(2) or f(3)
520// when n > 3. So, the number of comparisons from leaves should be n, while
521// the number of non-leaf should be :
522// 2^(log2(n) - 1) - 1
523// = 2^log2(n) * 2^-1 - 1
524// = n / 2 - 1.
525// Considering comparisons from leaf and non-leaf nodes, we can estimate the
526// number of comparisons in a simple closed form :
527// n + n / 2 - 1 = n * 3 / 2 - 1
528int64_t getExpectedNumberOfCompare(int NumCaseCluster) {
529 return 3 * static_cast<int64_t>(NumCaseCluster) / 2 - 1;
530}
531
532/// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note
533/// the FIXME in onLoweredCall, when instantiating an InlineCostCallAnalyzer
534class InlineCostCallAnalyzer final : public CallAnalyzer {
535 const bool ComputeFullInlineCost;
536 int LoadEliminationCost = 0;
537 /// Bonus to be applied when percentage of vector instructions in callee is
538 /// high (see more details in updateThreshold).
539 int VectorBonus = 0;
540 /// Bonus to be applied when the callee has only one reachable basic block.
541 int SingleBBBonus = 0;
542
543 /// Tunable parameters that control the analysis.
544 const InlineParams &Params;
545
546 // This DenseMap stores the delta change in cost and threshold after
547 // accounting for the given instruction. The map is filled only with the
548 // flag PrintInstructionComments on.
550
551 /// Upper bound for the inlining cost. Bonuses are being applied to account
552 /// for speculative "expected profit" of the inlining decision.
553 int Threshold = 0;
554
555 /// The amount of StaticBonus applied.
556 int StaticBonusApplied = 0;
557
558 /// Attempt to evaluate indirect calls to boost its inline cost.
559 const bool BoostIndirectCalls;
560
561 /// Ignore the threshold when finalizing analysis.
562 const bool IgnoreThreshold;
563
564 // True if the cost-benefit-analysis-based inliner is enabled.
565 const bool CostBenefitAnalysisEnabled;
566
567 /// Inlining cost measured in abstract units, accounts for all the
568 /// instructions expected to be executed for a given function invocation.
569 /// Instructions that are statically proven to be dead based on call-site
570 /// arguments are not counted here.
571 int Cost = 0;
572
573 // The cumulative cost at the beginning of the basic block being analyzed. At
574 // the end of analyzing each basic block, "Cost - CostAtBBStart" represents
575 // the size of that basic block.
576 int CostAtBBStart = 0;
577
578 // The static size of live but cold basic blocks. This is "static" in the
579 // sense that it's not weighted by profile counts at all.
580 int ColdSize = 0;
581
582 // Whether inlining is decided by cost-threshold analysis.
583 bool DecidedByCostThreshold = false;
584
585 // Whether inlining is decided by cost-benefit analysis.
586 bool DecidedByCostBenefit = false;
587
588 // The cost-benefit pair computed by cost-benefit analysis.
589 std::optional<CostBenefitPair> CostBenefit;
590
591 bool SingleBB = true;
592
593 unsigned SROACostSavings = 0;
594 unsigned SROACostSavingsLost = 0;
595
596 /// The mapping of caller Alloca values to their accumulated cost savings. If
597 /// we have to disable SROA for one of the allocas, this tells us how much
598 /// cost must be added.
599 DenseMap<AllocaInst *, int> SROAArgCosts;
600
601 /// Return true if \p Call is a cold callsite.
602 bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);
603
604 /// Update Threshold based on callsite properties such as callee
605 /// attributes and callee hotness for PGO builds. The Callee is explicitly
606 /// passed to support analyzing indirect calls whose target is inferred by
607 /// analysis.
608 void updateThreshold(CallBase &Call, Function &Callee);
609 /// Return a higher threshold if \p Call is a hot callsite.
610 std::optional<int> getHotCallSiteThreshold(CallBase &Call,
611 BlockFrequencyInfo *CallerBFI);
612
613 /// Handle a capped 'int' increment for Cost.
614 void addCost(int64_t Inc) {
615 Inc = std::max<int64_t>(std::min<int64_t>(INT_MAX, Inc), INT_MIN);
616 Cost = std::max<int64_t>(std::min<int64_t>(INT_MAX, Inc + Cost), INT_MIN);
617 }
618
619 void onDisableSROA(AllocaInst *Arg) override {
620 auto CostIt = SROAArgCosts.find(Arg);
621 if (CostIt == SROAArgCosts.end())
622 return;
623 addCost(CostIt->second);
624 SROACostSavings -= CostIt->second;
625 SROACostSavingsLost += CostIt->second;
626 SROAArgCosts.erase(CostIt);
627 }
628
629 void onDisableLoadElimination() override {
630 addCost(LoadEliminationCost);
631 LoadEliminationCost = 0;
632 }
633
634 bool onCallBaseVisitStart(CallBase &Call) override {
635 if (std::optional<int> AttrCallThresholdBonus =
636 getStringFnAttrAsInt(Call, "call-threshold-bonus"))
637 Threshold += *AttrCallThresholdBonus;
638
639 if (std::optional<int> AttrCallCost =
640 getStringFnAttrAsInt(Call, "call-inline-cost")) {
641 addCost(*AttrCallCost);
642 // Prevent further processing of the call since we want to override its
643 // inline cost, not just add to it.
644 return false;
645 }
646 return true;
647 }
648
649 void onCallPenalty() override { addCost(CallPenalty); }
650
651 void onMemAccess() override { addCost(MemAccessCost); }
652
653 void onCallArgumentSetup(const CallBase &Call) override {
654 // Pay the price of the argument setup. We account for the average 1
655 // instruction per call argument setup here.
656 addCost(Call.arg_size() * InstrCost);
657 }
658 void onLoadRelativeIntrinsic() override {
659 // This is normally lowered to 4 LLVM instructions.
660 addCost(3 * InstrCost);
661 }
662 void onLoweredCall(Function *F, CallBase &Call,
663 bool IsIndirectCall) override {
664 // We account for the average 1 instruction per call argument setup here.
665 addCost(Call.arg_size() * InstrCost);
666
667 // If we have a constant that we are calling as a function, we can peer
668 // through it and see the function target. This happens not infrequently
669 // during devirtualization and so we want to give it a hefty bonus for
670 // inlining, but cap that bonus in the event that inlining wouldn't pan out.
671 // Pretend to inline the function, with a custom threshold.
672 if (IsIndirectCall && BoostIndirectCalls) {
673 auto IndirectCallParams = Params;
674 IndirectCallParams.DefaultThreshold =
676 /// FIXME: if InlineCostCallAnalyzer is derived from, this may need
677 /// to instantiate the derived class.
678 InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
679 GetAssumptionCache, GetBFI, PSI, ORE, false);
680 if (CA.analyze().isSuccess()) {
681 // We were able to inline the indirect call! Subtract the cost from the
682 // threshold to get the bonus we want to apply, but don't go below zero.
683 Cost -= std::max(0, CA.getThreshold() - CA.getCost());
684 }
685 } else
686 // Otherwise simply add the cost for merely making the call.
687 addCost(CallPenalty);
688 }
689
690 void onFinalizeSwitch(unsigned JumpTableSize,
691 unsigned NumCaseCluster) override {
692 // If suitable for a jump table, consider the cost for the table size and
693 // branch to destination.
694 // Maximum valid cost increased in this function.
695 if (JumpTableSize) {
696 int64_t JTCost =
697 static_cast<int64_t>(JumpTableSize) * InstrCost + 4 * InstrCost;
698
699 addCost(JTCost);
700 return;
701 }
702
703 if (NumCaseCluster <= 3) {
704 // Suppose a comparison includes one compare and one conditional branch.
705 addCost(NumCaseCluster * 2 * InstrCost);
706 return;
707 }
708
709 int64_t ExpectedNumberOfCompare =
710 getExpectedNumberOfCompare(NumCaseCluster);
711 int64_t SwitchCost = ExpectedNumberOfCompare * 2 * InstrCost;
712
713 addCost(SwitchCost);
714 }
715 void onMissedSimplification() override { addCost(InstrCost); }
716
717 void onInitializeSROAArg(AllocaInst *Arg) override {
718 assert(Arg != nullptr &&
719 "Should not initialize SROA costs for null value.");
720 SROAArgCosts[Arg] = 0;
721 }
722
723 void onAggregateSROAUse(AllocaInst *SROAArg) override {
724 auto CostIt = SROAArgCosts.find(SROAArg);
725 assert(CostIt != SROAArgCosts.end() &&
726 "expected this argument to have a cost");
727 CostIt->second += InstrCost;
728 SROACostSavings += InstrCost;
729 }
730
731 void onBlockStart(const BasicBlock *BB) override { CostAtBBStart = Cost; }
732
733 void onBlockAnalyzed(const BasicBlock *BB) override {
734 if (CostBenefitAnalysisEnabled) {
735 // Keep track of the static size of live but cold basic blocks. For now,
736 // we define a cold basic block to be one that's never executed.
737 assert(GetBFI && "GetBFI must be available");
738 BlockFrequencyInfo *BFI = &(GetBFI(F));
739 assert(BFI && "BFI must be available");
740 auto ProfileCount = BFI->getBlockProfileCount(BB);
741 if (*ProfileCount == 0)
742 ColdSize += Cost - CostAtBBStart;
743 }
744
745 auto *TI = BB->getTerminator();
746 // If we had any successors at this point, than post-inlining is likely to
747 // have them as well. Note that we assume any basic blocks which existed
748 // due to branches or switches which folded above will also fold after
749 // inlining.
750 if (SingleBB && TI->getNumSuccessors() > 1) {
751 // Take off the bonus we applied to the threshold.
752 Threshold -= SingleBBBonus;
753 SingleBB = false;
754 }
755 }
756
757 void onInstructionAnalysisStart(const Instruction *I) override {
758 // This function is called to store the initial cost of inlining before
759 // the given instruction was assessed.
761 return;
762 InstructionCostDetailMap[I].CostBefore = Cost;
763 InstructionCostDetailMap[I].ThresholdBefore = Threshold;
764 }
765
766 void onInstructionAnalysisFinish(const Instruction *I) override {
767 // This function is called to find new values of cost and threshold after
768 // the instruction has been assessed.
770 return;
771 InstructionCostDetailMap[I].CostAfter = Cost;
772 InstructionCostDetailMap[I].ThresholdAfter = Threshold;
773 }
774
775 bool isCostBenefitAnalysisEnabled() {
776 if (!PSI || !PSI->hasProfileSummary())
777 return false;
778
779 if (!GetBFI)
780 return false;
781
782 if (InlineEnableCostBenefitAnalysis.getNumOccurrences()) {
783 // Honor the explicit request from the user.
785 return false;
786 } else {
787 // Otherwise, require instrumentation profile.
788 if (!PSI->hasInstrumentationProfile())
789 return false;
790 }
791
792 auto *Caller = CandidateCall.getParent()->getParent();
793 if (!Caller->getEntryCount())
794 return false;
795
796 BlockFrequencyInfo *CallerBFI = &(GetBFI(*Caller));
797 if (!CallerBFI)
798 return false;
799
800 // For now, limit to hot call site.
801 if (!PSI->isHotCallSite(CandidateCall, CallerBFI))
802 return false;
803
804 // Make sure we have a nonzero entry count.
805 auto EntryCount = F.getEntryCount();
806 if (!EntryCount || !EntryCount->getCount())
807 return false;
808
809 BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
810 if (!CalleeBFI)
811 return false;
812
813 return true;
814 }
815
816 // Determine whether we should inline the given call site, taking into account
817 // both the size cost and the cycle savings. Return std::nullopt if we don't
818 // have suficient profiling information to determine.
819 std::optional<bool> costBenefitAnalysis() {
820 if (!CostBenefitAnalysisEnabled)
821 return std::nullopt;
822
823 // buildInlinerPipeline in the pass builder sets HotCallSiteThreshold to 0
824 // for the prelink phase of the AutoFDO + ThinLTO build. Honor the logic by
825 // falling back to the cost-based metric.
826 // TODO: Improve this hacky condition.
827 if (Threshold == 0)
828 return std::nullopt;
829
830 assert(GetBFI);
831 BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
832 assert(CalleeBFI);
833
834 // The cycle savings expressed as the sum of InstrCost
835 // multiplied by the estimated dynamic count of each instruction we can
836 // avoid. Savings come from the call site cost, such as argument setup and
837 // the call instruction, as well as the instructions that are folded.
838 //
839 // We use 128-bit APInt here to avoid potential overflow. This variable
840 // should stay well below 10^^24 (or 2^^80) in practice. This "worst" case
841 // assumes that we can avoid or fold a billion instructions, each with a
842 // profile count of 10^^15 -- roughly the number of cycles for a 24-hour
843 // period on a 4GHz machine.
844 APInt CycleSavings(128, 0);
845
846 for (auto &BB : F) {
847 APInt CurrentSavings(128, 0);
848 for (auto &I : BB) {
849 if (BranchInst *BI = dyn_cast<BranchInst>(&I)) {
850 // Count a conditional branch as savings if it becomes unconditional.
851 if (BI->isConditional() &&
852 isa_and_nonnull<ConstantInt>(
853 SimplifiedValues.lookup(BI->getCondition()))) {
854 CurrentSavings += InstrCost;
855 }
856 } else if (Value *V = dyn_cast<Value>(&I)) {
857 // Count an instruction as savings if we can fold it.
858 if (SimplifiedValues.count(V)) {
859 CurrentSavings += InstrCost;
860 }
861 }
862 }
863
864 auto ProfileCount = CalleeBFI->getBlockProfileCount(&BB);
865 CurrentSavings *= *ProfileCount;
866 CycleSavings += CurrentSavings;
867 }
868
869 // Compute the cycle savings per call.
870 auto EntryProfileCount = F.getEntryCount();
871 assert(EntryProfileCount && EntryProfileCount->getCount());
872 auto EntryCount = EntryProfileCount->getCount();
873 CycleSavings += EntryCount / 2;
874 CycleSavings = CycleSavings.udiv(EntryCount);
875
876 // Compute the total savings for the call site.
877 auto *CallerBB = CandidateCall.getParent();
878 BlockFrequencyInfo *CallerBFI = &(GetBFI(*(CallerBB->getParent())));
879 CycleSavings += getCallsiteCost(this->CandidateCall, DL);
880 CycleSavings *= *CallerBFI->getBlockProfileCount(CallerBB);
881
882 // Remove the cost of the cold basic blocks.
883 int Size = Cost - ColdSize;
884
885 // Allow tiny callees to be inlined regardless of whether they meet the
886 // savings threshold.
888
889 CostBenefit.emplace(APInt(128, Size), CycleSavings);
890
891 // Return true if the savings justify the cost of inlining. Specifically,
892 // we evaluate the following inequality:
893 //
894 // CycleSavings PSI->getOrCompHotCountThreshold()
895 // -------------- >= -----------------------------------
896 // Size InlineSavingsMultiplier
897 //
898 // Note that the left hand side is specific to a call site. The right hand
899 // side is a constant for the entire executable.
900 APInt LHS = CycleSavings;
902 APInt RHS(128, PSI->getOrCompHotCountThreshold());
903 RHS *= Size;
904 return LHS.uge(RHS);
905 }
906
907 InlineResult finalizeAnalysis() override {
908 // Loops generally act a lot like calls in that they act like barriers to
909 // movement, require a certain amount of setup, etc. So when optimising for
910 // size, we penalise any call sites that perform loops. We do this after all
911 // other costs here, so will likely only be dealing with relatively small
912 // functions (and hence DT and LI will hopefully be cheap).
913 auto *Caller = CandidateCall.getFunction();
914 if (Caller->hasMinSize()) {
915 DominatorTree DT(F);
916 LoopInfo LI(DT);
917 int NumLoops = 0;
918 for (Loop *L : LI) {
919 // Ignore loops that will not be executed
920 if (DeadBlocks.count(L->getHeader()))
921 continue;
922 NumLoops++;
923 }
924 addCost(NumLoops * InlineConstants::LoopPenalty);
925 }
926
927 // We applied the maximum possible vector bonus at the beginning. Now,
928 // subtract the excess bonus, if any, from the Threshold before
929 // comparing against Cost.
930 if (NumVectorInstructions <= NumInstructions / 10)
931 Threshold -= VectorBonus;
932 else if (NumVectorInstructions <= NumInstructions / 2)
933 Threshold -= VectorBonus / 2;
934
935 if (std::optional<int> AttrCost =
936 getStringFnAttrAsInt(CandidateCall, "function-inline-cost"))
937 Cost = *AttrCost;
938
939 if (std::optional<int> AttrCostMult = getStringFnAttrAsInt(
940 CandidateCall,
942 Cost *= *AttrCostMult;
943
944 if (std::optional<int> AttrThreshold =
945 getStringFnAttrAsInt(CandidateCall, "function-inline-threshold"))
946 Threshold = *AttrThreshold;
947
948 if (auto Result = costBenefitAnalysis()) {
949 DecidedByCostBenefit = true;
950 if (*Result)
951 return InlineResult::success();
952 else
953 return InlineResult::failure("Cost over threshold.");
954 }
955
956 if (IgnoreThreshold)
957 return InlineResult::success();
958
959 DecidedByCostThreshold = true;
960 return Cost < std::max(1, Threshold)
962 : InlineResult::failure("Cost over threshold.");
963 }
964
965 bool shouldStop() override {
966 if (IgnoreThreshold || ComputeFullInlineCost)
967 return false;
968 // Bail out the moment we cross the threshold. This means we'll under-count
969 // the cost, but only when undercounting doesn't matter.
970 if (Cost < Threshold)
971 return false;
972 DecidedByCostThreshold = true;
973 return true;
974 }
975
976 void onLoadEliminationOpportunity() override {
977 LoadEliminationCost += InstrCost;
978 }
979
980 InlineResult onAnalysisStart() override {
981 // Perform some tweaks to the cost and threshold based on the direct
982 // callsite information.
983
984 // We want to more aggressively inline vector-dense kernels, so up the
985 // threshold, and we'll lower it if the % of vector instructions gets too
986 // low. Note that these bonuses are some what arbitrary and evolved over
987 // time by accident as much as because they are principled bonuses.
988 //
989 // FIXME: It would be nice to remove all such bonuses. At least it would be
990 // nice to base the bonus values on something more scientific.
991 assert(NumInstructions == 0);
992 assert(NumVectorInstructions == 0);
993
994 // Update the threshold based on callsite properties
995 updateThreshold(CandidateCall, F);
996
997 // While Threshold depends on commandline options that can take negative
998 // values, we want to enforce the invariant that the computed threshold and
999 // bonuses are non-negative.
1000 assert(Threshold >= 0);
1001 assert(SingleBBBonus >= 0);
1002 assert(VectorBonus >= 0);
1003
1004 // Speculatively apply all possible bonuses to Threshold. If cost exceeds
1005 // this Threshold any time, and cost cannot decrease, we can stop processing
1006 // the rest of the function body.
1007 Threshold += (SingleBBBonus + VectorBonus);
1008
1009 // Give out bonuses for the callsite, as the instructions setting them up
1010 // will be gone after inlining.
1011 addCost(-getCallsiteCost(this->CandidateCall, DL));
1012
1013 // If this function uses the coldcc calling convention, prefer not to inline
1014 // it.
1015 if (F.getCallingConv() == CallingConv::Cold)
1017
1018 LLVM_DEBUG(dbgs() << " Initial cost: " << Cost << "\n");
1019
1020 // Check if we're done. This can happen due to bonuses and penalties.
1021 if (Cost >= Threshold && !ComputeFullInlineCost)
1022 return InlineResult::failure("high cost");
1023
1024 return InlineResult::success();
1025 }
1026
1027public:
1028 InlineCostCallAnalyzer(
1029 Function &Callee, CallBase &Call, const InlineParams &Params,
1030 const TargetTransformInfo &TTI,
1031 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
1032 function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
1033 ProfileSummaryInfo *PSI = nullptr,
1034 OptimizationRemarkEmitter *ORE = nullptr, bool BoostIndirect = true,
1035 bool IgnoreThreshold = false)
1036 : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, PSI, ORE),
1037 ComputeFullInlineCost(OptComputeFullInlineCost ||
1038 Params.ComputeFullInlineCost || ORE ||
1039 isCostBenefitAnalysisEnabled()),
1040 Params(Params), Threshold(Params.DefaultThreshold),
1041 BoostIndirectCalls(BoostIndirect), IgnoreThreshold(IgnoreThreshold),
1042 CostBenefitAnalysisEnabled(isCostBenefitAnalysisEnabled()),
1043 Writer(this) {
1044 AllowRecursiveCall = *Params.AllowRecursiveCall;
1045 }
1046
1047 /// Annotation Writer for instruction details
1048 InlineCostAnnotationWriter Writer;
1049
1050 void dump();
1051
1052 // Prints the same analysis as dump(), but its definition is not dependent
1053 // on the build.
1054 void print(raw_ostream &OS);
1055
1056 std::optional<InstructionCostDetail> getCostDetails(const Instruction *I) {
1057 if (InstructionCostDetailMap.contains(I))
1058 return InstructionCostDetailMap[I];
1059 return std::nullopt;
1060 }
1061
1062 virtual ~InlineCostCallAnalyzer() = default;
1063 int getThreshold() const { return Threshold; }
1064 int getCost() const { return Cost; }
1065 int getStaticBonusApplied() const { return StaticBonusApplied; }
1066 std::optional<CostBenefitPair> getCostBenefitPair() { return CostBenefit; }
1067 bool wasDecidedByCostBenefit() const { return DecidedByCostBenefit; }
1068 bool wasDecidedByCostThreshold() const { return DecidedByCostThreshold; }
1069};
1070
1071// Return true if CB is the sole call to local function Callee.
1072static bool isSoleCallToLocalFunction(const CallBase &CB,
1073 const Function &Callee) {
1074 return Callee.hasLocalLinkage() && Callee.hasOneLiveUse() &&
1075 &Callee == CB.getCalledFunction();
1076}
1077
1078class InlineCostFeaturesAnalyzer final : public CallAnalyzer {
1079private:
1081
1082 // FIXME: These constants are taken from the heuristic-based cost visitor.
1083 // These should be removed entirely in a later revision to avoid reliance on
1084 // heuristics in the ML inliner.
1085 static constexpr int JTCostMultiplier = 4;
1086 static constexpr int CaseClusterCostMultiplier = 2;
1087 static constexpr int SwitchCostMultiplier = 2;
1088
1089 // FIXME: These are taken from the heuristic-based cost visitor: we should
1090 // eventually abstract these to the CallAnalyzer to avoid duplication.
1091 unsigned SROACostSavingOpportunities = 0;
1092 int VectorBonus = 0;
1093 int SingleBBBonus = 0;
1094 int Threshold = 5;
1095
1097
1098 void increment(InlineCostFeatureIndex Feature, int64_t Delta = 1) {
1099 Cost[static_cast<size_t>(Feature)] += Delta;
1100 }
1101
1102 void set(InlineCostFeatureIndex Feature, int64_t Value) {
1103 Cost[static_cast<size_t>(Feature)] = Value;
1104 }
1105
1106 void onDisableSROA(AllocaInst *Arg) override {
1107 auto CostIt = SROACosts.find(Arg);
1108 if (CostIt == SROACosts.end())
1109 return;
1110
1111 increment(InlineCostFeatureIndex::SROALosses, CostIt->second);
1112 SROACostSavingOpportunities -= CostIt->second;
1113 SROACosts.erase(CostIt);
1114 }
1115
1116 void onDisableLoadElimination() override {
1117 set(InlineCostFeatureIndex::LoadElimination, 1);
1118 }
1119
1120 void onCallPenalty() override {
1121 increment(InlineCostFeatureIndex::CallPenalty, CallPenalty);
1122 }
1123
1124 void onCallArgumentSetup(const CallBase &Call) override {
1125 increment(InlineCostFeatureIndex::CallArgumentSetup,
1126 Call.arg_size() * InstrCost);
1127 }
1128
1129 void onLoadRelativeIntrinsic() override {
1130 increment(InlineCostFeatureIndex::LoadRelativeIntrinsic, 3 * InstrCost);
1131 }
1132
1133 void onLoweredCall(Function *F, CallBase &Call,
1134 bool IsIndirectCall) override {
1135 increment(InlineCostFeatureIndex::LoweredCallArgSetup,
1136 Call.arg_size() * InstrCost);
1137
1138 if (IsIndirectCall) {
1139 InlineParams IndirectCallParams = {/* DefaultThreshold*/ 0,
1140 /*HintThreshold*/ {},
1141 /*ColdThreshold*/ {},
1142 /*OptSizeThreshold*/ {},
1143 /*OptMinSizeThreshold*/ {},
1144 /*HotCallSiteThreshold*/ {},
1145 /*LocallyHotCallSiteThreshold*/ {},
1146 /*ColdCallSiteThreshold*/ {},
1147 /*ComputeFullInlineCost*/ true,
1148 /*EnableDeferral*/ true};
1149 IndirectCallParams.DefaultThreshold =
1151
1152 InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
1153 GetAssumptionCache, GetBFI, PSI, ORE, false,
1154 true);
1155 if (CA.analyze().isSuccess()) {
1156 increment(InlineCostFeatureIndex::NestedInlineCostEstimate,
1157 CA.getCost());
1158 increment(InlineCostFeatureIndex::NestedInlines, 1);
1159 }
1160 } else {
1161 onCallPenalty();
1162 }
1163 }
1164
1165 void onFinalizeSwitch(unsigned JumpTableSize,
1166 unsigned NumCaseCluster) override {
1167
1168 if (JumpTableSize) {
1169 int64_t JTCost = static_cast<int64_t>(JumpTableSize) * InstrCost +
1170 JTCostMultiplier * InstrCost;
1171 increment(InlineCostFeatureIndex::JumpTablePenalty, JTCost);
1172 return;
1173 }
1174
1175 if (NumCaseCluster <= 3) {
1176 increment(InlineCostFeatureIndex::CaseClusterPenalty,
1177 NumCaseCluster * CaseClusterCostMultiplier * InstrCost);
1178 return;
1179 }
1180
1181 int64_t ExpectedNumberOfCompare =
1182 getExpectedNumberOfCompare(NumCaseCluster);
1183
1184 int64_t SwitchCost =
1185 ExpectedNumberOfCompare * SwitchCostMultiplier * InstrCost;
1186 increment(InlineCostFeatureIndex::SwitchPenalty, SwitchCost);
1187 }
1188
1189 void onMissedSimplification() override {
1190 increment(InlineCostFeatureIndex::UnsimplifiedCommonInstructions,
1191 InstrCost);
1192 }
1193
1194 void onInitializeSROAArg(AllocaInst *Arg) override { SROACosts[Arg] = 0; }
1195 void onAggregateSROAUse(AllocaInst *Arg) override {
1196 SROACosts.find(Arg)->second += InstrCost;
1197 SROACostSavingOpportunities += InstrCost;
1198 }
1199
1200 void onBlockAnalyzed(const BasicBlock *BB) override {
1201 if (BB->getTerminator()->getNumSuccessors() > 1)
1202 set(InlineCostFeatureIndex::IsMultipleBlocks, 1);
1203 Threshold -= SingleBBBonus;
1204 }
1205
1206 InlineResult finalizeAnalysis() override {
1207 auto *Caller = CandidateCall.getFunction();
1208 if (Caller->hasMinSize()) {
1209 DominatorTree DT(F);
1210 LoopInfo LI(DT);
1211 for (Loop *L : LI) {
1212 // Ignore loops that will not be executed
1213 if (DeadBlocks.count(L->getHeader()))
1214 continue;
1215 increment(InlineCostFeatureIndex::NumLoops,
1217 }
1218 }
1219 set(InlineCostFeatureIndex::DeadBlocks, DeadBlocks.size());
1220 set(InlineCostFeatureIndex::SimplifiedInstructions,
1221 NumInstructionsSimplified);
1222 set(InlineCostFeatureIndex::ConstantArgs, NumConstantArgs);
1223 set(InlineCostFeatureIndex::ConstantOffsetPtrArgs,
1224 NumConstantOffsetPtrArgs);
1225 set(InlineCostFeatureIndex::SROASavings, SROACostSavingOpportunities);
1226
1227 if (NumVectorInstructions <= NumInstructions / 10)
1228 Threshold -= VectorBonus;
1229 else if (NumVectorInstructions <= NumInstructions / 2)
1230 Threshold -= VectorBonus / 2;
1231
1232 set(InlineCostFeatureIndex::Threshold, Threshold);
1233
1234 return InlineResult::success();
1235 }
1236
1237 bool shouldStop() override { return false; }
1238
1239 void onLoadEliminationOpportunity() override {
1240 increment(InlineCostFeatureIndex::LoadElimination, 1);
1241 }
1242
1243 InlineResult onAnalysisStart() override {
1244 increment(InlineCostFeatureIndex::CallSiteCost,
1245 -1 * getCallsiteCost(this->CandidateCall, DL));
1246
1247 set(InlineCostFeatureIndex::ColdCcPenalty,
1248 (F.getCallingConv() == CallingConv::Cold));
1249
1250 set(InlineCostFeatureIndex::LastCallToStaticBonus,
1251 isSoleCallToLocalFunction(CandidateCall, F));
1252
1253 // FIXME: we shouldn't repeat this logic in both the Features and Cost
1254 // analyzer - instead, we should abstract it to a common method in the
1255 // CallAnalyzer
1256 int SingleBBBonusPercent = 50;
1257 int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
1258 Threshold += TTI.adjustInliningThreshold(&CandidateCall);
1259 Threshold *= TTI.getInliningThresholdMultiplier();
1260 SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
1261 VectorBonus = Threshold * VectorBonusPercent / 100;
1262 Threshold += (SingleBBBonus + VectorBonus);
1263
1264 return InlineResult::success();
1265 }
1266
1267public:
1268 InlineCostFeaturesAnalyzer(
1269 const TargetTransformInfo &TTI,
1270 function_ref<AssumptionCache &(Function &)> &GetAssumptionCache,
1273 CallBase &Call)
1274 : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, PSI) {}
1275
1276 const InlineCostFeatures &features() const { return Cost; }
1277};
1278
1279} // namespace
1280
1281/// Test whether the given value is an Alloca-derived function argument.
1282bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
1283 return SROAArgValues.count(V);
1284}
1285
1286void CallAnalyzer::disableSROAForArg(AllocaInst *SROAArg) {
1287 onDisableSROA(SROAArg);
1288 EnabledSROAAllocas.erase(SROAArg);
1289 disableLoadElimination();
1290}
1291
1292void InlineCostAnnotationWriter::emitInstructionAnnot(
1294 // The cost of inlining of the given instruction is printed always.
1295 // The threshold delta is printed only when it is non-zero. It happens
1296 // when we decided to give a bonus at a particular instruction.
1297 std::optional<InstructionCostDetail> Record = ICCA->getCostDetails(I);
1298 if (!Record)
1299 OS << "; No analysis for the instruction";
1300 else {
1301 OS << "; cost before = " << Record->CostBefore
1302 << ", cost after = " << Record->CostAfter
1303 << ", threshold before = " << Record->ThresholdBefore
1304 << ", threshold after = " << Record->ThresholdAfter << ", ";
1305 OS << "cost delta = " << Record->getCostDelta();
1306 if (Record->hasThresholdChanged())
1307 OS << ", threshold delta = " << Record->getThresholdDelta();
1308 }
1309 auto C = ICCA->getSimplifiedValue(const_cast<Instruction *>(I));
1310 if (C) {
1311 OS << ", simplified to ";
1312 (*C)->print(OS, true);
1313 }
1314 OS << "\n";
1315}
1316
1317/// If 'V' maps to a SROA candidate, disable SROA for it.
1318void CallAnalyzer::disableSROA(Value *V) {
1319 if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
1320 disableSROAForArg(SROAArg);
1321 }
1322}
1323
1324void CallAnalyzer::disableLoadElimination() {
1325 if (EnableLoadElimination) {
1326 onDisableLoadElimination();
1327 EnableLoadElimination = false;
1328 }
1329}
1330
1331/// Accumulate a constant GEP offset into an APInt if possible.
1332///
1333/// Returns false if unable to compute the offset for any reason. Respects any
1334/// simplified values known during the analysis of this callsite.
1335bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
1336 unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(GEP.getType());
1337 assert(IntPtrWidth == Offset.getBitWidth());
1338
1340 GTI != GTE; ++GTI) {
1341 ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
1342 if (!OpC)
1343 if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
1344 OpC = dyn_cast<ConstantInt>(SimpleOp);
1345 if (!OpC)
1346 return false;
1347 if (OpC->isZero())
1348 continue;
1349
1350 // Handle a struct index, which adds its field offset to the pointer.
1351 if (StructType *STy = GTI.getStructTypeOrNull()) {
1352 unsigned ElementIdx = OpC->getZExtValue();
1353 const StructLayout *SL = DL.getStructLayout(STy);
1354 Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
1355 continue;
1356 }
1357
1358 APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
1359 Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
1360 }
1361 return true;
1362}
1363
1364/// Use TTI to check whether a GEP is free.
1365///
1366/// Respects any simplified values known during the analysis of this callsite.
1367bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
1369 Operands.push_back(GEP.getOperand(0));
1370 for (const Use &Op : GEP.indices())
1371 if (Constant *SimpleOp = SimplifiedValues.lookup(Op))
1372 Operands.push_back(SimpleOp);
1373 else
1374 Operands.push_back(Op);
1378}
1379
1380bool CallAnalyzer::visitAlloca(AllocaInst &I) {
1381 disableSROA(I.getOperand(0));
1382
1383 // Check whether inlining will turn a dynamic alloca into a static
1384 // alloca and handle that case.
1385 if (I.isArrayAllocation()) {
1386 Constant *Size = SimplifiedValues.lookup(I.getArraySize());
1387 if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
1388 // Sometimes a dynamic alloca could be converted into a static alloca
1389 // after this constant prop, and become a huge static alloca on an
1390 // unconditional CFG path. Avoid inlining if this is going to happen above
1391 // a threshold.
1392 // FIXME: If the threshold is removed or lowered too much, we could end up
1393 // being too pessimistic and prevent inlining non-problematic code. This
1394 // could result in unintended perf regressions. A better overall strategy
1395 // is needed to track stack usage during inlining.
1396 Type *Ty = I.getAllocatedType();
1397 AllocatedSize = SaturatingMultiplyAdd(
1398 AllocSize->getLimitedValue(),
1399 DL.getTypeAllocSize(Ty).getKnownMinValue(), AllocatedSize);
1401 HasDynamicAlloca = true;
1402 return false;
1403 }
1404 }
1405
1406 // Accumulate the allocated size.
1407 if (I.isStaticAlloca()) {
1408 Type *Ty = I.getAllocatedType();
1409 AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty).getKnownMinValue(),
1410 AllocatedSize);
1411 }
1412
1413 // FIXME: This is overly conservative. Dynamic allocas are inefficient for
1414 // a variety of reasons, and so we would like to not inline them into
1415 // functions which don't currently have a dynamic alloca. This simply
1416 // disables inlining altogether in the presence of a dynamic alloca.
1417 if (!I.isStaticAlloca())
1418 HasDynamicAlloca = true;
1419
1420 return false;
1421}
1422
1423bool CallAnalyzer::visitPHI(PHINode &I) {
1424 // FIXME: We need to propagate SROA *disabling* through phi nodes, even
1425 // though we don't want to propagate it's bonuses. The idea is to disable
1426 // SROA if it *might* be used in an inappropriate manner.
1427
1428 // Phi nodes are always zero-cost.
1429 // FIXME: Pointer sizes may differ between different address spaces, so do we
1430 // need to use correct address space in the call to getPointerSizeInBits here?
1431 // Or could we skip the getPointerSizeInBits call completely? As far as I can
1432 // see the ZeroOffset is used as a dummy value, so we can probably use any
1433 // bit width for the ZeroOffset?
1434 APInt ZeroOffset = APInt::getZero(DL.getPointerSizeInBits(0));
1435 bool CheckSROA = I.getType()->isPointerTy();
1436
1437 // Track the constant or pointer with constant offset we've seen so far.
1438 Constant *FirstC = nullptr;
1439 std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
1440 Value *FirstV = nullptr;
1441
1442 for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
1443 BasicBlock *Pred = I.getIncomingBlock(i);
1444 // If the incoming block is dead, skip the incoming block.
1445 if (DeadBlocks.count(Pred))
1446 continue;
1447 // If the parent block of phi is not the known successor of the incoming
1448 // block, skip the incoming block.
1449 BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
1450 if (KnownSuccessor && KnownSuccessor != I.getParent())
1451 continue;
1452
1453 Value *V = I.getIncomingValue(i);
1454 // If the incoming value is this phi itself, skip the incoming value.
1455 if (&I == V)
1456 continue;
1457
1458 Constant *C = dyn_cast<Constant>(V);
1459 if (!C)
1460 C = SimplifiedValues.lookup(V);
1461
1462 std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
1463 if (!C && CheckSROA)
1464 BaseAndOffset = ConstantOffsetPtrs.lookup(V);
1465
1466 if (!C && !BaseAndOffset.first)
1467 // The incoming value is neither a constant nor a pointer with constant
1468 // offset, exit early.
1469 return true;
1470
1471 if (FirstC) {
1472 if (FirstC == C)
1473 // If we've seen a constant incoming value before and it is the same
1474 // constant we see this time, continue checking the next incoming value.
1475 continue;
1476 // Otherwise early exit because we either see a different constant or saw
1477 // a constant before but we have a pointer with constant offset this time.
1478 return true;
1479 }
1480
1481 if (FirstV) {
1482 // The same logic as above, but check pointer with constant offset here.
1483 if (FirstBaseAndOffset == BaseAndOffset)
1484 continue;
1485 return true;
1486 }
1487
1488 if (C) {
1489 // This is the 1st time we've seen a constant, record it.
1490 FirstC = C;
1491 continue;
1492 }
1493
1494 // The remaining case is that this is the 1st time we've seen a pointer with
1495 // constant offset, record it.
1496 FirstV = V;
1497 FirstBaseAndOffset = BaseAndOffset;
1498 }
1499
1500 // Check if we can map phi to a constant.
1501 if (FirstC) {
1502 SimplifiedValues[&I] = FirstC;
1503 return true;
1504 }
1505
1506 // Check if we can map phi to a pointer with constant offset.
1507 if (FirstBaseAndOffset.first) {
1508 ConstantOffsetPtrs[&I] = FirstBaseAndOffset;
1509
1510 if (auto *SROAArg = getSROAArgForValueOrNull(FirstV))
1511 SROAArgValues[&I] = SROAArg;
1512 }
1513
1514 return true;
1515}
1516
1517/// Check we can fold GEPs of constant-offset call site argument pointers.
1518/// This requires target data and inbounds GEPs.
1519///
1520/// \return true if the specified GEP can be folded.
1521bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
1522 // Check if we have a base + offset for the pointer.
1523 std::pair<Value *, APInt> BaseAndOffset =
1524 ConstantOffsetPtrs.lookup(I.getPointerOperand());
1525 if (!BaseAndOffset.first)
1526 return false;
1527
1528 // Check if the offset of this GEP is constant, and if so accumulate it
1529 // into Offset.
1530 if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
1531 return false;
1532
1533 // Add the result as a new mapping to Base + Offset.
1534 ConstantOffsetPtrs[&I] = BaseAndOffset;
1535
1536 return true;
1537}
1538
1539bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
1540 auto *SROAArg = getSROAArgForValueOrNull(I.getPointerOperand());
1541
1542 // Lambda to check whether a GEP's indices are all constant.
1543 auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
1544 for (const Use &Op : GEP.indices())
1545 if (!isa<Constant>(Op) && !SimplifiedValues.lookup(Op))
1546 return false;
1547 return true;
1548 };
1549
1552 return true;
1553
1554 if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
1555 if (SROAArg)
1556 SROAArgValues[&I] = SROAArg;
1557
1558 // Constant GEPs are modeled as free.
1559 return true;
1560 }
1561
1562 // Variable GEPs will require math and will disable SROA.
1563 if (SROAArg)
1564 disableSROAForArg(SROAArg);
1565 return isGEPFree(I);
1566}
1567
1568/// Simplify \p I if its operands are constants and update SimplifiedValues.
1569bool CallAnalyzer::simplifyInstruction(Instruction &I) {
1571 for (Value *Op : I.operands()) {
1572 Constant *COp = dyn_cast<Constant>(Op);
1573 if (!COp)
1574 COp = SimplifiedValues.lookup(Op);
1575 if (!COp)
1576 return false;
1577 COps.push_back(COp);
1578 }
1579 auto *C = ConstantFoldInstOperands(&I, COps, DL);
1580 if (!C)
1581 return false;
1582 SimplifiedValues[&I] = C;
1583 return true;
1584}
1585
1586/// Try to simplify a call to llvm.is.constant.
1587///
1588/// Duplicate the argument checking from CallAnalyzer::simplifyCallSite since
1589/// we expect calls of this specific intrinsic to be infrequent.
1590///
1591/// FIXME: Given that we know CB's parent (F) caller
1592/// (CandidateCall->getParent()->getParent()), we might be able to determine
1593/// whether inlining F into F's caller would change how the call to
1594/// llvm.is.constant would evaluate.
1595bool CallAnalyzer::simplifyIntrinsicCallIsConstant(CallBase &CB) {
1596 Value *Arg = CB.getArgOperand(0);
1597 auto *C = dyn_cast<Constant>(Arg);
1598
1599 if (!C)
1600 C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(Arg));
1601
1602 Type *RT = CB.getFunctionType()->getReturnType();
1603 SimplifiedValues[&CB] = ConstantInt::get(RT, C ? 1 : 0);
1604 return true;
1605}
1606
1607bool CallAnalyzer::simplifyIntrinsicCallObjectSize(CallBase &CB) {
1608 // As per the langref, "The fourth argument to llvm.objectsize determines if
1609 // the value should be evaluated at runtime."
1610 if (cast<ConstantInt>(CB.getArgOperand(3))->isOne())
1611 return false;
1612
1613 Value *V = lowerObjectSizeCall(&cast<IntrinsicInst>(CB), DL, nullptr,
1614 /*MustSucceed=*/true);
1615 Constant *C = dyn_cast_or_null<Constant>(V);
1616 if (C)
1617 SimplifiedValues[&CB] = C;
1618 return C;
1619}
1620
1621bool CallAnalyzer::visitBitCast(BitCastInst &I) {
1622 // Propagate constants through bitcasts.
1624 return true;
1625
1626 // Track base/offsets through casts
1627 std::pair<Value *, APInt> BaseAndOffset =
1628 ConstantOffsetPtrs.lookup(I.getOperand(0));
1629 // Casts don't change the offset, just wrap it up.
1630 if (BaseAndOffset.first)
1631 ConstantOffsetPtrs[&I] = BaseAndOffset;
1632
1633 // Also look for SROA candidates here.
1634 if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
1635 SROAArgValues[&I] = SROAArg;
1636
1637 // Bitcasts are always zero cost.
1638 return true;
1639}
1640
1641bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
1642 // Propagate constants through ptrtoint.
1644 return true;
1645
1646 // Track base/offset pairs when converted to a plain integer provided the
1647 // integer is large enough to represent the pointer.
1648 unsigned IntegerSize = I.getType()->getScalarSizeInBits();
1649 unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace();
1650 if (IntegerSize == DL.getPointerSizeInBits(AS)) {
1651 std::pair<Value *, APInt> BaseAndOffset =
1652 ConstantOffsetPtrs.lookup(I.getOperand(0));
1653 if (BaseAndOffset.first)
1654 ConstantOffsetPtrs[&I] = BaseAndOffset;
1655 }
1656
1657 // This is really weird. Technically, ptrtoint will disable SROA. However,
1658 // unless that ptrtoint is *used* somewhere in the live basic blocks after
1659 // inlining, it will be nuked, and SROA should proceed. All of the uses which
1660 // would block SROA would also block SROA if applied directly to a pointer,
1661 // and so we can just add the integer in here. The only places where SROA is
1662 // preserved either cannot fire on an integer, or won't in-and-of themselves
1663 // disable SROA (ext) w/o some later use that we would see and disable.
1664 if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
1665 SROAArgValues[&I] = SROAArg;
1666
1669}
1670
1671bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
1672 // Propagate constants through ptrtoint.
1674 return true;
1675
1676 // Track base/offset pairs when round-tripped through a pointer without
1677 // modifications provided the integer is not too large.
1678 Value *Op = I.getOperand(0);
1679 unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
1680 if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
1681 std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
1682 if (BaseAndOffset.first)
1683 ConstantOffsetPtrs[&I] = BaseAndOffset;
1684 }
1685
1686 // "Propagate" SROA here in the same manner as we do for ptrtoint above.
1687 if (auto *SROAArg = getSROAArgForValueOrNull(Op))
1688 SROAArgValues[&I] = SROAArg;
1689
1692}
1693
1694bool CallAnalyzer::visitCastInst(CastInst &I) {
1695 // Propagate constants through casts.
1697 return true;
1698
1699 // Disable SROA in the face of arbitrary casts we don't explicitly list
1700 // elsewhere.
1701 disableSROA(I.getOperand(0));
1702
1703 // If this is a floating-point cast, and the target says this operation
1704 // is expensive, this may eventually become a library call. Treat the cost
1705 // as such.
1706 switch (I.getOpcode()) {
1707 case Instruction::FPTrunc:
1708 case Instruction::FPExt:
1709 case Instruction::UIToFP:
1710 case Instruction::SIToFP:
1711 case Instruction::FPToUI:
1712 case Instruction::FPToSI:
1714 onCallPenalty();
1715 break;
1716 default:
1717 break;
1718 }
1719
1722}
1723
1724bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
1725 return CandidateCall.paramHasAttr(A->getArgNo(), Attr);
1726}
1727
1728bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
1729 // Does the *call site* have the NonNull attribute set on an argument? We
1730 // use the attribute on the call site to memoize any analysis done in the
1731 // caller. This will also trip if the callee function has a non-null
1732 // parameter attribute, but that's a less interesting case because hopefully
1733 // the callee would already have been simplified based on that.
1734 if (Argument *A = dyn_cast<Argument>(V))
1735 if (paramHasAttr(A, Attribute::NonNull))
1736 return true;
1737
1738 // Is this an alloca in the caller? This is distinct from the attribute case
1739 // above because attributes aren't updated within the inliner itself and we
1740 // always want to catch the alloca derived case.
1741 if (isAllocaDerivedArg(V))
1742 // We can actually predict the result of comparisons between an
1743 // alloca-derived value and null. Note that this fires regardless of
1744 // SROA firing.
1745 return true;
1746
1747 return false;
1748}
1749
1750bool CallAnalyzer::allowSizeGrowth(CallBase &Call) {
1751 // If the normal destination of the invoke or the parent block of the call
1752 // site is unreachable-terminated, there is little point in inlining this
1753 // unless there is literally zero cost.
1754 // FIXME: Note that it is possible that an unreachable-terminated block has a
1755 // hot entry. For example, in below scenario inlining hot_call_X() may be
1756 // beneficial :
1757 // main() {
1758 // hot_call_1();
1759 // ...
1760 // hot_call_N()
1761 // exit(0);
1762 // }
1763 // For now, we are not handling this corner case here as it is rare in real
1764 // code. In future, we should elaborate this based on BPI and BFI in more
1765 // general threshold adjusting heuristics in updateThreshold().
1766 if (InvokeInst *II = dyn_cast<InvokeInst>(&Call)) {
1767 if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
1768 return false;
1769 } else if (isa<UnreachableInst>(Call.getParent()->getTerminator()))
1770 return false;
1771
1772 return true;
1773}
1774
1775bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call,
1776 BlockFrequencyInfo *CallerBFI) {
1777 // If global profile summary is available, then callsite's coldness is
1778 // determined based on that.
1779 if (PSI && PSI->hasProfileSummary())
1780 return PSI->isColdCallSite(Call, CallerBFI);
1781
1782 // Otherwise we need BFI to be available.
1783 if (!CallerBFI)
1784 return false;
1785
1786 // Determine if the callsite is cold relative to caller's entry. We could
1787 // potentially cache the computation of scaled entry frequency, but the added
1788 // complexity is not worth it unless this scaling shows up high in the
1789 // profiles.
1790 const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
1791 auto CallSiteBB = Call.getParent();
1792 auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
1793 auto CallerEntryFreq =
1794 CallerBFI->getBlockFreq(&(Call.getCaller()->getEntryBlock()));
1795 return CallSiteFreq < CallerEntryFreq * ColdProb;
1796}
1797
1798std::optional<int>
1799InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
1800 BlockFrequencyInfo *CallerBFI) {
1801
1802 // If global profile summary is available, then callsite's hotness is
1803 // determined based on that.
1804 if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(Call, CallerBFI))
1805 return Params.HotCallSiteThreshold;
1806
1807 // Otherwise we need BFI to be available and to have a locally hot callsite
1808 // threshold.
1809 if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
1810 return std::nullopt;
1811
1812 // Determine if the callsite is hot relative to caller's entry. We could
1813 // potentially cache the computation of scaled entry frequency, but the added
1814 // complexity is not worth it unless this scaling shows up high in the
1815 // profiles.
1816 auto CallSiteBB = Call.getParent();
1817 auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB).getFrequency();
1818 auto CallerEntryFreq = CallerBFI->getEntryFreq();
1819 if (CallSiteFreq >= CallerEntryFreq * HotCallSiteRelFreq)
1820 return Params.LocallyHotCallSiteThreshold;
1821
1822 // Otherwise treat it normally.
1823 return std::nullopt;
1824}
1825
1826void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
1827 // If no size growth is allowed for this inlining, set Threshold to 0.
1828 if (!allowSizeGrowth(Call)) {
1829 Threshold = 0;
1830 return;
1831 }
1832
1833 Function *Caller = Call.getCaller();
1834
1835 // return min(A, B) if B is valid.
1836 auto MinIfValid = [](int A, std::optional<int> B) {
1837 return B ? std::min(A, *B) : A;
1838 };
1839
1840 // return max(A, B) if B is valid.
1841 auto MaxIfValid = [](int A, std::optional<int> B) {
1842 return B ? std::max(A, *B) : A;
1843 };
1844
1845 // Various bonus percentages. These are multiplied by Threshold to get the
1846 // bonus values.
1847 // SingleBBBonus: This bonus is applied if the callee has a single reachable
1848 // basic block at the given callsite context. This is speculatively applied
1849 // and withdrawn if more than one basic block is seen.
1850 //
1851 // LstCallToStaticBonus: This large bonus is applied to ensure the inlining
1852 // of the last call to a static function as inlining such functions is
1853 // guaranteed to reduce code size.
1854 //
1855 // These bonus percentages may be set to 0 based on properties of the caller
1856 // and the callsite.
1857 int SingleBBBonusPercent = 50;
1858 int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
1860
1861 // Lambda to set all the above bonus and bonus percentages to 0.
1862 auto DisallowAllBonuses = [&]() {
1863 SingleBBBonusPercent = 0;
1864 VectorBonusPercent = 0;
1866 };
1867
1868 // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
1869 // and reduce the threshold if the caller has the necessary attribute.
1870 if (Caller->hasMinSize()) {
1871 Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
1872 // For minsize, we want to disable the single BB bonus and the vector
1873 // bonuses, but not the last-call-to-static bonus. Inlining the last call to
1874 // a static function will, at the minimum, eliminate the parameter setup and
1875 // call/return instructions.
1876 SingleBBBonusPercent = 0;
1877 VectorBonusPercent = 0;
1878 } else if (Caller->hasOptSize())
1879 Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);
1880
1881 // Adjust the threshold based on inlinehint attribute and profile based
1882 // hotness information if the caller does not have MinSize attribute.
1883 if (!Caller->hasMinSize()) {
1884 if (Callee.hasFnAttribute(Attribute::InlineHint))
1885 Threshold = MaxIfValid(Threshold, Params.HintThreshold);
1886
1887 // FIXME: After switching to the new passmanager, simplify the logic below
1888 // by checking only the callsite hotness/coldness as we will reliably
1889 // have local profile information.
1890 //
1891 // Callsite hotness and coldness can be determined if sample profile is
1892 // used (which adds hotness metadata to calls) or if caller's
1893 // BlockFrequencyInfo is available.
1894 BlockFrequencyInfo *CallerBFI = GetBFI ? &(GetBFI(*Caller)) : nullptr;
1895 auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI);
1896 if (!Caller->hasOptSize() && HotCallSiteThreshold) {
1897 LLVM_DEBUG(dbgs() << "Hot callsite.\n");
1898 // FIXME: This should update the threshold only if it exceeds the
1899 // current threshold, but AutoFDO + ThinLTO currently relies on this
1900 // behavior to prevent inlining of hot callsites during ThinLTO
1901 // compile phase.
1902 Threshold = *HotCallSiteThreshold;
1903 } else if (isColdCallSite(Call, CallerBFI)) {
1904 LLVM_DEBUG(dbgs() << "Cold callsite.\n");
1905 // Do not apply bonuses for a cold callsite including the
1906 // LastCallToStatic bonus. While this bonus might result in code size
1907 // reduction, it can cause the size of a non-cold caller to increase
1908 // preventing it from being inlined.
1909 DisallowAllBonuses();
1910 Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
1911 } else if (PSI) {
1912 // Use callee's global profile information only if we have no way of
1913 // determining this via callsite information.
1914 if (PSI->isFunctionEntryHot(&Callee)) {
1915 LLVM_DEBUG(dbgs() << "Hot callee.\n");
1916 // If callsite hotness can not be determined, we may still know
1917 // that the callee is hot and treat it as a weaker hint for threshold
1918 // increase.
1919 Threshold = MaxIfValid(Threshold, Params.HintThreshold);
1920 } else if (PSI->isFunctionEntryCold(&Callee)) {
1921 LLVM_DEBUG(dbgs() << "Cold callee.\n");
1922 // Do not apply bonuses for a cold callee including the
1923 // LastCallToStatic bonus. While this bonus might result in code size
1924 // reduction, it can cause the size of a non-cold caller to increase
1925 // preventing it from being inlined.
1926 DisallowAllBonuses();
1927 Threshold = MinIfValid(Threshold, Params.ColdThreshold);
1928 }
1929 }
1930 }
1931
1932 Threshold += TTI.adjustInliningThreshold(&Call);
1933
1934 // Finally, take the target-specific inlining threshold multiplier into
1935 // account.
1936 Threshold *= TTI.getInliningThresholdMultiplier();
1937
1938 SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
1939 VectorBonus = Threshold * VectorBonusPercent / 100;
1940
1941 // If there is only one call of the function, and it has internal linkage,
1942 // the cost of inlining it drops dramatically. It may seem odd to update
1943 // Cost in updateThreshold, but the bonus depends on the logic in this method.
1944 if (isSoleCallToLocalFunction(Call, F)) {
1946 StaticBonusApplied = LastCallToStaticBonus;
1947 }
1948}
1949
1950bool CallAnalyzer::visitCmpInst(CmpInst &I) {
1951 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
1952 // First try to handle simplified comparisons.
1954 return true;
1955
1956 if (I.getOpcode() == Instruction::FCmp)
1957 return false;
1958
1959 // Otherwise look for a comparison between constant offset pointers with
1960 // a common base.
1961 Value *LHSBase, *RHSBase;
1962 APInt LHSOffset, RHSOffset;
1963 std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
1964 if (LHSBase) {
1965 std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
1966 if (RHSBase && LHSBase == RHSBase) {
1967 // We have common bases, fold the icmp to a constant based on the
1968 // offsets.
1969 Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
1970 Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
1971 if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
1972 SimplifiedValues[&I] = C;
1973 ++NumConstantPtrCmps;
1974 return true;
1975 }
1976 }
1977 }
1978
1979 // If the comparison is an equality comparison with null, we can simplify it
1980 // if we know the value (argument) can't be null
1981 if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
1982 isKnownNonNullInCallee(I.getOperand(0))) {
1983 bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
1984 SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
1986 return true;
1987 }
1988 return handleSROA(I.getOperand(0), isa<ConstantPointerNull>(I.getOperand(1)));
1989}
1990
1991bool CallAnalyzer::visitSub(BinaryOperator &I) {
1992 // Try to handle a special case: we can fold computing the difference of two
1993 // constant-related pointers.
1994 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
1995 Value *LHSBase, *RHSBase;
1996 APInt LHSOffset, RHSOffset;
1997 std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
1998 if (LHSBase) {
1999 std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
2000 if (RHSBase && LHSBase == RHSBase) {
2001 // We have common bases, fold the subtract to a constant based on the
2002 // offsets.
2003 Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
2004 Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
2005 if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
2006 SimplifiedValues[&I] = C;
2007 ++NumConstantPtrDiffs;
2008 return true;
2009 }
2010 }
2011 }
2012
2013 // Otherwise, fall back to the generic logic for simplifying and handling
2014 // instructions.
2015 return Base::visitSub(I);
2016}
2017
2018bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
2019 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
2020 Constant *CLHS = dyn_cast<Constant>(LHS);
2021 if (!CLHS)
2022 CLHS = SimplifiedValues.lookup(LHS);
2023 Constant *CRHS = dyn_cast<Constant>(RHS);
2024 if (!CRHS)
2025 CRHS = SimplifiedValues.lookup(RHS);
2026
2027 Value *SimpleV = nullptr;
2028 if (auto FI = dyn_cast<FPMathOperator>(&I))
2029 SimpleV = simplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS,
2030 FI->getFastMathFlags(), DL);
2031 else
2032 SimpleV =
2033 simplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);
2034
2035 if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
2036 SimplifiedValues[&I] = C;
2037
2038 if (SimpleV)
2039 return true;
2040
2041 // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
2042 disableSROA(LHS);
2043 disableSROA(RHS);
2044
2045 // If the instruction is floating point, and the target says this operation
2046 // is expensive, this may eventually become a library call. Treat the cost
2047 // as such. Unless it's fneg which can be implemented with an xor.
2048 using namespace llvm::PatternMatch;
2049 if (I.getType()->isFloatingPointTy() &&
2051 !match(&I, m_FNeg(m_Value())))
2052 onCallPenalty();
2053
2054 return false;
2055}
2056
2057bool CallAnalyzer::visitFNeg(UnaryOperator &I) {
2058 Value *Op = I.getOperand(0);
2059 Constant *COp = dyn_cast<Constant>(Op);
2060 if (!COp)
2061 COp = SimplifiedValues.lookup(Op);
2062
2063 Value *SimpleV = simplifyFNegInst(
2064 COp ? COp : Op, cast<FPMathOperator>(I).getFastMathFlags(), DL);
2065
2066 if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
2067 SimplifiedValues[&I] = C;
2068
2069 if (SimpleV)
2070 return true;
2071
2072 // Disable any SROA on arguments to arbitrary, unsimplified fneg.
2073 disableSROA(Op);
2074
2075 return false;
2076}
2077
2078bool CallAnalyzer::visitLoad(LoadInst &I) {
2079 if (handleSROA(I.getPointerOperand(), I.isSimple()))
2080 return true;
2081
2082 // If the data is already loaded from this address and hasn't been clobbered
2083 // by any stores or calls, this load is likely to be redundant and can be
2084 // eliminated.
2085 if (EnableLoadElimination &&
2086 !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
2087 onLoadEliminationOpportunity();
2088 return true;
2089 }
2090
2091 onMemAccess();
2092 return false;
2093}
2094
2095bool CallAnalyzer::visitStore(StoreInst &I) {
2096 if (handleSROA(I.getPointerOperand(), I.isSimple()))
2097 return true;
2098
2099 // The store can potentially clobber loads and prevent repeated loads from
2100 // being eliminated.
2101 // FIXME:
2102 // 1. We can probably keep an initial set of eliminatable loads substracted
2103 // from the cost even when we finally see a store. We just need to disable
2104 // *further* accumulation of elimination savings.
2105 // 2. We should probably at some point thread MemorySSA for the callee into
2106 // this and then use that to actually compute *really* precise savings.
2107 disableLoadElimination();
2108
2109 onMemAccess();
2110 return false;
2111}
2112
2113bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
2114 // Constant folding for extract value is trivial.
2116 return true;
2117
2118 // SROA can't look through these, but they may be free.
2119 return Base::visitExtractValue(I);
2120}
2121
2122bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
2123 // Constant folding for insert value is trivial.
2125 return true;
2126
2127 // SROA can't look through these, but they may be free.
2128 return Base::visitInsertValue(I);
2129}
2130
2131/// Try to simplify a call site.
2132///
2133/// Takes a concrete function and callsite and tries to actually simplify it by
2134/// analyzing the arguments and call itself with instsimplify. Returns true if
2135/// it has simplified the callsite to some other entity (a constant), making it
2136/// free.
2137bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) {
2138 // FIXME: Using the instsimplify logic directly for this is inefficient
2139 // because we have to continually rebuild the argument list even when no
2140 // simplifications can be performed. Until that is fixed with remapping
2141 // inside of instsimplify, directly constant fold calls here.
2142 if (!canConstantFoldCallTo(&Call, F))
2143 return false;
2144
2145 // Try to re-map the arguments to constants.
2146 SmallVector<Constant *, 4> ConstantArgs;
2147 ConstantArgs.reserve(Call.arg_size());
2148 for (Value *I : Call.args()) {
2149 Constant *C = dyn_cast<Constant>(I);
2150 if (!C)
2151 C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(I));
2152 if (!C)
2153 return false; // This argument doesn't map to a constant.
2154
2155 ConstantArgs.push_back(C);
2156 }
2157 if (Constant *C = ConstantFoldCall(&Call, F, ConstantArgs)) {
2158 SimplifiedValues[&Call] = C;
2159 return true;
2160 }
2161
2162 return false;
2163}
2164
2165bool CallAnalyzer::visitCallBase(CallBase &Call) {
2166 if (!onCallBaseVisitStart(Call))
2167 return true;
2168
2169 if (Call.hasFnAttr(Attribute::ReturnsTwice) &&
2170 !F.hasFnAttribute(Attribute::ReturnsTwice)) {
2171 // This aborts the entire analysis.
2172 ExposesReturnsTwice = true;
2173 return false;
2174 }
2175 if (isa<CallInst>(Call) && cast<CallInst>(Call).cannotDuplicate())
2176 ContainsNoDuplicateCall = true;
2177
2178 Function *F = Call.getCalledFunction();
2179 bool IsIndirectCall = !F;
2180 if (IsIndirectCall) {
2181 // Check if this happens to be an indirect function call to a known function
2182 // in this inline context. If not, we've done all we can.
2183 Value *Callee = Call.getCalledOperand();
2184 F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
2185 if (!F || F->getFunctionType() != Call.getFunctionType()) {
2186 onCallArgumentSetup(Call);
2187
2188 if (!Call.onlyReadsMemory())
2189 disableLoadElimination();
2190 return Base::visitCallBase(Call);
2191 }
2192 }
2193
2194 assert(F && "Expected a call to a known function");
2195
2196 // When we have a concrete function, first try to simplify it directly.
2197 if (simplifyCallSite(F, Call))
2198 return true;
2199
2200 // Next check if it is an intrinsic we know about.
2201 // FIXME: Lift this into part of the InstVisitor.
2202 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Call)) {
2203 switch (II->getIntrinsicID()) {
2204 default:
2205 if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
2206 disableLoadElimination();
2207 return Base::visitCallBase(Call);
2208
2209 case Intrinsic::load_relative:
2210 onLoadRelativeIntrinsic();
2211 return false;
2212
2213 case Intrinsic::memset:
2214 case Intrinsic::memcpy:
2215 case Intrinsic::memmove:
2216 disableLoadElimination();
2217 // SROA can usually chew through these intrinsics, but they aren't free.
2218 return false;
2219 case Intrinsic::icall_branch_funnel:
2220 case Intrinsic::localescape:
2221 HasUninlineableIntrinsic = true;
2222 return false;
2223 case Intrinsic::vastart:
2224 InitsVargArgs = true;
2225 return false;
2226 case Intrinsic::launder_invariant_group:
2227 case Intrinsic::strip_invariant_group:
2228 if (auto *SROAArg = getSROAArgForValueOrNull(II->getOperand(0)))
2229 SROAArgValues[II] = SROAArg;
2230 return true;
2231 case Intrinsic::is_constant:
2232 return simplifyIntrinsicCallIsConstant(Call);
2233 case Intrinsic::objectsize:
2234 return simplifyIntrinsicCallObjectSize(Call);
2235 }
2236 }
2237
2238 if (F == Call.getFunction()) {
2239 // This flag will fully abort the analysis, so don't bother with anything
2240 // else.
2241 IsRecursiveCall = true;
2242 if (!AllowRecursiveCall)
2243 return false;
2244 }
2245
2246 if (TTI.isLoweredToCall(F)) {
2247 onLoweredCall(F, Call, IsIndirectCall);
2248 }
2249
2250 if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory())))
2251 disableLoadElimination();
2252 return Base::visitCallBase(Call);
2253}
2254
2255bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
2256 // At least one return instruction will be free after inlining.
2257 bool Free = !HasReturn;
2258 HasReturn = true;
2259 return Free;
2260}
2261
2262bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
2263 // We model unconditional branches as essentially free -- they really
2264 // shouldn't exist at all, but handling them makes the behavior of the
2265 // inliner more regular and predictable. Interestingly, conditional branches
2266 // which will fold away are also free.
2267 return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
2268 isa_and_nonnull<ConstantInt>(
2269 SimplifiedValues.lookup(BI.getCondition()));
2270}
2271
2272bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
2273 bool CheckSROA = SI.getType()->isPointerTy();
2274 Value *TrueVal = SI.getTrueValue();
2275 Value *FalseVal = SI.getFalseValue();
2276
2277 Constant *TrueC = dyn_cast<Constant>(TrueVal);
2278 if (!TrueC)
2279 TrueC = SimplifiedValues.lookup(TrueVal);
2280 Constant *FalseC = dyn_cast<Constant>(FalseVal);
2281 if (!FalseC)
2282 FalseC = SimplifiedValues.lookup(FalseVal);
2283 Constant *CondC =
2284 dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));
2285
2286 if (!CondC) {
2287 // Select C, X, X => X
2288 if (TrueC == FalseC && TrueC) {
2289 SimplifiedValues[&SI] = TrueC;
2290 return true;
2291 }
2292
2293 if (!CheckSROA)
2294 return Base::visitSelectInst(SI);
2295
2296 std::pair<Value *, APInt> TrueBaseAndOffset =
2297 ConstantOffsetPtrs.lookup(TrueVal);
2298 std::pair<Value *, APInt> FalseBaseAndOffset =
2299 ConstantOffsetPtrs.lookup(FalseVal);
2300 if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
2301 ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;
2302
2303 if (auto *SROAArg = getSROAArgForValueOrNull(TrueVal))
2304 SROAArgValues[&SI] = SROAArg;
2305 return true;
2306 }
2307
2308 return Base::visitSelectInst(SI);
2309 }
2310
2311 // Select condition is a constant.
2312 Value *SelectedV = CondC->isAllOnesValue() ? TrueVal
2313 : (CondC->isNullValue()) ? FalseVal
2314 : nullptr;
2315 if (!SelectedV) {
2316 // Condition is a vector constant that is not all 1s or all 0s. If all
2317 // operands are constants, ConstantFoldSelectInstruction() can handle the
2318 // cases such as select vectors.
2319 if (TrueC && FalseC) {
2320 if (auto *C = ConstantFoldSelectInstruction(CondC, TrueC, FalseC)) {
2321 SimplifiedValues[&SI] = C;
2322 return true;
2323 }
2324 }
2325 return Base::visitSelectInst(SI);
2326 }
2327
2328 // Condition is either all 1s or all 0s. SI can be simplified.
2329 if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
2330 SimplifiedValues[&SI] = SelectedC;
2331 return true;
2332 }
2333
2334 if (!CheckSROA)
2335 return true;
2336
2337 std::pair<Value *, APInt> BaseAndOffset =
2338 ConstantOffsetPtrs.lookup(SelectedV);
2339 if (BaseAndOffset.first) {
2340 ConstantOffsetPtrs[&SI] = BaseAndOffset;
2341
2342 if (auto *SROAArg = getSROAArgForValueOrNull(SelectedV))
2343 SROAArgValues[&SI] = SROAArg;
2344 }
2345
2346 return true;
2347}
2348
2349bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
2350 // We model unconditional switches as free, see the comments on handling
2351 // branches.
2352 if (isa<ConstantInt>(SI.getCondition()))
2353 return true;
2354 if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
2355 if (isa<ConstantInt>(V))
2356 return true;
2357
2358 // Assume the most general case where the switch is lowered into
2359 // either a jump table, bit test, or a balanced binary tree consisting of
2360 // case clusters without merging adjacent clusters with the same
2361 // destination. We do not consider the switches that are lowered with a mix
2362 // of jump table/bit test/binary search tree. The cost of the switch is
2363 // proportional to the size of the tree or the size of jump table range.
2364 //
2365 // NB: We convert large switches which are just used to initialize large phi
2366 // nodes to lookup tables instead in simplifycfg, so this shouldn't prevent
2367 // inlining those. It will prevent inlining in cases where the optimization
2368 // does not (yet) fire.
2369
2370 unsigned JumpTableSize = 0;
2371 BlockFrequencyInfo *BFI = GetBFI ? &(GetBFI(F)) : nullptr;
2372 unsigned NumCaseCluster =
2373 TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize, PSI, BFI);
2374
2375 onFinalizeSwitch(JumpTableSize, NumCaseCluster);
2376 return false;
2377}
2378
2379bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
2380 // We never want to inline functions that contain an indirectbr. This is
2381 // incorrect because all the blockaddress's (in static global initializers
2382 // for example) would be referring to the original function, and this
2383 // indirect jump would jump from the inlined copy of the function into the
2384 // original function which is extremely undefined behavior.
2385 // FIXME: This logic isn't really right; we can safely inline functions with
2386 // indirectbr's as long as no other function or global references the
2387 // blockaddress of a block within the current function.
2388 HasIndirectBr = true;
2389 return false;
2390}
2391
2392bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
2393 // FIXME: It's not clear that a single instruction is an accurate model for
2394 // the inline cost of a resume instruction.
2395 return false;
2396}
2397
2398bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
2399 // FIXME: It's not clear that a single instruction is an accurate model for
2400 // the inline cost of a cleanupret instruction.
2401 return false;
2402}
2403
2404bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
2405 // FIXME: It's not clear that a single instruction is an accurate model for
2406 // the inline cost of a catchret instruction.
2407 return false;
2408}
2409
2410bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
2411 // FIXME: It might be reasonably to discount the cost of instructions leading
2412 // to unreachable as they have the lowest possible impact on both runtime and
2413 // code size.
2414 return true; // No actual code is needed for unreachable.
2415}
2416
2417bool CallAnalyzer::visitInstruction(Instruction &I) {
2418 // Some instructions are free. All of the free intrinsics can also be
2419 // handled by SROA, etc.
2422 return true;
2423
2424 // We found something we don't understand or can't handle. Mark any SROA-able
2425 // values in the operand list as no longer viable.
2426 for (const Use &Op : I.operands())
2427 disableSROA(Op);
2428
2429 return false;
2430}
2431
2432/// Analyze a basic block for its contribution to the inline cost.
2433///
2434/// This method walks the analyzer over every instruction in the given basic
2435/// block and accounts for their cost during inlining at this callsite. It
2436/// aborts early if the threshold has been exceeded or an impossible to inline
2437/// construct has been detected. It returns false if inlining is no longer
2438/// viable, and true if inlining remains viable.
2440CallAnalyzer::analyzeBlock(BasicBlock *BB,
2441 SmallPtrSetImpl<const Value *> &EphValues) {
2442 for (Instruction &I : *BB) {
2443 // FIXME: Currently, the number of instructions in a function regardless of
2444 // our ability to simplify them during inline to constants or dead code,
2445 // are actually used by the vector bonus heuristic. As long as that's true,
2446 // we have to special case debug intrinsics here to prevent differences in
2447 // inlining due to debug symbols. Eventually, the number of unsimplified
2448 // instructions shouldn't factor into the cost computation, but until then,
2449 // hack around it here.
2450 // Similarly, skip pseudo-probes.
2451 if (I.isDebugOrPseudoInst())
2452 continue;
2453
2454 // Skip ephemeral values.
2455 if (EphValues.count(&I))
2456 continue;
2457
2458 ++NumInstructions;
2459 if (isa<ExtractElementInst>(I) || I.getType()->isVectorTy())
2460 ++NumVectorInstructions;
2461
2462 // If the instruction simplified to a constant, there is no cost to this
2463 // instruction. Visit the instructions using our InstVisitor to account for
2464 // all of the per-instruction logic. The visit tree returns true if we
2465 // consumed the instruction in any way, and false if the instruction's base
2466 // cost should count against inlining.
2467 onInstructionAnalysisStart(&I);
2468
2469 if (Base::visit(&I))
2470 ++NumInstructionsSimplified;
2471 else
2472 onMissedSimplification();
2473
2474 onInstructionAnalysisFinish(&I);
2475 using namespace ore;
2476 // If the visit this instruction detected an uninlinable pattern, abort.
2478 if (IsRecursiveCall && !AllowRecursiveCall)
2479 IR = InlineResult::failure("recursive");
2480 else if (ExposesReturnsTwice)
2481 IR = InlineResult::failure("exposes returns twice");
2482 else if (HasDynamicAlloca)
2483 IR = InlineResult::failure("dynamic alloca");
2484 else if (HasIndirectBr)
2485 IR = InlineResult::failure("indirect branch");
2486 else if (HasUninlineableIntrinsic)
2487 IR = InlineResult::failure("uninlinable intrinsic");
2488 else if (InitsVargArgs)
2489 IR = InlineResult::failure("varargs");
2490 if (!IR.isSuccess()) {
2491 if (ORE)
2492 ORE->emit([&]() {
2493 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
2494 &CandidateCall)
2495 << NV("Callee", &F) << " has uninlinable pattern ("
2496 << NV("InlineResult", IR.getFailureReason())
2497 << ") and cost is not fully computed";
2498 });
2499 return IR;
2500 }
2501
2502 // If the caller is a recursive function then we don't want to inline
2503 // functions which allocate a lot of stack space because it would increase
2504 // the caller stack usage dramatically.
2505 if (IsCallerRecursive && AllocatedSize > RecurStackSizeThreshold) {
2506 auto IR =
2507 InlineResult::failure("recursive and allocates too much stack space");
2508 if (ORE)
2509 ORE->emit([&]() {
2510 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
2511 &CandidateCall)
2512 << NV("Callee", &F) << " is "
2513 << NV("InlineResult", IR.getFailureReason())
2514 << ". Cost is not fully computed";
2515 });
2516 return IR;
2517 }
2518
2519 if (shouldStop())
2520 return InlineResult::failure(
2521 "Call site analysis is not favorable to inlining.");
2522 }
2523
2524 return InlineResult::success();
2525}
2526
2527/// Compute the base pointer and cumulative constant offsets for V.
2528///
2529/// This strips all constant offsets off of V, leaving it the base pointer, and
2530/// accumulates the total constant offset applied in the returned constant. It
2531/// returns 0 if V is not a pointer, and returns the constant '0' if there are
2532/// no constant offsets applied.
2533ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
2534 if (!V->getType()->isPointerTy())
2535 return nullptr;
2536
2537 unsigned AS = V->getType()->getPointerAddressSpace();
2538 unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
2539 APInt Offset = APInt::getZero(IntPtrWidth);
2540
2541 // Even though we don't look through PHI nodes, we could be called on an
2542 // instruction in an unreachable block, which may be on a cycle.
2544 Visited.insert(V);
2545 do {
2546 if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
2547 if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
2548 return nullptr;
2549 V = GEP->getPointerOperand();
2550 } else if (Operator::getOpcode(V) == Instruction::BitCast) {
2551 V = cast<Operator>(V)->getOperand(0);
2552 } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
2553 if (GA->isInterposable())
2554 break;
2555 V = GA->getAliasee();
2556 } else {
2557 break;
2558 }
2559 assert(V->getType()->isPointerTy() && "Unexpected operand type!");
2560 } while (Visited.insert(V).second);
2561
2562 Type *IdxPtrTy = DL.getIndexType(V->getType());
2563 return cast<ConstantInt>(ConstantInt::get(IdxPtrTy, Offset));
2564}
2565
2566/// Find dead blocks due to deleted CFG edges during inlining.
2567///
2568/// If we know the successor of the current block, \p CurrBB, has to be \p
2569/// NextBB, the other successors of \p CurrBB are dead if these successors have
2570/// no live incoming CFG edges. If one block is found to be dead, we can
2571/// continue growing the dead block list by checking the successors of the dead
2572/// blocks to see if all their incoming edges are dead or not.
2573void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
2574 auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
2575 // A CFG edge is dead if the predecessor is dead or the predecessor has a
2576 // known successor which is not the one under exam.
2577 return (DeadBlocks.count(Pred) ||
2578 (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ));
2579 };
2580
2581 auto IsNewlyDead = [&](BasicBlock *BB) {
2582 // If all the edges to a block are dead, the block is also dead.
2583 return (!DeadBlocks.count(BB) &&
2585 [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
2586 };
2587
2588 for (BasicBlock *Succ : successors(CurrBB)) {
2589 if (Succ == NextBB || !IsNewlyDead(Succ))
2590 continue;
2592 NewDead.push_back(Succ);
2593 while (!NewDead.empty()) {
2594 BasicBlock *Dead = NewDead.pop_back_val();
2595 if (DeadBlocks.insert(Dead).second)
2596 // Continue growing the dead block lists.
2597 for (BasicBlock *S : successors(Dead))
2598 if (IsNewlyDead(S))
2599 NewDead.push_back(S);
2600 }
2601 }
2602}
2603
2604/// Analyze a call site for potential inlining.
2605///
2606/// Returns true if inlining this call is viable, and false if it is not
2607/// viable. It computes the cost and adjusts the threshold based on numerous
2608/// factors and heuristics. If this method returns false but the computed cost
2609/// is below the computed threshold, then inlining was forcibly disabled by
2610/// some artifact of the routine.
2611InlineResult CallAnalyzer::analyze() {
2612 ++NumCallsAnalyzed;
2613
2614 auto Result = onAnalysisStart();
2615 if (!Result.isSuccess())
2616 return Result;
2617
2618 if (F.empty())
2619 return InlineResult::success();
2620
2621 Function *Caller = CandidateCall.getFunction();
2622 // Check if the caller function is recursive itself.
2623 for (User *U : Caller->users()) {
2624 CallBase *Call = dyn_cast<CallBase>(U);
2625 if (Call && Call->getFunction() == Caller) {
2626 IsCallerRecursive = true;
2627 break;
2628 }
2629 }
2630
2631 // Populate our simplified values by mapping from function arguments to call
2632 // arguments with known important simplifications.
2633 auto CAI = CandidateCall.arg_begin();
2634 for (Argument &FAI : F.args()) {
2635 assert(CAI != CandidateCall.arg_end());
2636 if (Constant *C = dyn_cast<Constant>(CAI))
2637 SimplifiedValues[&FAI] = C;
2638
2639 Value *PtrArg = *CAI;
2640 if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
2641 ConstantOffsetPtrs[&FAI] = std::make_pair(PtrArg, C->getValue());
2642
2643 // We can SROA any pointer arguments derived from alloca instructions.
2644 if (auto *SROAArg = dyn_cast<AllocaInst>(PtrArg)) {
2645 SROAArgValues[&FAI] = SROAArg;
2646 onInitializeSROAArg(SROAArg);
2647 EnabledSROAAllocas.insert(SROAArg);
2648 }
2649 }
2650 ++CAI;
2651 }
2652 NumConstantArgs = SimplifiedValues.size();
2653 NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
2654 NumAllocaArgs = SROAArgValues.size();
2655
2656 // FIXME: If a caller has multiple calls to a callee, we end up recomputing
2657 // the ephemeral values multiple times (and they're completely determined by
2658 // the callee, so this is purely duplicate work).
2660 CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues);
2661
2662 // The worklist of live basic blocks in the callee *after* inlining. We avoid
2663 // adding basic blocks of the callee which can be proven to be dead for this
2664 // particular call site in order to get more accurate cost estimates. This
2665 // requires a somewhat heavyweight iteration pattern: we need to walk the
2666 // basic blocks in a breadth-first order as we insert live successors. To
2667 // accomplish this, prioritizing for small iterations because we exit after
2668 // crossing our threshold, we use a small-size optimized SetVector.
2672 BBSetVector BBWorklist;
2673 BBWorklist.insert(&F.getEntryBlock());
2674
2675 // Note that we *must not* cache the size, this loop grows the worklist.
2676 for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
2677 if (shouldStop())
2678 break;
2679
2680 BasicBlock *BB = BBWorklist[Idx];
2681 if (BB->empty())
2682 continue;
2683
2684 onBlockStart(BB);
2685
2686 // Disallow inlining a blockaddress with uses other than strictly callbr.
2687 // A blockaddress only has defined behavior for an indirect branch in the
2688 // same function, and we do not currently support inlining indirect
2689 // branches. But, the inliner may not see an indirect branch that ends up
2690 // being dead code at a particular call site. If the blockaddress escapes
2691 // the function, e.g., via a global variable, inlining may lead to an
2692 // invalid cross-function reference.
2693 // FIXME: pr/39560: continue relaxing this overt restriction.
2694 if (BB->hasAddressTaken())
2695 for (User *U : BlockAddress::get(&*BB)->users())
2696 if (!isa<CallBrInst>(*U))
2697 return InlineResult::failure("blockaddress used outside of callbr");
2698
2699 // Analyze the cost of this block. If we blow through the threshold, this
2700 // returns false, and we can bail on out.
2701 InlineResult IR = analyzeBlock(BB, EphValues);
2702 if (!IR.isSuccess())
2703 return IR;
2704
2705 Instruction *TI = BB->getTerminator();
2706
2707 // Add in the live successors by first checking whether we have terminator
2708 // that may be simplified based on the values simplified by this call.
2709 if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
2710 if (BI->isConditional()) {
2711 Value *Cond = BI->getCondition();
2712 if (ConstantInt *SimpleCond =
2713 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
2714 BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
2715 BBWorklist.insert(NextBB);
2716 KnownSuccessors[BB] = NextBB;
2717 findDeadBlocks(BB, NextBB);
2718 continue;
2719 }
2720 }
2721 } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
2722 Value *Cond = SI->getCondition();
2723 if (ConstantInt *SimpleCond =
2724 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
2725 BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
2726 BBWorklist.insert(NextBB);
2727 KnownSuccessors[BB] = NextBB;
2728 findDeadBlocks(BB, NextBB);
2729 continue;
2730 }
2731 }
2732
2733 // If we're unable to select a particular successor, just count all of
2734 // them.
2735 for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
2736 ++TIdx)
2737 BBWorklist.insert(TI->getSuccessor(TIdx));
2738
2739 onBlockAnalyzed(BB);
2740 }
2741
2742 // If this is a noduplicate call, we can still inline as long as
2743 // inlining this would cause the removal of the caller (so the instruction
2744 // is not actually duplicated, just moved).
2745 if (!isSoleCallToLocalFunction(CandidateCall, F) && ContainsNoDuplicateCall)
2746 return InlineResult::failure("noduplicate");
2747
2748 // If the callee's stack size exceeds the user-specified threshold,
2749 // do not let it be inlined.
2750 // The command line option overrides a limit set in the function attributes.
2751 size_t FinalStackSizeThreshold = StackSizeThreshold;
2752 if (!StackSizeThreshold.getNumOccurrences())
2753 if (std::optional<int> AttrMaxStackSize = getStringFnAttrAsInt(
2755 FinalStackSizeThreshold = *AttrMaxStackSize;
2756 if (AllocatedSize > FinalStackSizeThreshold)
2757 return InlineResult::failure("stacksize");
2758
2759 return finalizeAnalysis();
2760}
2761
2762void InlineCostCallAnalyzer::print(raw_ostream &OS) {
2763#define DEBUG_PRINT_STAT(x) OS << " " #x ": " << x << "\n"
2765 F.print(OS, &Writer);
2766 DEBUG_PRINT_STAT(NumConstantArgs);
2767 DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
2768 DEBUG_PRINT_STAT(NumAllocaArgs);
2769 DEBUG_PRINT_STAT(NumConstantPtrCmps);
2770 DEBUG_PRINT_STAT(NumConstantPtrDiffs);
2771 DEBUG_PRINT_STAT(NumInstructionsSimplified);
2772 DEBUG_PRINT_STAT(NumInstructions);
2773 DEBUG_PRINT_STAT(SROACostSavings);
2774 DEBUG_PRINT_STAT(SROACostSavingsLost);
2775 DEBUG_PRINT_STAT(LoadEliminationCost);
2776 DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
2777 DEBUG_PRINT_STAT(Cost);
2778 DEBUG_PRINT_STAT(Threshold);
2779#undef DEBUG_PRINT_STAT
2780}
2781
2782#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2783/// Dump stats about this call's analysis.
2784LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() { print(dbgs()); }
2785#endif
2786
2787/// Test that there are no attribute conflicts between Caller and Callee
2788/// that prevent inlining.
2791 function_ref<const TargetLibraryInfo &(Function &)> &GetTLI) {
2792 // Note that CalleeTLI must be a copy not a reference. The legacy pass manager
2793 // caches the most recently created TLI in the TargetLibraryInfoWrapperPass
2794 // object, and always returns the same object (which is overwritten on each
2795 // GetTLI call). Therefore we copy the first result.
2796 auto CalleeTLI = GetTLI(*Callee);
2797 return (IgnoreTTIInlineCompatible ||
2798 TTI.areInlineCompatible(Caller, Callee)) &&
2799 GetTLI(*Caller).areInlineCompatible(CalleeTLI,
2802}
2803
2805 int64_t Cost = 0;
2806 for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) {
2807 if (Call.isByValArgument(I)) {
2808 // We approximate the number of loads and stores needed by dividing the
2809 // size of the byval type by the target's pointer size.
2810 PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
2811 unsigned TypeSize = DL.getTypeSizeInBits(Call.getParamByValType(I));
2812 unsigned AS = PTy->getAddressSpace();
2813 unsigned PointerSize = DL.getPointerSizeInBits(AS);
2814 // Ceiling division.
2815 unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
2816
2817 // If it generates more than 8 stores it is likely to be expanded as an
2818 // inline memcpy so we take that as an upper bound. Otherwise we assume
2819 // one load and one store per word copied.
2820 // FIXME: The maxStoresPerMemcpy setting from the target should be used
2821 // here instead of a magic number of 8, but it's not available via
2822 // DataLayout.
2823 NumStores = std::min(NumStores, 8U);
2824
2825 Cost += 2 * NumStores * InstrCost;
2826 } else {
2827 // For non-byval arguments subtract off one instruction per call
2828 // argument.
2829 Cost += InstrCost;
2830 }
2831 }
2832 // The call instruction also disappears after inlining.
2833 Cost += InstrCost;
2834 Cost += CallPenalty;
2835 return std::min<int64_t>(Cost, INT_MAX);
2836}
2837
2839 CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
2840 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
2841 function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
2844 return getInlineCost(Call, Call.getCalledFunction(), Params, CalleeTTI,
2845 GetAssumptionCache, GetTLI, GetBFI, PSI, ORE);
2846}
2847
2849 CallBase &Call, TargetTransformInfo &CalleeTTI,
2850 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
2853 const InlineParams Params = {/* DefaultThreshold*/ 0,
2854 /*HintThreshold*/ {},
2855 /*ColdThreshold*/ {},
2856 /*OptSizeThreshold*/ {},
2857 /*OptMinSizeThreshold*/ {},
2858 /*HotCallSiteThreshold*/ {},
2859 /*LocallyHotCallSiteThreshold*/ {},
2860 /*ColdCallSiteThreshold*/ {},
2861 /*ComputeFullInlineCost*/ true,
2862 /*EnableDeferral*/ true};
2863
2864 InlineCostCallAnalyzer CA(*Call.getCalledFunction(), Call, Params, CalleeTTI,
2865 GetAssumptionCache, GetBFI, PSI, ORE, true,
2866 /*IgnoreThreshold*/ true);
2867 auto R = CA.analyze();
2868 if (!R.isSuccess())
2869 return std::nullopt;
2870 return CA.getCost();
2871}
2872
2873std::optional<InlineCostFeatures> llvm::getInliningCostFeatures(
2874 CallBase &Call, TargetTransformInfo &CalleeTTI,
2875 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
2878 InlineCostFeaturesAnalyzer CFA(CalleeTTI, GetAssumptionCache, GetBFI, PSI,
2879 ORE, *Call.getCalledFunction(), Call);
2880 auto R = CFA.analyze();
2881 if (!R.isSuccess())
2882 return std::nullopt;
2883 return CFA.features();
2884}
2885
2887 CallBase &Call, Function *Callee, TargetTransformInfo &CalleeTTI,
2888 function_ref<const TargetLibraryInfo &(Function &)> GetTLI) {
2889
2890 // Cannot inline indirect calls.
2891 if (!Callee)
2892 return InlineResult::failure("indirect call");
2893
2894 // When callee coroutine function is inlined into caller coroutine function
2895 // before coro-split pass,
2896 // coro-early pass can not handle this quiet well.
2897 // So we won't inline the coroutine function if it have not been unsplited
2898 if (Callee->isPresplitCoroutine())
2899 return InlineResult::failure("unsplited coroutine call");
2900
2901 // Never inline calls with byval arguments that does not have the alloca
2902 // address space. Since byval arguments can be replaced with a copy to an
2903 // alloca, the inlined code would need to be adjusted to handle that the
2904 // argument is in the alloca address space (so it is a little bit complicated
2905 // to solve).
2906 unsigned AllocaAS = Callee->getParent()->getDataLayout().getAllocaAddrSpace();
2907 for (unsigned I = 0, E = Call.arg_size(); I != E; ++I)
2908 if (Call.isByValArgument(I)) {
2909 PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
2910 if (PTy->getAddressSpace() != AllocaAS)
2911 return InlineResult::failure("byval arguments without alloca"
2912 " address space");
2913 }
2914
2915 // Calls to functions with always-inline attributes should be inlined
2916 // whenever possible.
2917 if (Call.hasFnAttr(Attribute::AlwaysInline)) {
2918 if (Call.getAttributes().hasFnAttr(Attribute::NoInline))
2919 return InlineResult::failure("noinline call site attribute");
2920
2921 auto IsViable = isInlineViable(*Callee);
2922 if (IsViable.isSuccess())
2923 return InlineResult::success();
2924 return InlineResult::failure(IsViable.getFailureReason());
2925 }
2926
2927 // Never inline functions with conflicting attributes (unless callee has
2928 // always-inline attribute).
2929 Function *Caller = Call.getCaller();
2930 if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI, GetTLI))
2931 return InlineResult::failure("conflicting attributes");
2932
2933 // Don't inline this call if the caller has the optnone attribute.
2934 if (Caller->hasOptNone())
2935 return InlineResult::failure("optnone attribute");
2936
2937 // Don't inline a function that treats null pointer as valid into a caller
2938 // that does not have this attribute.
2939 if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined())
2940 return InlineResult::failure("nullptr definitions incompatible");
2941
2942 // Don't inline functions which can be interposed at link-time.
2943 if (Callee->isInterposable())
2944 return InlineResult::failure("interposable");
2945
2946 // Don't inline functions marked noinline.
2947 if (Callee->hasFnAttribute(Attribute::NoInline))
2948 return InlineResult::failure("noinline function attribute");
2949
2950 // Don't inline call sites marked noinline.
2951 if (Call.isNoInline())
2952 return InlineResult::failure("noinline call site attribute");
2953
2954 return std::nullopt;
2955}
2956
2958 CallBase &Call, Function *Callee, const InlineParams &Params,
2959 TargetTransformInfo &CalleeTTI,
2960 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
2961 function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
2964
2965 auto UserDecision =
2966 llvm::getAttributeBasedInliningDecision(Call, Callee, CalleeTTI, GetTLI);
2967
2968 if (UserDecision) {
2969 if (UserDecision->isSuccess())
2970 return llvm::InlineCost::getAlways("always inline attribute");
2971 return llvm::InlineCost::getNever(UserDecision->getFailureReason());
2972 }
2973
2974 LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
2975 << "... (caller:" << Call.getCaller()->getName()
2976 << ")\n");
2977
2978 InlineCostCallAnalyzer CA(*Callee, Call, Params, CalleeTTI,
2979 GetAssumptionCache, GetBFI, PSI, ORE);
2980 InlineResult ShouldInline = CA.analyze();
2981
2982 LLVM_DEBUG(CA.dump());
2983
2984 // Always make cost benefit based decision explicit.
2985 // We use always/never here since threshold is not meaningful,
2986 // as it's not what drives cost-benefit analysis.
2987 if (CA.wasDecidedByCostBenefit()) {
2988 if (ShouldInline.isSuccess())
2989 return InlineCost::getAlways("benefit over cost",
2990 CA.getCostBenefitPair());
2991 else
2992 return InlineCost::getNever("cost over benefit", CA.getCostBenefitPair());
2993 }
2994
2995 if (CA.wasDecidedByCostThreshold())
2996 return InlineCost::get(CA.getCost(), CA.getThreshold(),
2997 CA.getStaticBonusApplied());
2998
2999 // No details on how the decision was made, simply return always or never.
3000 return ShouldInline.isSuccess()
3001 ? InlineCost::getAlways("empty function")
3002 : InlineCost::getNever(ShouldInline.getFailureReason());
3003}
3004
3006 bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
3007 for (BasicBlock &BB : F) {
3008 // Disallow inlining of functions which contain indirect branches.
3009 if (isa<IndirectBrInst>(BB.getTerminator()))
3010 return InlineResult::failure("contains indirect branches");
3011
3012 // Disallow inlining of blockaddresses which are used by non-callbr
3013 // instructions.
3014 if (BB.hasAddressTaken())
3015 for (User *U : BlockAddress::get(&BB)->users())
3016 if (!isa<CallBrInst>(*U))
3017 return InlineResult::failure("blockaddress used outside of callbr");
3018
3019 for (auto &II : BB) {
3020 CallBase *Call = dyn_cast<CallBase>(&II);
3021 if (!Call)
3022 continue;
3023
3024 // Disallow recursive calls.
3025 Function *Callee = Call->getCalledFunction();
3026 if (&F == Callee)
3027 return InlineResult::failure("recursive call");
3028
3029 // Disallow calls which expose returns-twice to a function not previously
3030 // attributed as such.
3031 if (!ReturnsTwice && isa<CallInst>(Call) &&
3032 cast<CallInst>(Call)->canReturnTwice())
3033 return InlineResult::failure("exposes returns-twice attribute");
3034
3035 if (Callee)
3036 switch (Callee->getIntrinsicID()) {
3037 default:
3038 break;
3039 case llvm::Intrinsic::icall_branch_funnel:
3040 // Disallow inlining of @llvm.icall.branch.funnel because current
3041 // backend can't separate call targets from call arguments.
3042 return InlineResult::failure(
3043 "disallowed inlining of @llvm.icall.branch.funnel");
3044 case llvm::Intrinsic::localescape:
3045 // Disallow inlining functions that call @llvm.localescape. Doing this
3046 // correctly would require major changes to the inliner.
3047 return InlineResult::failure(
3048 "disallowed inlining of @llvm.localescape");
3049 case llvm::Intrinsic::vastart:
3050 // Disallow inlining of functions that initialize VarArgs with
3051 // va_start.
3052 return InlineResult::failure(
3053 "contains VarArgs initialized with va_start");
3054 }
3055 }
3056 }
3057
3058 return InlineResult::success();
3059}
3060
3061// APIs to create InlineParams based on command line flags and/or other
3062// parameters.
3063
3065 InlineParams Params;
3066
3067 // This field is the threshold to use for a callee by default. This is
3068 // derived from one or more of:
3069 // * optimization or size-optimization levels,
3070 // * a value passed to createFunctionInliningPass function, or
3071 // * the -inline-threshold flag.
3072 // If the -inline-threshold flag is explicitly specified, that is used
3073 // irrespective of anything else.
3074 if (InlineThreshold.getNumOccurrences() > 0)
3076 else
3077 Params.DefaultThreshold = Threshold;
3078
3079 // Set the HintThreshold knob from the -inlinehint-threshold.
3081
3082 // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
3084
3085 // If the -locally-hot-callsite-threshold is explicitly specified, use it to
3086 // populate LocallyHotCallSiteThreshold. Later, we populate
3087 // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
3088 // we know that optimization level is O3 (in the getInlineParams variant that
3089 // takes the opt and size levels).
3090 // FIXME: Remove this check (and make the assignment unconditional) after
3091 // addressing size regression issues at O2.
3092 if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
3094
3095 // Set the ColdCallSiteThreshold knob from the
3096 // -inline-cold-callsite-threshold.
3098
3099 // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
3100 // -inlinehint-threshold commandline option is not explicitly given. If that
3101 // option is present, then its value applies even for callees with size and
3102 // minsize attributes.
3103 // If the -inline-threshold is not specified, set the ColdThreshold from the
3104 // -inlinecold-threshold even if it is not explicitly passed. If
3105 // -inline-threshold is specified, then -inlinecold-threshold needs to be
3106 // explicitly specified to set the ColdThreshold knob
3107 if (InlineThreshold.getNumOccurrences() == 0) {
3111 } else if (ColdThreshold.getNumOccurrences() > 0) {
3113 }
3114 return Params;
3115}
3116
3119}
3120
3121// Compute the default threshold for inlining based on the opt level and the
3122// size opt level.
3123static int computeThresholdFromOptLevels(unsigned OptLevel,
3124 unsigned SizeOptLevel) {
3125 if (OptLevel > 2)
3127 if (SizeOptLevel == 1) // -Os
3129 if (SizeOptLevel == 2) // -Oz
3131 return DefaultThreshold;
3132}
3133
3134InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
3135 auto Params =
3136 getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
3137 // At O3, use the value of -locally-hot-callsite-threshold option to populate
3138 // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only
3139 // when it is specified explicitly.
3140 if (OptLevel > 2)
3142 return Params;
3143}
3144
3149 std::function<AssumptionCache &(Function &)> GetAssumptionCache =
3150 [&](Function &F) -> AssumptionCache & {
3152 };
3153 Module *M = F.getParent();
3154 ProfileSummaryInfo PSI(*M);
3155 DataLayout DL(M);
3157 // FIXME: Redesign the usage of InlineParams to expand the scope of this pass.
3158 // In the current implementation, the type of InlineParams doesn't matter as
3159 // the pass serves only for verification of inliner's decisions.
3160 // We can add a flag which determines InlineParams for this run. Right now,
3161 // the default InlineParams are used.
3162 const InlineParams Params = llvm::getInlineParams();
3163 for (BasicBlock &BB : F) {
3164 for (Instruction &I : BB) {
3165 if (CallInst *CI = dyn_cast<CallInst>(&I)) {
3166 Function *CalledFunction = CI->getCalledFunction();
3167 if (!CalledFunction || CalledFunction->isDeclaration())
3168 continue;
3169 OptimizationRemarkEmitter ORE(CalledFunction);
3170 InlineCostCallAnalyzer ICCA(*CalledFunction, *CI, Params, TTI,
3171 GetAssumptionCache, nullptr, &PSI, &ORE);
3172 ICCA.analyze();
3173 OS << " Analyzing call of " << CalledFunction->getName()
3174 << "... (caller:" << CI->getCaller()->getName() << ")\n";
3175 ICCA.print(OS);
3176 OS << "\n";
3177 }
3178 }
3179 }
3180 return PreservedAnalyses::all();
3181}
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
amdgpu Simplify well known AMD library false FunctionCallee Callee
amdgpu Simplify well known AMD library false FunctionCallee Value * Arg
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static const Function * getParent(const Value *V)
SmallVector< MachineOperand, 4 > Cond
SetVector< BasicBlock * > BBSetVector
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:492
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Size
static bool isColdCallSite(CallBase &CB, BlockFrequencyInfo &CallerBFI)
Return true if the block containing the call site has a BlockFrequency of less than ColdCCRelFreq% of...
Definition: GlobalOpt.cpp:1739
Hexagon Common GEP
iv users
Definition: IVUsers.cpp:48
static cl::opt< int > InlineSavingsMultiplier("inline-savings-multiplier", cl::Hidden, cl::init(8), cl::desc("Multiplier to multiply cycle savings by during inlining"))
static cl::opt< int > InlineThreshold("inline-threshold", cl::Hidden, cl::init(225), cl::desc("Control the amount of inlining to perform (default = 225)"))
static cl::opt< int > CallPenalty("inline-call-penalty", cl::Hidden, cl::init(25), cl::desc("Call penalty that is applied per callsite when inlining"))
static cl::opt< int > HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000), cl::desc("Threshold for hot callsites "))
static cl::opt< int > ColdThreshold("inlinecold-threshold", cl::Hidden, cl::init(45), cl::desc("Threshold for inlining functions with cold attribute"))
static cl::opt< size_t > RecurStackSizeThreshold("recursive-inline-max-stacksize", cl::Hidden, cl::init(InlineConstants::TotalAllocaSizeRecursiveCaller), cl::desc("Do not inline recursive functions with a stack " "size that exceeds the specified limit"))
static cl::opt< bool > PrintInstructionComments("print-instruction-comments", cl::Hidden, cl::init(false), cl::desc("Prints comments for instruction based on inline cost analysis"))
static cl::opt< int > LocallyHotCallSiteThreshold("locally-hot-callsite-threshold", cl::Hidden, cl::init(525), cl::desc("Threshold for locally hot callsites "))
static cl::opt< bool > InlineCallerSupersetNoBuiltin("inline-caller-superset-nobuiltin", cl::Hidden, cl::init(true), cl::desc("Allow inlining when caller has a superset of callee's nobuiltin " "attributes."))
static cl::opt< int > HintThreshold("inlinehint-threshold", cl::Hidden, cl::init(325), cl::desc("Threshold for inlining functions with inline hint"))
static cl::opt< size_t > StackSizeThreshold("inline-max-stacksize", cl::Hidden, cl::init(std::numeric_limits< size_t >::max()), cl::desc("Do not inline functions with a stack size " "that exceeds the specified limit"))
static int computeThresholdFromOptLevels(unsigned OptLevel, unsigned SizeOptLevel)
static cl::opt< int > MemAccessCost("inline-memaccess-cost", cl::Hidden, cl::init(0), cl::desc("Cost of load/store instruction when inlining"))
static cl::opt< int > ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden, cl::init(45), cl::desc("Threshold for inlining cold callsites"))
static cl::opt< bool > IgnoreTTIInlineCompatible("ignore-tti-inline-compatible", cl::Hidden, cl::init(false), cl::desc("Ignore TTI attributes compatibility check between callee/caller " "during inline cost calculation"))
static cl::opt< bool > OptComputeFullInlineCost("inline-cost-full", cl::Hidden, cl::desc("Compute the full inline cost of a call site even when the cost " "exceeds the threshold."))
#define DEBUG_PRINT_STAT(x)
static cl::opt< bool > InlineEnableCostBenefitAnalysis("inline-enable-cost-benefit-analysis", cl::Hidden, cl::init(false), cl::desc("Enable the cost-benefit analysis for the inliner"))
static cl::opt< int > InstrCost("inline-instr-cost", cl::Hidden, cl::init(5), cl::desc("Cost of a single instruction when inlining"))
static cl::opt< int > InlineSizeAllowance("inline-size-allowance", cl::Hidden, cl::init(100), cl::desc("The maximum size of a callee that get's " "inlined without sufficient cycle savings"))
static bool functionsHaveCompatibleAttributes(Function *Caller, Function *Callee, TargetTransformInfo &TTI, function_ref< const TargetLibraryInfo &(Function &)> &GetTLI)
Test that there are no attribute conflicts between Caller and Callee that prevent inlining.
#define DEBUG_TYPE
Definition: InlineCost.cpp:52
static cl::opt< int > HotCallSiteRelFreq("hot-callsite-rel-freq", cl::Hidden, cl::init(60), cl::desc("Minimum block frequency, expressed as a multiple of caller's " "entry frequency, for a callsite to be hot in the absence of " "profile information."))
static cl::opt< int > ColdCallSiteRelFreq("cold-callsite-rel-freq", cl::Hidden, cl::init(2), cl::desc("Maximum block frequency, expressed as a percentage of caller's " "entry frequency, for a callsite to be cold in the absence of " "profile information."))
static cl::opt< bool > DisableGEPConstOperand("disable-gep-const-evaluation", cl::Hidden, cl::init(false), cl::desc("Disables evaluation of GetElementPtr with constant operands"))
static cl::opt< int > DefaultThreshold("inlinedefault-threshold", cl::Hidden, cl::init(225), cl::desc("Default amount of inlining to perform"))
static Constant * getFalse(Type *Ty)
For a boolean type or a vector of boolean type, return false or a vector with every element false.
Statically lint checks LLVM IR
Definition: Lint.cpp:746
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
#define P(N)
FunctionAnalysisManager FAM
@ SI
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:40
This pass exposes codegen information to IR-level passes.
Value * RHS
Value * LHS
Class for arbitrary precision integers.
Definition: APInt.h:75
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1002
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:177
an instruction to allocate memory on the stack
Definition: Instructions.h:58
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:620
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:774
This class represents an incoming formal argument to a Function.
Definition: Argument.h:28
virtual void emitInstructionAnnot(const Instruction *, formatted_raw_ostream &)
emitInstructionAnnot - This may be implemented to emit a string right before an instruction is emitte...
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
StringRef getValueAsString() const
Return the attribute's value as a string.
Definition: Attributes.cpp:317
AttrKind
This enumeration lists the attributes that can be associated with parameters, function results,...
Definition: Attributes.h:87
bool isValid() const
Return true if the attribute is any kind of attribute.
Definition: Attributes.h:187
LLVM Basic Block Representation.
Definition: BasicBlock.h:56
bool empty() const
Definition: BasicBlock.h:334
bool hasAddressTaken() const
Returns true if there are any uses of this basic block other than direct branches,...
Definition: BasicBlock.h:504
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:112
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:127
This class represents a no-op cast from one type to another.
static BlockAddress * get(Function *F, BasicBlock *BB)
Return a BlockAddress for the specified function and basic block.
Definition: Constants.cpp:1769
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
std::optional< uint64_t > getBlockProfileCount(const BasicBlock *BB, bool AllowSynthetic=false) const
Returns the estimated profile count of BB.
BlockFrequency getBlockFreq(const BasicBlock *BB) const
getblockFreq - Return block frequency.
uint64_t getFrequency() const
Returns the frequency as a fixpoint number scaled by the entry frequency.
Conditional or Unconditional Branch instruction.
bool isConditional() const
BasicBlock * getSuccessor(unsigned i) const
bool isUnconditional() const
Value * getCondition() const
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1186
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1408
bool paramHasAttr(unsigned ArgNo, Attribute::AttrKind Kind) const
Determine whether the argument or parameter has the given attribute.
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Definition: InstrTypes.h:1328
Attribute getFnAttr(StringRef Kind) const
Get the attribute of a given kind for the function.
Definition: InstrTypes.h:1627
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1353
User::op_iterator arg_end()
Return the iterator pointing to the end of the argument list.
Definition: InstrTypes.h:1334
FunctionType * getFunctionType() const
Definition: InstrTypes.h:1266
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition: InstrTypes.h:428
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:708
@ ICMP_NE
not equal
Definition: InstrTypes.h:740
static Constant * getSub(Constant *C1, Constant *C2, bool HasNUW=false, bool HasNSW=false)
Definition: Constants.cpp:2621
static Constant * getICmp(unsigned short pred, Constant *LHS, Constant *RHS, bool OnlyIfReduced=false)
get* - Return some common constants without having to specify the full Instruction::OPCODE identifier...
Definition: Constants.cpp:2480
This is the shared class of boolean and integer constants.
Definition: Constants.h:78
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:833
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:197
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:888
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:145
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:136
This is an important base class in LLVM.
Definition: Constant.h:41
bool isAllOnesValue() const
Return true if this is the value that would be returned by getAllOnesValue.
Definition: Constants.cpp:93
bool isNullValue() const
Return true if this is the value that would be returned by getNullValue.
Definition: Constants.cpp:76
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
bool erase(const KeyT &Val)
Definition: DenseMap.h:315
unsigned size() const
Definition: DenseMap.h:99
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:151
iterator end()
Definition: DenseMap.h:84
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:145
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:166
This instruction extracts a struct member or array element value from an aggregate value.
Type * getReturnType() const
Definition: DerivedTypes.h:124
Class to represent profile counts.
Definition: Function.h:252
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:940
bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition: Globals.cpp:275
Indirect Branch Instruction.
Represents the cost of inlining a function.
Definition: InlineCost.h:89
static InlineCost getNever(const char *Reason, std::optional< CostBenefitPair > CostBenefit=std::nullopt)
Definition: InlineCost.h:130
static InlineCost getAlways(const char *Reason, std::optional< CostBenefitPair > CostBenefit=std::nullopt)
Definition: InlineCost.h:125
static InlineCost get(int Cost, int Threshold, int StaticBonus=0)
Definition: InlineCost.h:119
InlineResult is basically true or false.
Definition: InlineCost.h:179
static InlineResult success()
Definition: InlineCost.h:184
static InlineResult failure(const char *Reason)
Definition: InlineCost.h:185
bool isSuccess() const
Definition: InlineCost.h:188
const char * getFailureReason() const
Definition: InlineCost.h:189
This instruction inserts a struct field of array element value into an aggregate value.
Base class for instruction visitors.
Definition: InstVisitor.h:78
RetTy visitIndirectBrInst(IndirectBrInst &I)
Definition: InstVisitor.h:235
RetTy visitCmpInst(CmpInst &I)
Definition: InstVisitor.h:262
RetTy visitCallBase(CallBase &I)
Definition: InstVisitor.h:267
RetTy visitCleanupReturnInst(CleanupReturnInst &I)
Definition: InstVisitor.h:244
RetTy visitUnreachableInst(UnreachableInst &I)
Definition: InstVisitor.h:241
RetTy visitSwitchInst(SwitchInst &I)
Definition: InstVisitor.h:232
void visit(Iterator Start, Iterator End)
Definition: InstVisitor.h:87
RetTy visitReturnInst(ReturnInst &I)
Definition: InstVisitor.h:226
RetTy visitBinaryOperator(BinaryOperator &I)
Definition: InstVisitor.h:261
RetTy visitResumeInst(ResumeInst &I)
Definition: InstVisitor.h:238
RetTy visitCatchReturnInst(CatchReturnInst &I)
Definition: InstVisitor.h:247
RetTy visitCastInst(CastInst &I)
Definition: InstVisitor.h:259
RetTy visitBranchInst(BranchInst &I)
Definition: InstVisitor.h:229
RetTy visitSelectInst(SelectInst &I)
Definition: InstVisitor.h:189
void visitInstruction(Instruction &I)
Definition: InstVisitor.h:280
unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const BasicBlock * getParent() const
Definition: Instruction.h:90
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:74
BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
This class represents a cast from an integer to a pointer.
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
Invoke instruction.
An instruction for reading from memory.
Definition: Instructions.h:177
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:547
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition: Operator.h:41
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Class to represent pointers.
Definition: DerivedTypes.h:632
unsigned getAddressSpace() const
Return the address space of the Pointer type.
Definition: DerivedTypes.h:682
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:152
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:158
Analysis providing profile information.
This class represents a cast from a pointer to an integer.
Resume the propagation of an exception.
Return a value (possibly void), from a function.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:51
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:88
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:152
size_type size() const
Definition: SmallPtrSet.h:93
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:344
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:383
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:365
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:450
bool empty() const
Definition: SmallVector.h:94
void reserve(size_type N)
Definition: SmallVector.h:667
void push_back(const T &Elt)
Definition: SmallVector.h:416
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1200
An instruction for storing to memory.
Definition: Instructions.h:301
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition: StringRef.h:468
Used to lazily calculate structure layout information for a target machine, based on the DataLayout s...
Definition: DataLayout.h:623
uint64_t getElementOffset(unsigned Idx) const
Definition: DataLayout.h:653
Class to represent struct types.
Definition: DerivedTypes.h:213
Multiway switch.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, unsigned &JTSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) const
@ TCK_SizeAndLatency
The weighted sum of size and latency.
unsigned adjustInliningThreshold(const CallBase *CB) const
bool isLoweredToCall(const Function *F) const
Test whether calls to a function lower to actual program function calls.
unsigned getInliningThresholdMultiplier() const
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Free
Expected to fold away in lowering.
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
InstructionCost getFPOpCost(Type *Ty) const
Return the expected cost of supporting the floating point operation of the specified type.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
This function has undefined behavior.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:994
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:308
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
bool erase(const ValueT &V)
Definition: DenseSet.h:101
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:97
formatted_raw_ostream - A raw_ostream that wraps another one and keeps track of line and column posit...
An efficient, type-erasing, non-owning reference to a callable.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
bool areInlineCompatible(const Function &Caller, const Function &Callee)
@ Cold
Attempts to make code in the caller as efficient as possible under the assumption that the call is no...
Definition: CallingConv.h:47
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
const int ColdccPenalty
Definition: InlineCost.h:51
const char FunctionInlineCostMultiplierAttributeName[]
Definition: InlineCost.h:59
const int OptSizeThreshold
Use when optsize (-Os) is specified.
Definition: InlineCost.h:38
const int OptMinSizeThreshold
Use when minsize (-Oz) is specified.
Definition: InlineCost.h:41
const int LoopPenalty
Definition: InlineCost.h:49
const uint64_t MaxSimplifiedDynamicAllocaToInline
Do not inline dynamic allocas that have been constant propagated to be static allocas above this amou...
Definition: InlineCost.h:57
const int IndirectCallThreshold
Definition: InlineCost.h:48
const int OptAggressiveThreshold
Use when -O3 is specified.
Definition: InlineCost.h:44
const char MaxInlineStackSizeAttributeName[]
Definition: InlineCost.h:62
const int LastCallToStaticBonus
Definition: InlineCost.h:50
const unsigned TotalAllocaSizeRecursiveCaller
Do not inline functions which allocate this many bytes on the stack when the caller is recursive.
Definition: InlineCost.h:54
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:76
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
@ Dead
Unused definition.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:445
DiagnosticInfoOptimizationBase::Argument NV
@ FalseVal
Definition: TGLexer.h:62
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
@ Offset
Definition: DWP.cpp:406
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1819
Constant * ConstantFoldSelectInstruction(Constant *Cond, Constant *V1, Constant *V2)
Attempt to constant fold a select instruction with the specified operands.
bool isAssumeLikeIntrinsic(const Instruction *I)
Return true if it is an intrinsic that cannot be speculated but also cannot trap.
bool canConstantFoldCallTo(const CallBase *Call, const Function *F)
canConstantFoldCallTo - Return true if its even possible to fold a call to the specified function.
Function::ProfileCount ProfileCount
std::optional< int > getStringFnAttrAsInt(CallBase &CB, StringRef AttrKind)
Definition: InlineCost.cpp:175
auto successors(const MachineBasicBlock *BB)
int getCallsiteCost(const CallBase &Call, const DataLayout &DL)
Return the cost associated with a callsite, including parameter passing and the call/return instructi...
Value * lowerObjectSizeCall(IntrinsicInst *ObjectSize, const DataLayout &DL, const TargetLibraryInfo *TLI, bool MustSucceed)
Try to turn a call to @llvm.objectsize into an integer value of the given Type.
std::optional< InlineCostFeatures > getInliningCostFeatures(CallBase &Call, TargetTransformInfo &CalleeTTI, function_ref< AssumptionCache &(Function &)> GetAssumptionCache, function_ref< BlockFrequencyInfo &(Function &)> GetBFI=nullptr, ProfileSummaryInfo *PSI=nullptr, OptimizationRemarkEmitter *ORE=nullptr)
Get the expanded cost features.
gep_type_iterator gep_type_end(const User *GEP)
Constant * ConstantFoldCall(const CallBase *Call, Function *F, ArrayRef< Constant * > Operands, const TargetLibraryInfo *TLI=nullptr)
ConstantFoldCall - Attempt to constant fold a call to the specified function with the specified argum...
InlineResult isInlineViable(Function &Callee)
Minimal filter to detect invalid constructs for inlining.
InlineCost getInlineCost(CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI, function_ref< AssumptionCache &(Function &)> GetAssumptionCache, function_ref< const TargetLibraryInfo &(Function &)> GetTLI, function_ref< BlockFrequencyInfo &(Function &)> GetBFI=nullptr, ProfileSummaryInfo *PSI=nullptr, OptimizationRemarkEmitter *ORE=nullptr)
Get an InlineCost object representing the cost of inlining this callsite.
Value * simplifyFNegInst(Value *Op, FastMathFlags FMF, const SimplifyQuery &Q)
Given operand for an FNeg, fold the result or return null.
Constant * ConstantFoldInstOperands(Instruction *I, ArrayRef< Constant * > Ops, const DataLayout &DL, const TargetLibraryInfo *TLI=nullptr)
ConstantFoldInstOperands - Attempt to constant fold an instruction with the specified operands.
std::enable_if_t< std::is_unsigned< T >::value, T > SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed=nullptr)
Multiply two unsigned integers, X and Y, and add the unsigned integer, A to the product.
Definition: MathExtras.h:658
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
std::array< int, static_cast< size_t >(InlineCostFeatureIndex::NumberOfFeatures)> InlineCostFeatures
TargetTransformInfo TTI
std::optional< InlineResult > getAttributeBasedInliningDecision(CallBase &Call, Function *Callee, TargetTransformInfo &CalleeTTI, function_ref< const TargetLibraryInfo &(Function &)> GetTLI)
Returns InlineResult::success() if the call site should be always inlined because of user directives,...
Value * simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a BinaryOperator, fold the result or return null.
Value * simplifyInstruction(Instruction *I, const SimplifyQuery &Q, OptimizationRemarkEmitter *ORE=nullptr)
See if we can compute a simplified version of this instruction.
InlineParams getInlineParams()
Generate the parameters to tune the inline cost analysis based only on the commandline options.
std::optional< int > getInliningCostEstimate(CallBase &Call, TargetTransformInfo &CalleeTTI, function_ref< AssumptionCache &(Function &)> GetAssumptionCache, function_ref< BlockFrequencyInfo &(Function &)> GetBFI=nullptr, ProfileSummaryInfo *PSI=nullptr, OptimizationRemarkEmitter *ORE=nullptr)
Get the cost estimate ignoring thresholds.
gep_type_iterator gep_type_begin(const User *GEP)
auto predecessors(const MachineBasicBlock *BB)
std::enable_if_t< std::is_unsigned< T >::value, T > SaturatingAdd(T X, T Y, bool *ResultOverflowed=nullptr)
Add two unsigned integers, X and Y, of type T.
Definition: MathExtras.h:583
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:70
PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM)
Thresholds to tune inline cost analysis.
Definition: InlineCost.h:205
std::optional< int > OptMinSizeThreshold
Threshold to use when the caller is optimized for minsize.
Definition: InlineCost.h:219
std::optional< int > OptSizeThreshold
Threshold to use when the caller is optimized for size.
Definition: InlineCost.h:216
std::optional< int > ColdCallSiteThreshold
Threshold to use when the callsite is considered cold.
Definition: InlineCost.h:229
std::optional< int > ColdThreshold
Threshold to use for cold callees.
Definition: InlineCost.h:213
std::optional< int > HotCallSiteThreshold
Threshold to use when the callsite is considered hot.
Definition: InlineCost.h:222
std::optional< bool > AllowRecursiveCall
Indicate whether we allow inlining for recursive call.
Definition: InlineCost.h:238
int DefaultThreshold
The default threshold to start with for a callee.
Definition: InlineCost.h:207
std::optional< int > HintThreshold
Threshold to use for callees with inline hint.
Definition: InlineCost.h:210
std::optional< int > LocallyHotCallSiteThreshold
Threshold to use when the callsite is considered hot relative to function entry.
Definition: InlineCost.h:226