63#define DEBUG_TYPE "partial-inlining"
66 "Number of callsites functions partially inlined into.");
67STATISTIC(NumColdOutlinePartialInlined,
"Number of times functions with "
68 "cold outlined regions were partially "
69 "inlined into its caller(s).");
71 "Number of cold single entry/exit regions found.");
73 "Number of cold single entry/exit regions outlined.");
83 cl::desc(
"Disable multi-region partial inlining"));
89 cl::desc(
"Force outline regions with live exits"));
95 cl::desc(
"Mark outline function calls with ColdCC"));
108 cl::desc(
"Minimum ratio comparing relative sizes of each "
109 "outline candidate and original function"));
114 cl::desc(
"Minimum block executions to consider "
115 "its BranchProbabilityInfo valid"));
120 cl::desc(
"Minimum BranchProbability to consider a region cold."));
124 cl::desc(
"Max number of blocks to be partially inlined"));
130 cl::desc(
"Max number of partial inlining. The default is unlimited"));
138 cl::desc(
"Relative frequency of outline region to "
143 cl::desc(
"A debug option to add additional penalty to the computed one."));
147struct FunctionOutliningInfo {
148 FunctionOutliningInfo() =
default;
152 unsigned getNumInlinedBlocks()
const {
return Entries.size() + 1; }
168struct FunctionOutliningMultiRegionInfo {
169 FunctionOutliningMultiRegionInfo() =
default;
172 struct OutlineRegionInfo {
174 BasicBlock *ExitBlock, BasicBlock *ReturnBlock)
175 : Region(Region), EntryBlock(EntryBlock), ExitBlock(ExitBlock),
176 ReturnBlock(ReturnBlock) {}
177 SmallVector<BasicBlock *, 8> Region;
186struct PartialInlinerImpl {
189 function_ref<AssumptionCache &(Function &)> GetAC,
190 function_ref<AssumptionCache *(Function &)> LookupAC,
191 function_ref<TargetTransformInfo &(Function &)> GTTI,
192 function_ref<
const TargetLibraryInfo &(Function &)> GTLI,
193 ProfileSummaryInfo &ProfSI,
194 function_ref<BlockFrequencyInfo &(Function &)> GBFI =
nullptr)
195 : GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC),
196 GetTTI(GTTI), GetBFI(GBFI), GetTLI(GTLI), PSI(ProfSI) {}
206 std::pair<bool, Function *> unswitchFunction(Function &
F);
212 struct FunctionCloner {
215 FunctionCloner(Function *
F, FunctionOutliningInfo *OI,
216 OptimizationRemarkEmitter &ORE,
217 function_ref<AssumptionCache *(Function &)> LookupAC,
218 function_ref<TargetTransformInfo &(Function &)> GetTTI);
219 FunctionCloner(Function *
F, FunctionOutliningMultiRegionInfo *OMRI,
220 OptimizationRemarkEmitter &ORE,
221 function_ref<AssumptionCache *(Function &)> LookupAC,
222 function_ref<TargetTransformInfo &(Function &)> GetTTI);
229 void normalizeReturnBlock()
const;
232 bool doMultiRegionFunctionOutlining();
239 Function *doSingleRegionFunctionOutlining();
244 typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;
250 bool IsFunctionInlined =
false;
254 std::unique_ptr<FunctionOutliningInfo> ClonedOI =
nullptr;
256 std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI =
nullptr;
257 std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI =
nullptr;
258 OptimizationRemarkEmitter &ORE;
259 function_ref<AssumptionCache *(
Function &)> LookupAC;
260 function_ref<TargetTransformInfo &(
Function &)> GetTTI;
264 int NumPartialInlining = 0;
265 function_ref<AssumptionCache &(
Function &)> GetAssumptionCache;
266 function_ref<AssumptionCache *(
Function &)> LookupAssumptionCache;
267 function_ref<TargetTransformInfo &(
Function &)> GetTTI;
268 function_ref<BlockFrequencyInfo &(
Function &)> GetBFI;
269 function_ref<
const TargetLibraryInfo &(
Function &)> GetTLI;
270 ProfileSummaryInfo &PSI;
277 getOutliningCallBBRelativeFreq(FunctionCloner &Cloner)
const;
281 bool shouldPartialInline(CallBase &CB, FunctionCloner &Cloner,
282 BlockFrequency WeightedOutliningRcost,
283 OptimizationRemarkEmitter &ORE)
const;
288 bool tryPartialInline(FunctionCloner &Cloner);
293 computeCallsiteToProfCountMap(Function *DuplicateFunction,
294 DenseMap<User *, uint64_t> &SiteCountMap)
const;
296 bool isLimitReached()
const {
301 static CallBase *getSupportedCallBase(User *U) {
308 static CallBase *getOneCallSiteTo(Function &
F) {
310 return getSupportedCallBase(User);
313 std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function &
F)
const {
314 CallBase *CB = getOneCallSiteTo(
F);
317 return std::make_tuple(DLoc,
Block);
326 std::tuple<InstructionCost, InstructionCost>
327 computeOutliningCosts(FunctionCloner &Cloner)
const;
333 TargetTransformInfo *
TTI);
335 std::unique_ptr<FunctionOutliningInfo>
336 computeOutliningInfo(Function &
F)
const;
338 std::unique_ptr<FunctionOutliningMultiRegionInfo>
339 computeOutliningColdRegionsInfo(Function &
F,
340 OptimizationRemarkEmitter &ORE)
const;
345std::unique_ptr<FunctionOutliningMultiRegionInfo>
346PartialInlinerImpl::computeOutliningColdRegionsInfo(
352 BranchProbabilityInfo BPI(
F, LI);
353 std::unique_ptr<BlockFrequencyInfo> ScopedBFI;
354 BlockFrequencyInfo *BFI;
356 ScopedBFI.reset(
new BlockFrequencyInfo(
F, BPI, LI));
357 BFI = ScopedBFI.get();
362 if (!PSI.hasInstrumentationProfile())
363 return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
365 std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
366 std::make_unique<FunctionOutliningMultiRegionInfo>();
369 [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
371 for (
auto *
Block : BlockList) {
376 return OptimizationRemarkMissed(
DEBUG_TYPE,
"MultiExitRegion",
378 <<
"Region dominated by "
379 <<
ore::NV(
"Block", BlockList.front()->getName())
380 <<
" has more than one region exit edge.";
398 TargetTransformInfo *FTTI = &GetTTI(
F);
401 OverallFunctionCost += computeBBInlineCost(&BB, FTTI);
403 LLVM_DEBUG(
dbgs() <<
"OverallFunctionCost = " << OverallFunctionCost
409 BranchProbability MinBranchProbability(
412 bool ColdCandidateFound =
false;
414 std::vector<BasicBlock *> DFS;
415 SmallPtrSet<BasicBlock *, 8> VisitedSet;
416 DFS.push_back(CurrEntry);
417 VisitedSet.
insert(CurrEntry);
425 while (!DFS.empty()) {
426 auto *ThisBB = DFS.back();
431 if (PSI.isColdBlock(ThisBB, BFI) ||
435 if (!VisitedSet.
insert(*SI).second)
439 BranchProbability SuccProb = BPI.getEdgeProbability(ThisBB, *SI);
440 if (SuccProb > MinBranchProbability)
443 LLVM_DEBUG(
dbgs() <<
"Found cold edge: " << ThisBB->getName() <<
"->"
445 <<
"\nBranch Probability = " << SuccProb <<
"\n";);
447 SmallVector<BasicBlock *, 8> DominateVector;
448 DT.getDescendants(*SI, DominateVector);
450 "SI should be reachable and have at least itself as descendant");
453 if (!DominateVector.
front()->hasNPredecessors(1)) {
455 <<
" doesn't have a single predecessor in the "
456 "dominator tree\n";);
462 if (!(ExitBlock = IsSingleExit(DominateVector))) {
464 <<
" doesn't have a unique successor\n";);
469 for (
auto *BB : DominateVector)
470 OutlineRegionCost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
477 return OptimizationRemarkAnalysis(
DEBUG_TYPE,
"TooCostly",
480 <<
" inline cost-savings smaller than "
481 <<
ore::NV(
"Cost", MinOutlineRegionCost);
484 LLVM_DEBUG(
dbgs() <<
"ABORT: Outline region cost is smaller than "
485 << MinOutlineRegionCost <<
"\n";);
497 FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(
498 DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);
499 OutliningInfo->ORI.push_back(RegInfo);
501 << DominateVector.front()->getName() <<
"\n";);
502 ColdCandidateFound =
true;
503 NumColdRegionsFound++;
507 if (ColdCandidateFound)
508 return OutliningInfo;
510 return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
513std::unique_ptr<FunctionOutliningInfo>
514PartialInlinerImpl::computeOutliningInfo(Function &
F)
const {
518 return std::unique_ptr<FunctionOutliningInfo>();
531 if (IsReturnBlock(Succ1))
532 return std::make_tuple(Succ1, Succ2);
533 if (IsReturnBlock(Succ2))
534 return std::make_tuple(Succ2, Succ1);
536 return std::make_tuple<BasicBlock *, BasicBlock *>(
nullptr,
nullptr);
541 if (IsSuccessor(Succ1, Succ2))
542 return std::make_tuple(Succ1, Succ2);
543 if (IsSuccessor(Succ2, Succ1))
544 return std::make_tuple(Succ2, Succ1);
546 return std::make_tuple<BasicBlock *, BasicBlock *>(
nullptr,
nullptr);
549 std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
550 std::make_unique<FunctionOutliningInfo>();
553 bool CandidateFound =
false;
568 std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
571 OutliningInfo->Entries.push_back(CurrEntry);
572 OutliningInfo->ReturnBlock = ReturnBlock;
573 OutliningInfo->NonReturnBlock = NonReturnBlock;
574 CandidateFound =
true;
579 std::tie(CommSucc,
OtherSucc) = GetCommonSucc(Succ1, Succ2);
584 OutliningInfo->Entries.push_back(CurrEntry);
589 return std::unique_ptr<FunctionOutliningInfo>();
593 assert(OutliningInfo->Entries[0] == &
F.front() &&
594 "Function Entry must be the first in Entries vector");
599 auto HasNonEntryPred = [Entries](
BasicBlock *BB) {
601 if (!Entries.count(Pred))
606 auto CheckAndNormalizeCandidate =
607 [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
608 for (BasicBlock *
E : OutliningInfo->Entries) {
610 if (Entries.count(Succ))
612 if (Succ == OutliningInfo->ReturnBlock)
613 OutliningInfo->ReturnBlockPreds.push_back(
E);
614 else if (Succ != OutliningInfo->NonReturnBlock)
618 if (HasNonEntryPred(
E))
624 if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
625 return std::unique_ptr<FunctionOutliningInfo>();
630 BasicBlock *Cand = OutliningInfo->NonReturnBlock;
634 if (HasNonEntryPred(Cand))
641 std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
642 if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
649 OutliningInfo->Entries.push_back(Cand);
650 OutliningInfo->NonReturnBlock = NonReturnBlock;
651 OutliningInfo->ReturnBlockPreds.push_back(Cand);
652 Entries.insert(Cand);
655 return OutliningInfo;
660 if (
F.hasProfileData())
663 for (
auto *
E : OI.Entries) {
671BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(
672 FunctionCloner &Cloner)
const {
673 BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
675 Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
676 auto OutliningCallFreq =
677 Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
681 if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency())
682 OutliningCallFreq = EntryFreq;
685 OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());
688 return OutlineRegionRelFreq;
702 if (OutlineRegionRelFreq < BranchProbability(45, 100))
703 return OutlineRegionRelFreq;
705 OutlineRegionRelFreq = std::max(
708 return OutlineRegionRelFreq;
711bool PartialInlinerImpl::shouldPartialInline(
712 CallBase &CB, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
713 OptimizationRemarkEmitter &ORE)
const {
717 assert(Callee == Cloner.ClonedFunc);
723 auto &CalleeTTI = GetTTI(*Callee);
724 bool RemarksEnabled =
725 Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(
729 GetTLI, GetBFI, &PSI, RemarksEnabled ? &ORE :
nullptr);
733 return OptimizationRemarkAnalysis(
DEBUG_TYPE,
"AlwaysInline", &CB)
734 <<
NV(
"Callee", Cloner.OrigFunc)
735 <<
" should always be fully inlined, not partially";
742 return OptimizationRemarkMissed(
DEBUG_TYPE,
"NeverInline", &CB)
743 <<
NV(
"Callee", Cloner.OrigFunc) <<
" not partially inlined into "
744 <<
NV(
"Caller", Caller)
745 <<
" because it should never be inlined (cost=never)";
752 return OptimizationRemarkAnalysis(
DEBUG_TYPE,
"TooCostly", &CB)
753 <<
NV(
"Callee", Cloner.OrigFunc) <<
" not partially inlined into "
754 <<
NV(
"Caller", Caller) <<
" because too costly to inline (cost="
755 <<
NV(
"Cost", IC.
getCost()) <<
", threshold="
760 const DataLayout &
DL =
Caller->getDataLayout();
764 BlockFrequency NormWeightedSavings(NonWeightedSavings);
767 if (NormWeightedSavings < WeightedOutliningRcost) {
769 return OptimizationRemarkAnalysis(
DEBUG_TYPE,
"OutliningCallcostTooHigh",
771 <<
NV(
"Callee", Cloner.OrigFunc) <<
" not partially inlined into "
772 <<
NV(
"Caller", Caller) <<
" runtime overhead (overhead="
773 <<
NV(
"Overhead", (
unsigned)WeightedOutliningRcost.
getFrequency())
775 <<
NV(
"Savings", (
unsigned)NormWeightedSavings.getFrequency())
777 <<
" of making the outlined call is too high";
784 return OptimizationRemarkAnalysis(
DEBUG_TYPE,
"CanBePartiallyInlined", &CB)
785 <<
NV(
"Callee", Cloner.OrigFunc) <<
" can be partially inlined into "
786 <<
NV(
"Caller", Caller) <<
" with cost=" <<
NV(
"Cost", IC.
getCost())
797PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB,
798 TargetTransformInfo *
TTI) {
802 for (Instruction &
I : *BB) {
804 switch (
I.getOpcode()) {
805 case Instruction::BitCast:
806 case Instruction::PtrToInt:
807 case Instruction::IntToPtr:
808 case Instruction::Alloca:
809 case Instruction::PHI:
811 case Instruction::GetElementPtr:
819 if (
I.isLifetimeStartOrEnd())
830 FMF = FPMO->getFastMathFlags();
832 IntrinsicCostAttributes ICA(IID,
II->getType(), Tys, FMF);
857std::tuple<InstructionCost, InstructionCost>
858PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner)
const {
860 for (
auto FuncBBPair : Cloner.OutlinedFunctions) {
861 Function *OutlinedFunc = FuncBBPair.first;
862 BasicBlock* OutliningCallBB = FuncBBPair.second;
865 auto *OutlinedFuncTTI = &GetTTI(*OutlinedFunc);
866 OutliningFuncCallCost +=
867 computeBBInlineCost(OutliningCallBB, OutlinedFuncTTI);
870 for (BasicBlock &BB : *OutlinedFunc)
871 OutlinedFunctionCost += computeBBInlineCost(&BB, OutlinedFuncTTI);
873 assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
874 "Outlined function cost should be no less than the outlined region");
879 OutlinedFunctionCost -=
883 OutliningFuncCallCost +
884 (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
887 return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
893void PartialInlinerImpl::computeCallsiteToProfCountMap(
894 Function *DuplicateFunction,
895 DenseMap<User *, uint64_t> &CallSiteToProfCountMap)
const {
899 std::unique_ptr<BlockFrequencyInfo> TempBFI;
900 BlockFrequencyInfo *CurrentCallerBFI =
nullptr;
905 DominatorTree DT(*Caller);
907 BranchProbabilityInfo BPI(*Caller, LI);
908 TempBFI.reset(
new BlockFrequencyInfo(*Caller, BPI, LI));
909 CurrentCallerBFI = TempBFI.get();
912 CurrentCallerBFI = &(GetBFI(*Caller));
916 for (User *User :
Users) {
917 CallBase *CB = getSupportedCallBase(User);
919 if (CurrentCaller != Caller) {
921 ComputeCurrBFI(Caller);
923 assert(CurrentCallerBFI &&
"CallerBFI is not set");
930 CallSiteToProfCountMap[
User] = 0;
934PartialInlinerImpl::FunctionCloner::FunctionCloner(
935 Function *
F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE,
936 function_ref<AssumptionCache *(Function &)> LookupAC,
937 function_ref<TargetTransformInfo &(Function &)> GetTTI)
938 : OrigFunc(
F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
939 ClonedOI = std::make_unique<FunctionOutliningInfo>();
952 ClonedOI->ReturnBlockPreds.push_back(NewE);
956 F->replaceAllUsesWith(ClonedFunc);
959PartialInlinerImpl::FunctionCloner::FunctionCloner(
960 Function *
F, FunctionOutliningMultiRegionInfo *OI,
964 : OrigFunc(
F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
965 ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>();
973 for (
const FunctionOutliningMultiRegionInfo::OutlineRegionInfo &
RegionInfo :
984 FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(
985 Region, NewEntryBlock, NewExitBlock, NewReturnBlock);
986 ClonedOMRI->ORI.push_back(MappedRegionInfo);
990 F->replaceAllUsesWith(ClonedFunc);
993void PartialInlinerImpl::FunctionCloner::normalizeReturnBlock()
const {
996 PHINode *FirstPhi =
nullptr;
997 while (
I != BB->end()) {
1018 BasicBlock *PreReturn = ClonedOI->ReturnBlock;
1020 PHINode *FirstPhi = GetFirstPHI(PreReturn);
1021 unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();
1026 auto IsTrivialPhi = [](PHINode *PN) ->
Value * {
1028 return PN->getIncomingValue(0);
1032 ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
1033 ClonedOI->ReturnBlock->getFirstNonPHIIt());
1036 SmallVector<Instruction *, 4> DeadPhis;
1037 while (
I != PreReturn->
end()) {
1046 Ins = ClonedOI->ReturnBlock->getFirstNonPHIIt();
1049 for (BasicBlock *
E : ClonedOI->ReturnBlockPreds) {
1058 if (
auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
1064 for (
auto *DP : DeadPhis)
1065 DP->eraseFromParent();
1067 for (
auto *
E : ClonedOI->ReturnBlockPreds)
1068 E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
1071bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {
1073 auto ComputeRegionCost =
1076 for (BasicBlock* BB : Region)
1077 Cost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
1081 assert(ClonedOMRI &&
"Expecting OutlineInfo for multi region outline");
1083 if (ClonedOMRI->ORI.empty())
1092 BranchProbabilityInfo BPI(*ClonedFunc, LI);
1093 ClonedFuncBFI.reset(
new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1096 CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1098 SetVector<Value *> Inputs, Outputs, Sinks;
1099 for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1102 ComputeRegionCost(RegionInfo.Region);
1104 CodeExtractor
CE(RegionInfo.Region, &DT,
false,
1105 ClonedFuncBFI.get(), &BPI,
1106 LookupAC(*RegionInfo.EntryBlock->
getParent()),
1112 CE.findInputsOutputs(Inputs, Outputs, Sinks);
1115 dbgs() <<
"inputs: " << Inputs.
size() <<
"\n";
1116 dbgs() <<
"outputs: " << Outputs.
size() <<
"\n";
1117 for (
Value *value : Inputs)
1118 dbgs() <<
"value used in func: " << *value <<
"\n";
1119 for (
Value *output : Outputs)
1120 dbgs() <<
"instr used in func: " << *output <<
"\n";
1127 if (Function *OutlinedFunc =
CE.extractCodeRegion(CEAC)) {
1128 CallBase *OCS = PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc);
1131 OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB));
1132 NumColdRegionsOutlined++;
1133 OutlinedRegionCost += CurrentOutlinedRegionCost;
1141 return OptimizationRemarkMissed(
DEBUG_TYPE,
"ExtractFailed",
1142 &RegionInfo.Region.
front()->front())
1143 <<
"Failed to extract region at block "
1148 return !OutlinedFunctions.empty();
1152PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
1155 auto ToBeInlined = [&,
this](
BasicBlock *BB) {
1156 return BB == ClonedOI->ReturnBlock ||
1160 assert(ClonedOI &&
"Expecting OutlineInfo for single region outline");
1167 BranchProbabilityInfo BPI(*ClonedFunc, LI);
1168 ClonedFuncBFI.reset(
new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1171 std::vector<BasicBlock *> ToExtract;
1172 auto *ClonedFuncTTI = &GetTTI(*ClonedFunc);
1173 ToExtract.push_back(ClonedOI->NonReturnBlock);
1174 OutlinedRegionCost += PartialInlinerImpl::computeBBInlineCost(
1175 ClonedOI->NonReturnBlock, ClonedFuncTTI);
1176 for (BasicBlock *BB :
depth_first(&ClonedFunc->getEntryBlock()))
1177 if (!ToBeInlined(BB) && BB != ClonedOI->NonReturnBlock) {
1178 ToExtract.push_back(BB);
1183 OutlinedRegionCost += computeBBInlineCost(BB, ClonedFuncTTI);
1187 CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1189 CodeExtractor(ToExtract, &DT,
false,
1190 ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc),
1195 .extractCodeRegion(CEAC);
1199 PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc)->
getParent();
1201 OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
1204 return OptimizationRemarkMissed(
DEBUG_TYPE,
"ExtractFailed",
1205 &ToExtract.front()->front())
1206 <<
"Failed to extract region at block "
1207 <<
ore::NV(
"Block", ToExtract.front());
1210 return OutlinedFunc;
1213PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
1217 ClonedFunc->eraseFromParent();
1218 if (!IsFunctionInlined) {
1221 for (
auto FuncBBPair : OutlinedFunctions) {
1223 Func->eraseFromParent();
1228std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function &
F) {
1229 if (
F.hasAddressTaken())
1230 return {
false,
nullptr};
1233 if (
F.hasFnAttribute(Attribute::AlwaysInline))
1234 return {
false,
nullptr};
1236 if (
F.hasFnAttribute(Attribute::NoInline))
1237 return {
false,
nullptr};
1239 if (PSI.isFunctionEntryCold(&
F))
1240 return {
false,
nullptr};
1242 if (
F.users().empty())
1243 return {
false,
nullptr};
1245 OptimizationRemarkEmitter ORE(&
F);
1249 if (PSI.hasProfileSummary() &&
F.hasProfileData() &&
1251 std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
1252 computeOutliningColdRegionsInfo(
F, ORE);
1254 FunctionCloner Cloner(&
F, OMRI.get(), ORE, LookupAssumptionCache, GetTTI);
1257 dbgs() <<
"HotCountThreshold = " << PSI.getHotCountThreshold() <<
"\n";
1258 dbgs() <<
"ColdCountThreshold = " << PSI.getColdCountThreshold()
1262 bool DidOutline = Cloner.doMultiRegionFunctionOutlining();
1266 dbgs() <<
">>>>>> Outlined (Cloned) Function >>>>>>\n";
1267 Cloner.ClonedFunc->print(
dbgs());
1268 dbgs() <<
"<<<<<< Outlined (Cloned) Function <<<<<<\n";
1271 if (tryPartialInline(Cloner))
1272 return {
true,
nullptr};
1280 std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(
F);
1282 return {
false,
nullptr};
1284 FunctionCloner Cloner(&
F, OI.get(), ORE, LookupAssumptionCache, GetTTI);
1285 Cloner.normalizeReturnBlock();
1287 Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();
1289 if (!OutlinedFunction)
1290 return {
false,
nullptr};
1292 if (tryPartialInline(Cloner))
1293 return {
true, OutlinedFunction};
1295 return {
false,
nullptr};
1298bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
1299 if (Cloner.OutlinedFunctions.empty())
1302 auto OutliningCosts = computeOutliningCosts(Cloner);
1308 "Expected valid costs");
1312 BranchProbability RelativeToEntryFreq;
1313 if (Cloner.ClonedOI)
1314 RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
1321 RelativeToEntryFreq = BranchProbability(0, 1);
1323 BlockFrequency WeightedRcost =
1324 BlockFrequency(NonWeightedRcost.
getValue()) * RelativeToEntryFreq;
1331 OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
1334 std::tie(DLoc,
Block) = getOneDebugLoc(*Cloner.ClonedFunc);
1335 OrigFuncORE.emit([&]() {
1336 return OptimizationRemarkAnalysis(
DEBUG_TYPE,
"OutlineRegionTooSmall",
1338 <<
ore::NV(
"Function", Cloner.OrigFunc)
1339 <<
" not partially inlined into callers (Original Size = "
1340 <<
ore::NV(
"OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
1341 <<
", Size of call sequence to outlined function = "
1342 <<
ore::NV(
"NewSize", SizeCost) <<
")";
1347 assert(Cloner.OrigFunc->users().empty() &&
1348 "F's users should all be replaced!");
1350 std::vector<User *>
Users(Cloner.ClonedFunc->user_begin(),
1351 Cloner.ClonedFunc->user_end());
1353 DenseMap<User *, uint64_t> CallSiteToProfCountMap;
1354 auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
1355 if (CalleeEntryCount)
1356 computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);
1358 uint64_t CalleeEntryCountV =
1359 (CalleeEntryCount ? CalleeEntryCount->getCount() : 0);
1361 bool AnyInline =
false;
1362 for (User *User :
Users) {
1363 CallBase *CB = getSupportedCallBase(User);
1365 if (isLimitReached())
1368 OptimizationRemarkEmitter CallerORE(CB->
getCaller());
1369 if (!shouldPartialInline(*CB, Cloner, WeightedRcost, CallerORE))
1374 OptimizationRemark
OR(
DEBUG_TYPE,
"PartiallyInlined", CB);
1375 OR <<
ore::NV(
"Callee", Cloner.OrigFunc) <<
" partially inlined into "
1378 InlineFunctionInfo IFI(GetAssumptionCache, &PSI);
1383 (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
1391 if (CalleeEntryCountV) {
1392 if (
auto It = CallSiteToProfCountMap.
find(User);
1393 It != CallSiteToProfCountMap.
end()) {
1394 uint64_t CallSiteCount = It->second;
1395 CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
1400 NumPartialInlining++;
1402 if (Cloner.ClonedOI)
1403 NumPartialInlined++;
1405 NumColdOutlinePartialInlined++;
1409 Cloner.IsFunctionInlined =
true;
1410 if (CalleeEntryCount)
1411 Cloner.OrigFunc->setEntryCount(Function::ProfileCount(
1412 CalleeEntryCountV, CalleeEntryCount->getType()));
1413 OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
1414 OrigFuncORE.emit([&]() {
1415 return OptimizationRemark(
DEBUG_TYPE,
"PartiallyInlined", Cloner.OrigFunc)
1416 <<
"Partially inlined into at least one caller";
1423bool PartialInlinerImpl::run(
Module &M) {
1427 std::vector<Function *> Worklist;
1428 Worklist.reserve(
M.size());
1429 for (Function &
F : M)
1430 if (!
F.use_empty() && !
F.isDeclaration())
1431 Worklist.push_back(&
F);
1434 while (!Worklist.empty()) {
1435 Function *CurrFunc = Worklist.back();
1436 Worklist.pop_back();
1441 std::pair<bool, Function *>
Result = unswitchFunction(*CurrFunc);
1443 Worklist.push_back(
Result.second);
1476 if (PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
1477 GetTLI, PSI, GetBFI)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static MachineBasicBlock * OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ)
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file contains the simple types necessary to represent the attributes associated with functions a...
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
cl::opt< unsigned > MinBlockCounterExecution
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
This file builds on the ADT/GraphTraits.h file to build generic depth first graph iterator.
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
Module.h This file contains the declarations for the Module class.
iv Induction Variable Users
static cl::opt< int > InstrCost("inline-instr-cost", cl::Hidden, cl::init(5), cl::desc("Cost of a single instruction when inlining"))
Machine Check Debug Module
uint64_t IntrinsicInst * II
static cl::opt< unsigned > MaxNumInlineBlocks("max-num-inline-blocks", cl::init(5), cl::Hidden, cl::desc("Max number of blocks to be partially inlined"))
static cl::opt< int > OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75), cl::Hidden, cl::desc("Relative frequency of outline region to " "the entry block"))
static cl::opt< bool > MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden, cl::desc("Mark outline function calls with ColdCC"))
static cl::opt< float > MinRegionSizeRatio("min-region-size-ratio", cl::init(0.1), cl::Hidden, cl::desc("Minimum ratio comparing relative sizes of each " "outline candidate and original function"))
static cl::opt< bool > DisableMultiRegionPartialInline("disable-mr-partial-inlining", cl::init(false), cl::Hidden, cl::desc("Disable multi-region partial inlining"))
cl::opt< unsigned > MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden, cl::desc("Minimum block executions to consider " "its BranchProbabilityInfo valid"))
static cl::opt< int > MaxNumPartialInlining("max-partial-inlining", cl::init(-1), cl::Hidden, cl::desc("Max number of partial inlining. The default is unlimited"))
static cl::opt< bool > DisablePartialInlining("disable-partial-inlining", cl::init(false), cl::Hidden, cl::desc("Disable partial inlining"))
static bool hasProfileData(const Function &F, const FunctionOutliningInfo &OI)
static cl::opt< float > ColdBranchRatio("cold-branch-ratio", cl::init(0.1), cl::Hidden, cl::desc("Minimum BranchProbability to consider a region cold."))
static cl::opt< bool > ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden, cl::desc("Force outline regions with live exits"))
static cl::opt< unsigned > ExtraOutliningPenalty("partial-inlining-extra-penalty", cl::init(0), cl::Hidden, cl::desc("A debug option to add additional penalty to the computed one."))
static cl::opt< bool > SkipCostAnalysis("skip-partial-inlining-cost-analysis", cl::ReallyHidden, cl::desc("Skip Cost Analysis"))
FunctionAnalysisManager FAM
This file contains the declarations for profiling metadata utility functions.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
const Function * getParent() const
Return the enclosing method, or null if none.
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const Instruction & front() const
LLVM_ABI const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this basic block belongs to.
InstListType::iterator iterator
Instruction iterators...
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Analysis pass which computes BlockFrequencyInfo.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
LLVM_ABI std::optional< uint64_t > getBlockProfileCount(const BasicBlock *BB, bool AllowSynthetic=false) const
Returns the estimated profile count of BB.
uint64_t getFrequency() const
Returns the frequency as a fixpoint number scaled by the entry frequency.
static LLVM_ABI BranchProbability getBranchProbability(uint64_t Numerator, uint64_t Denominator)
void setCallingConv(CallingConv::ID CC)
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
LLVM_ABI Function * getCaller()
Helper to get the caller (the parent function).
Conditional Branch instruction.
iterator find(const_arg_type_t< KeyT > Val)
void recalculate(ParentType &Func)
recalculate - compute a dominator tree for the given function
void setCallingConv(CallingConv::ID CC)
int getCost() const
Get the inline cost estimate.
int getCostDelta() const
Get the cost delta from the threshold for inlining.
auto map(const Function &F) const -> InstructionCost
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void insertBefore(InstListType::iterator InsertPos)
Insert an unlinked instruction into a basic block immediately before the specified position.
A Module instance is used to store all the information related to an LLVM module.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
LLVM_ABI Value * removeIncomingValue(unsigned Idx, bool DeletePHIIfEmpty=true)
Remove an incoming value.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
unsigned getNumIncomingValues() const
Return the number of incoming edges.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
PreservedAnalyses run(Module &M, ModuleAnalysisManager &)
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Analysis providing profile information.
size_type size() const
Determine the number of elements in the SetVector.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ BR
Control flow instructions. These all have token chains.
@ BasicBlock
Various leaf nodes.
LLVM_ABI int getInstrCost()
@ CE
Windows NT (Windows on ARM)
initializer< Ty > init(const Ty &Val)
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
@ User
could "use" a pointer
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< PhiNode * > Phi
NodeAddr< FuncNode * > Func
friend class Instruction
Iterator for Instructions in a `BasicBlock.
This is an optimization pass for GlobalISel generic memory operations.
FunctionAddr VTableAddr Value
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
auto successors(const MachineBasicBlock *BB)
constexpr from_range_t from_range
InnerAnalysisManagerProxy< FunctionAnalysisManager, Module > FunctionAnalysisManagerModuleProxy
Provide the FunctionAnalysisManager to Module proxy.
LLVM_ABI InlineResult InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, bool MergeAttributes=false, AAResults *CalleeAAR=nullptr, bool InsertLifetime=true, bool TrackInlineHistory=false, Function *ForwardVarArgsTo=nullptr, OptimizationRemarkEmitter *ORE=nullptr)
This function inlines the called function into the basic block of the caller.
LLVM_ABI InlineResult isInlineViable(Function &Callee)
Check if it is mechanically possible to inline the function Callee, based on the contents of the func...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
FunctionAddr VTableAddr Count
auto succ_size(const MachineBasicBlock *BB)
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
RNSuccIterator< NodeRef, BlockT, RegionT > succ_begin(NodeRef Node)
LLVM_ABI InlineCost getInlineCost(CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI, function_ref< AssumptionCache &(Function &)> GetAssumptionCache, function_ref< const TargetLibraryInfo &(Function &)> GetTLI, function_ref< BlockFrequencyInfo &(Function &)> GetBFI=nullptr, ProfileSummaryInfo *PSI=nullptr, OptimizationRemarkEmitter *ORE=nullptr, function_ref< EphemeralValuesCache &(Function &)> GetEphValuesCache=nullptr)
Get an InlineCost object representing the cost of inlining this callsite.
RNSuccIterator< NodeRef, BlockT, RegionT > succ_end(NodeRef Node)
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI InlineParams getInlineParams()
Generate the parameters to tune the inline cost analysis based only on the commandline options.
LLVM_ABI int getCallsiteCost(const TargetTransformInfo &TTI, const CallBase &Call, const DataLayout &DL)
Return the cost associated with a callsite, including parameter passing and the call/return instructi...
ValueMap< const Value *, WeakTrackingVH > ValueToValueMapTy
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
iterator_range< df_iterator< T > > depth_first(const T &G)
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list.
LLVM_ABI bool hasBranchWeightMD(const Instruction &I)
Checks if an instructions has Branch Weight Metadata.
LLVM_ABI Function * CloneFunction(Function *F, ValueToValueMapTy &VMap, ClonedCodeInfo *CodeInfo=nullptr)
Return a copy of the specified function and add it to that function's module.
AnalysisManager< Module > ModuleAnalysisManager
Convenience typedef for the Module analysis manager.