Go to the documentation of this file.
61 #define DEBUG_TYPE "hotcoldsplit"
63 STATISTIC(NumColdRegionsFound,
"Number of cold regions found.");
64 STATISTIC(NumColdRegionsOutlined,
"Number of cold regions outlined.");
73 cl::desc(
"Base penalty for splitting cold code (as a "
74 "multiple of TCC_Basic)"));
78 cl::desc(
"Enable placement of extracted cold functions"
79 " into a separate section after hot-cold splitting."));
84 cl::desc(
"Name for the section containing cold functions "
85 "extracted by hot-cold splitting."));
89 cl::desc(
"Maximum number of parameters for a split function"));
104 return !(isa<ReturnInst>(
I) || isa<IndirectBrInst>(
I));
109 if (
BB.isEHPad() || isa<ResumeInst>(
BB.getTerminator()))
115 if (
auto *CB = dyn_cast<CallBase>(&
I))
123 dyn_cast_or_null<CallInst>(
BB.getTerminator()->getPrevNode()))
124 if (CI->hasFnAttr(Attribute::NoReturn))
140 if (
BB.hasAddressTaken() ||
BB.isEHPad())
142 auto Term =
BB.getTerminator();
143 return !isa<InvokeInst>(
Term) && !isa<ResumeInst>(
Term);
150 static bool markFunctionCold(
Function &
F,
bool UpdateEntryCount =
false) {
151 assert(!
F.hasOptNone() &&
"Can't mark this cold");
152 bool Changed =
false;
157 if (!
F.hasFnAttribute(Attribute::MinSize)) {
158 F.addFnAttr(Attribute::MinSize);
161 if (UpdateEntryCount) {
171 class HotColdSplittingLegacyPass :
public ModulePass {
185 bool runOnModule(
Module &M)
override;
191 bool HotColdSplitting::isFunctionCold(
const Function &
F)
const {
206 bool HotColdSplitting::shouldOutlineFrom(
const Function &
F)
const {
207 if (
F.hasFnAttribute(Attribute::AlwaysInline))
210 if (
F.hasFnAttribute(Attribute::NoInline))
215 if (
F.hasFnAttribute(Attribute::NoReturn))
218 if (
F.hasFnAttribute(Attribute::SanitizeAddress) ||
219 F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
220 F.hasFnAttribute(Attribute::SanitizeThread) ||
221 F.hasFnAttribute(Attribute::SanitizeMemory))
235 if (&
I !=
BB->getTerminator())
244 unsigned NumInputs,
unsigned NumOutputs) {
246 LLVM_DEBUG(
dbgs() <<
"Applying penalty for splitting: " << Penalty <<
"\n");
255 bool NoBlocksReturn =
true;
261 NoBlocksReturn &= isa<UnreachableInst>(
BB->getTerminator());
267 NoBlocksReturn =
false;
268 SuccsOutsideRegion.
insert(SuccBB);
278 unsigned NumSplitExitPhis = 0;
279 for (
BasicBlock *ExitBB : SuccsOutsideRegion) {
280 for (
PHINode &PN : ExitBB->phis()) {
282 int NumIncomingVals = 0;
283 for (
unsigned i = 0;
i < PN.getNumIncomingValues(); ++
i)
286 if (NumIncomingVals > 1) {
296 int NumOutputsAndSplitPhis = NumOutputs + NumSplitExitPhis;
297 int NumParams = NumInputs + NumOutputsAndSplitPhis;
299 LLVM_DEBUG(
dbgs() << NumInputs <<
" inputs and " << NumOutputsAndSplitPhis
300 <<
" outputs exceeds parameter limit ("
305 LLVM_DEBUG(
dbgs() <<
"Applying penalty for: " << NumParams <<
" params\n");
306 Penalty += CostForArgMaterialization * NumParams;
310 LLVM_DEBUG(
dbgs() <<
"Applying penalty for: " << NumOutputsAndSplitPhis
311 <<
" outputs/split phis\n");
313 Penalty += CostForRegionOutput * NumOutputsAndSplitPhis;
316 if (NoBlocksReturn) {
318 <<
" non-returning terminators\n");
324 if (SuccsOutsideRegion.size() > 1) {
325 LLVM_DEBUG(
dbgs() <<
"Applying penalty for: " << SuccsOutsideRegion.size()
326 <<
" non-region successors\n");
333 Function *HotColdSplitting::extractColdRegion(
348 CE.findInputsOutputs(Inputs, Outputs, Sinks);
350 int OutliningPenalty =
352 LLVM_DEBUG(
dbgs() <<
"Split profitability: benefit = " << OutliningBenefit
353 <<
", penalty = " << OutliningPenalty <<
"\n");
354 if (!OutliningBenefit.
isValid() || OutliningBenefit <= OutliningPenalty)
358 if (
Function *OutF =
CE.extractCodeRegion(CEAC)) {
361 NumColdRegionsOutlined++;
375 markFunctionCold(*OutF,
BFI !=
nullptr);
381 <<
ore::NV(
"Original", OrigF) <<
" split cold code into "
390 <<
"Failed to extract region at block "
397 using BlockTy = std::pair<BasicBlock *, unsigned>;
404 class OutliningRegion {
416 bool EntireFunctionCold =
false;
419 static unsigned getEntryPointScore(
BasicBlock &
BB,
unsigned Score) {
420 return mayExtractBlock(
BB) ? Score : 0;
425 static constexpr
unsigned ScoreForSuccBlock = 1;
426 static constexpr
unsigned ScoreForSinkBlock = 1;
428 OutliningRegion(
const OutliningRegion &) =
delete;
429 OutliningRegion &operator=(
const OutliningRegion &) =
delete;
432 OutliningRegion() =
default;
433 OutliningRegion(OutliningRegion &&) =
default;
434 OutliningRegion &operator=(OutliningRegion &&) =
default;
436 static std::vector<OutliningRegion> create(
BasicBlock &SinkBB,
439 std::vector<OutliningRegion> Regions;
442 Regions.emplace_back();
443 OutliningRegion *ColdRegion = &Regions.back();
445 auto addBlockToRegion = [&](
BasicBlock *
BB,
unsigned Score) {
447 ColdRegion->Blocks.emplace_back(
BB, Score);
451 unsigned SinkScore = getEntryPointScore(SinkBB, ScoreForSinkBlock);
452 ColdRegion->SuggestedEntryPoint = (SinkScore > 0) ? &SinkBB :
nullptr;
453 unsigned BestScore = SinkScore;
457 auto PredEnd =
idf_end(&SinkBB);
458 while (PredIt != PredEnd) {
460 bool SinkPostDom = PDT.
dominates(&SinkBB, &PredBB);
465 ColdRegion->EntireFunctionCold =
true;
471 if (!SinkPostDom || !mayExtractBlock(PredBB)) {
472 PredIt.skipChildren();
479 unsigned PredScore = getEntryPointScore(PredBB, PredIt.getPathLength());
480 if (PredScore > BestScore) {
481 ColdRegion->SuggestedEntryPoint = &PredBB;
482 BestScore = PredScore;
485 addBlockToRegion(&PredBB, PredScore);
495 if (mayExtractBlock(SinkBB)) {
496 addBlockToRegion(&SinkBB, SinkScore);
498 ColdRegion->EntireFunctionCold =
true;
502 Regions.emplace_back();
503 ColdRegion = &Regions.back();
509 auto SuccEnd =
df_end(&SinkBB);
510 while (SuccIt != SuccEnd) {
512 bool SinkDom = DT.
dominates(&SinkBB, &SuccBB);
515 bool DuplicateBlock = RegionBlocks.
count(&SuccBB);
519 if (DuplicateBlock || !SinkDom || !mayExtractBlock(SuccBB)) {
520 SuccIt.skipChildren();
524 unsigned SuccScore = getEntryPointScore(SuccBB, ScoreForSuccBlock);
525 if (SuccScore > BestScore) {
526 ColdRegion->SuggestedEntryPoint = &SuccBB;
527 BestScore = SuccScore;
530 addBlockToRegion(&SuccBB, SuccScore);
538 bool empty()
const {
return !SuggestedEntryPoint; }
544 bool isEntireFunctionCold()
const {
return EntireFunctionCold; }
548 assert(!
empty() && !isEntireFunctionCold() &&
"Nothing to extract");
555 unsigned NextScore = 0;
556 auto RegionEndIt = Blocks.end();
559 unsigned Score = Block.second;
561 BB == SuggestedEntryPoint || DT.
dominates(SuggestedEntryPoint,
BB);
562 if (!InSubRegion && Score > NextScore) {
566 if (InSubRegion &&
BB != SuggestedEntryPoint)
567 SubRegion.push_back(
BB);
570 Blocks.
erase(RegionStartIt, RegionEndIt);
573 SuggestedEntryPoint = NextEntryPoint;
580 bool HotColdSplitting::outlineColdRegions(
Function &
F,
bool HasProfileSummary) {
581 bool Changed =
false;
595 std::unique_ptr<DominatorTree> DT;
596 std::unique_ptr<PostDominatorTree> PDT;
602 if (HasProfileSummary)
621 dbgs() <<
"Found a cold block:\n";
626 DT = std::make_unique<DominatorTree>(
F);
628 PDT = std::make_unique<PostDominatorTree>(
F);
630 auto Regions = OutliningRegion::create(*
BB, *DT, *PDT);
631 for (OutliningRegion &
Region : Regions) {
635 if (
Region.isEntireFunctionCold()) {
637 return markFunctionCold(
F);
646 return !ColdBlocks.insert(Block.first).second;
652 ++NumColdRegionsFound;
656 if (OutliningWorklist.empty())
660 unsigned OutlinedFunctionID = 1;
665 assert(!
Region.empty() &&
"Empty outlining region in worklist");
669 dbgs() <<
"Hot/cold splitting attempting to outline these blocks:\n";
674 Function *Outlined = extractColdRegion(SubRegion, CEAC, *DT,
BFI,
TTI,
675 ORE, AC, OutlinedFunctionID);
677 ++OutlinedFunctionID;
680 }
while (!
Region.empty());
681 }
while (!OutliningWorklist.empty());
687 bool Changed =
false;
688 bool HasProfileSummary = (
M.getProfileSummary(
false) !=
nullptr);
691 if (
F.isDeclaration())
699 if (isFunctionCold(
F)) {
700 Changed |= markFunctionCold(
F);
704 if (!shouldOutlineFrom(
F)) {
710 Changed |= outlineColdRegions(
F, HasProfileSummary);
715 bool HotColdSplittingLegacyPass::runOnModule(
Module &M) {
719 &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
721 return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
F);
724 return &this->getAnalysis<BlockFrequencyInfoWrapperPass>(
F).getBFI();
726 std::unique_ptr<OptimizationRemarkEmitter> ORE;
733 if (
auto *ACT = getAnalysisIfAvailable<AssumptionCacheTracker>())
734 return ACT->lookupAssumptionCache(
F);
758 std::unique_ptr<OptimizationRemarkEmitter> ORE;
774 "Hot Cold Splitting",
false,
false)
781 return new HotColdSplittingLegacyPass();
A set of analyses that are preserved following a run of a transformation pass.
Analysis pass providing the TargetTransformInfo.
This is an optimization pass for GlobalISel generic memory operations.
iterator erase(const_iterator CI)
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
ModulePass class - This class is used to implement unstructured interprocedural optimizations and ana...
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
std::pair< BasicBlock *, unsigned > BlockTy
A pair of (basic block, score).
Legacy analysis pass which computes BlockFrequencyInfo.
size_type size() const
Determine the number of elements in the SetVector.
StringRef getSection() const
Get the custom section of this global if it has one.
static cl::opt< std::string > ColdSectionName("hotcoldsplit-cold-section-name", cl::init("__llvm_cold"), cl::Hidden, cl::desc("Name for the section containing cold functions " "extracted by hot-cold splitting."))
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
df_iterator< T > df_end(const T &G)
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
FunctionAnalysisManager FAM
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
bool isFunctionEntryCold(const Function *F) const
Returns true if F has cold function entry.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
DiagnosticInfoOptimizationBase::Argument NV
user_iterator user_begin()
auto successors(MachineBasicBlock *BB)
LLVM_NODISCARD T pop_back_val()
bool succ_empty(const Instruction *I)
LLVM Basic Block Representation.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
idf_iterator< T > idf_begin(const T &G)
auto remove_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::remove_if which take ranges instead of having to pass begin/end explicitly.
static cl::opt< int > SplittingThreshold("hotcoldsplit-threshold", cl::init(2), cl::Hidden, cl::desc("Base penalty for splitting cold code (as a " "multiple of TCC_Basic)"))
idf_iterator< T > idf_end(const T &G)
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Represent the analysis usage information of a pass.
static cl::opt< bool > EnableStaticAnalysis("hot-cold-static-analysis", cl::init(true), cl::Hidden)
Analysis pass which computes BlockFrequencyInfo.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
ModulePass * createHotColdSplittingPass()
createHotColdSplittingPass - This pass outlines cold blocks into a separate function(s).
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Analysis providing profile information.
block_range blocks()
Returns a range view of the basic blocks in the region.
bool hasSection() const
Check if this global has a custom object file section.
static InstructionCost getOutliningBenefit(ArrayRef< BasicBlock * > Region, TargetTransformInfo &TTI)
Get the benefit score of outlining Region.
void initializeHotColdSplittingLegacyPassPass(PassRegistry &)
A function analysis which provides an AssumptionCache.
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
static cl::opt< bool > EnableColdSection("enable-cold-section", cl::init(false), cl::Hidden, cl::desc("Enable placement of extracted cold functions" " into a separate section after hot-cold splitting."))
compiles ldr LCPI1_0 ldr ldr mov lsr tst moveq r1 ldr LCPI1_1 and r0 bx lr It would be better to do something like to fold the shift into the conditional move
initializer< Ty > init(const Ty &Val)
INITIALIZE_PASS_BEGIN(HotColdSplittingLegacyPass, "hotcoldsplit", "Hot Cold Splitting", false, false) INITIALIZE_PASS_END(HotColdSplittingLegacyPass
An analysis pass based on legacy pass manager to deliver ProfileSummaryInfo.
bool is_contained(R &&Range, const E &Element)
Wrapper function around std::find to detect if an element exists in a container.
df_iterator< T > df_begin(const T &G)
static int getOutliningPenalty(ArrayRef< BasicBlock * > Region, unsigned NumInputs, unsigned NumOutputs)
Get the penalty score for outlining Region.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
print Print MemDeps of function
@ CE
Windows NT (Windows on ARM)
A Module instance is used to store all the information related to an LLVM module.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
An immutable pass that tracks lazily created AssumptionCache objects.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
A cache of @llvm.assume calls within a function.
PostDominatorTree Class - Concrete subclass of DominatorTree that is used to compute the post-dominat...
bool pred_empty(const BasicBlock *BB)
STATISTIC(NumColdRegionsFound, "Number of cold regions found.")
constexpr bool empty(const T &RangeOrContainer)
Test whether RangeOrContainer is empty. Similar to C++17 std::empty.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
bool isColdBlock(const BasicBlock *BB, BlockFrequencyInfo *BFI) const
Returns true if BasicBlock BB is considered cold.
static cl::opt< int > MaxParametersForSplit("hotcoldsplit-max-params", cl::init(4), cl::Hidden, cl::desc("Maximum number of parameters for a split function"))
std::string to_string(const T &Value)
Align max(MaybeAlign Lhs, Align Rhs)
static bool blockEndsInUnreachable(const MachineBasicBlock *MBB)
A no successor, non-return block probably ends in unreachable and is cold.
A container for analyses that lazily runs them and caches their results.
AnalysisUsage & addUsedIfAvailable()
Add the specified Pass class to the set of analyses used by this pass.
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
This class represents a function call, abstracting a target machine's calling convention.
Common register allocation spilling lr str ldr sxth r3 ldr mla r4 can lr mov lr str ldr sxth r3 mla r4 and then merge mul and lr str ldr sxth r3 mla r4 It also increase the likelihood the store may become dead bb27 Successors according to LLVM BB
AnalysisUsage & addRequired()
bool dominates(const Instruction *I1, const Instruction *I2) const
Return true if I1 dominates I2.
void setCallingConv(CallingConv::ID CC)
reference emplace_back(ArgTypes &&... Args)
RegionT * getParent() const
Get the parent of the Region.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.