71#define DEBUG_TYPE "amdgpu-split-module"
77 "amdgpu-module-splitting-max-depth",
79 "maximum search depth. 0 forces a greedy approach. "
80 "warning: the algorithm is up to O(2^N), where N is the max depth."),
83static cl::opt<float> LargeFnFactor(
86 "when max depth is reached and we can no longer branch out, this "
87 "value determines if a function is worth merging into an already "
88 "existing partition to reduce code duplication. This is a factor "
89 "of the ideal partition size, e.g. 2.0 means we consider the "
90 "function for merging if its cost (including its callees) is 2x the "
91 "size of an ideal partition."));
93static cl::opt<float> LargeFnOverlapForMerge(
95 cl::desc(
"when a function is considered for merging into a partition that "
96 "already contains some of its callees, do the merge if at least "
97 "n% of the code it can reach is already present inside the "
98 "partition; e.g. 0.7 means only merge >70%"));
101 "amdgpu-module-splitting-no-externalize-globals",
cl::Hidden,
102 cl::desc(
"disables externalization of global variable with local linkage; "
103 "may cause globals to be duplicated which increases binary size"));
106 "amdgpu-module-splitting-no-externalize-address-taken",
cl::Hidden,
108 "disables externalization of functions whose addresses are taken"));
111 ModuleDotCfgOutput(
"amdgpu-module-splitting-print-module-dotcfg",
113 cl::desc(
"output file to write out the dotgraph "
114 "representation of the input module"));
117 "amdgpu-module-splitting-print-partition-summaries",
cl::Hidden,
118 cl::desc(
"output file to write out a summary of "
119 "the partitions created for each module"));
123 UseLockFile(
"amdgpu-module-splitting-serial-execution",
cl::Hidden,
124 cl::desc(
"use a lock file so only one process in the system "
125 "can run this pass at once. useful to avoid mangled "
126 "debug output in multithreaded environments."));
129 DebugProposalSearch(
"amdgpu-module-splitting-debug-proposal-search",
131 cl::desc(
"print all proposals received and whether "
132 "they were rejected or accepted"));
135struct SplitModuleTimer : NamedRegionTimer {
136 SplitModuleTimer(StringRef Name, StringRef
Desc)
137 : NamedRegionTimer(Name,
Desc,
DEBUG_TYPE,
"AMDGPU Module Splitting",
146using FunctionsCostMap = DenseMap<const Function *, CostType>;
147using GetTTIFn = function_ref<
const TargetTransformInfo &(Function &)>;
148static constexpr unsigned InvalidPID = -1;
153static auto formatRatioOf(CostType Num, CostType Dem) {
154 CostType DemOr1 = Dem ? Dem : 1;
155 return format(
"%0.2f", (
static_cast<double>(Num) / DemOr1) * 100);
166static bool isNonCopyable(
const Function &
F) {
167 return F.hasExternalLinkage() || !
F.isDefinitionExact() ||
173 if (GV.hasLocalLinkage()) {
181 GV.setName(
"__llvmsplit_unnamed");
190static CostType calculateFunctionCosts(GetTTIFn GetTTI,
Module &M,
191 FunctionsCostMap &CostMap) {
192 SplitModuleTimer SMT(
"calculateFunctionCosts",
"cost analysis");
194 LLVM_DEBUG(
dbgs() <<
"[cost analysis] calculating function costs\n");
195 CostType ModuleCost = 0;
196 [[maybe_unused]] CostType KernelCost = 0;
199 if (Fn.isDeclaration())
203 const auto &
TTI = GetTTI(Fn);
204 for (
const auto &BB : Fn) {
205 for (
const auto &
I : BB) {
210 CostType CostVal = Cost.isValid()
213 assert((FnCost + CostVal) >= FnCost &&
"Overflow!");
220 CostMap[&Fn] = FnCost;
221 assert((ModuleCost + FnCost) >= ModuleCost &&
"Overflow!");
222 ModuleCost += FnCost;
225 KernelCost += FnCost;
233 const CostType FnCost = ModuleCost - KernelCost;
234 dbgs() <<
" - total module cost is " << ModuleCost <<
". kernels cost "
235 <<
"" << KernelCost <<
" ("
236 <<
format(
"%0.2f", (
float(KernelCost) / ModuleCost) * 100)
237 <<
"% of the module), functions cost " << FnCost <<
" ("
238 <<
format(
"%0.2f", (
float(FnCost) / ModuleCost) * 100)
239 <<
"% of the module)\n";
246static bool canBeIndirectlyCalled(
const Function &
F) {
249 return !
F.hasLocalLinkage() ||
250 F.hasAddressTaken(
nullptr,
278 enum class EdgeKind :
uint8_t {
297 : Src(Src), Dst(Dst), Kind(Kind) {}
304 using EdgesVec = SmallVector<const Edge *, 0>;
306 using nodes_iterator =
const Node *
const *;
308 SplitGraph(
const Module &M,
const FunctionsCostMap &CostMap,
310 : M(M), CostMap(CostMap), ModuleCost(ModuleCost) {}
312 void buildGraph(CallGraph &CG);
315 bool verifyGraph()
const;
318 bool empty()
const {
return Nodes.empty(); }
322 unsigned getNumNodes()
const {
return Nodes.size(); }
323 BitVector createNodesBitVector()
const {
return BitVector(Nodes.size()); }
325 const Module &getModule()
const {
return M; }
327 CostType getModuleCost()
const {
return ModuleCost; }
328 CostType
getCost(
const Function &
F)
const {
return CostMap.at(&
F); }
332 CostType calculateCost(
const BitVector &BV)
const;
337 Node &
getNode(DenseMap<const GlobalValue *, Node *> &Cache,
338 const GlobalValue &GV);
341 const Edge &createEdge(
Node &Src,
Node &Dst, EdgeKind EK);
344 const FunctionsCostMap &CostMap;
350 SpecificBumpPtrAllocator<Node> NodesPool;
356 std::is_trivially_destructible_v<Edge>,
357 "Edge must be trivially destructible to use the BumpPtrAllocator");
373class SplitGraph::Node {
374 friend class SplitGraph;
377 Node(
unsigned ID,
const GlobalValue &GV, CostType IndividualCost,
379 :
ID(
ID), GV(GV), IndividualCost(IndividualCost),
380 IsNonCopyable(IsNonCopyable), IsEntryFnCC(
false), IsGraphEntry(
false) {
387 unsigned getID()
const {
return ID; }
393 CostType getIndividualCost()
const {
return IndividualCost; }
395 bool isNonCopyable()
const {
return IsNonCopyable; }
396 bool isEntryFunctionCC()
const {
return IsEntryFnCC; }
402 bool isGraphEntryPoint()
const {
return IsGraphEntry; }
404 StringRef
getName()
const {
return GV.getName(); }
406 bool hasAnyIncomingEdges()
const {
return IncomingEdges.size(); }
407 bool hasAnyIncomingEdgesOfKind(EdgeKind EK)
const {
408 return any_of(IncomingEdges, [&](
const auto *
E) {
return E->Kind == EK; });
411 bool hasAnyOutgoingEdges()
const {
return OutgoingEdges.size(); }
412 bool hasAnyOutgoingEdgesOfKind(EdgeKind EK)
const {
413 return any_of(OutgoingEdges, [&](
const auto *
E) {
return E->Kind == EK; });
417 return IncomingEdges;
421 return OutgoingEdges;
424 bool shouldFollowIndirectCalls()
const {
return isEntryFunctionCC(); }
431 void visitAllDependencies(std::function<
void(
const Node &)> Visitor)
const;
440 void getDependencies(BitVector &BV)
const {
441 visitAllDependencies([&](
const Node &
N) { BV.set(
N.getID()); });
445 void markAsGraphEntry() { IsGraphEntry =
true; }
448 const GlobalValue &GV;
449 CostType IndividualCost;
450 bool IsNonCopyable : 1;
451 bool IsEntryFnCC : 1;
452 bool IsGraphEntry : 1;
456 EdgesVec IncomingEdges;
457 EdgesVec OutgoingEdges;
460void SplitGraph::Node::visitAllDependencies(
461 std::function<
void(
const Node &)> Visitor)
const {
462 const bool FollowIndirect = shouldFollowIndirectCalls();
465 DenseSet<const Node *> Seen;
466 SmallVector<const Node *, 8> WorkList({
this});
467 while (!WorkList.empty()) {
468 const Node *CurN = WorkList.pop_back_val();
469 if (
auto [It, Inserted] = Seen.insert(CurN); !Inserted)
474 for (
const Edge *
E : CurN->outgoing_edges()) {
475 if (!FollowIndirect &&
E->Kind == EdgeKind::IndirectCall)
477 WorkList.push_back(
E->Dst);
489static bool handleCalleesMD(
const Instruction &
I,
490 SetVector<Function *> &Callees) {
491 auto *MD =
I.getMetadata(LLVMContext::MD_callees);
495 for (
const auto &Op : MD->operands()) {
496 Function *Callee = mdconst::extract_or_null<Function>(Op);
499 Callees.insert(Callee);
505void SplitGraph::buildGraph(CallGraph &CG) {
506 SplitModuleTimer SMT(
"buildGraph",
"graph construction");
509 <<
"[build graph] constructing graph representation of the input\n");
517 DenseMap<const GlobalValue *, Node *> Cache;
518 SmallVector<const Function *> FnsWithIndirectCalls, IndirectlyCallableFns;
519 for (
const Function &Fn : M) {
520 if (Fn.isDeclaration())
524 SetVector<const Function *> DirectCallees;
525 bool CallsExternal =
false;
526 for (
auto &CGEntry : *CG[&Fn]) {
527 auto *CGNode = CGEntry.second;
528 if (
auto *Callee = CGNode->getFunction()) {
529 if (!Callee->isDeclaration())
530 DirectCallees.insert(Callee);
531 }
else if (CGNode == CG.getCallsExternalNode())
532 CallsExternal =
true;
538 LLVM_DEBUG(dbgs() <<
" [!] callgraph is incomplete for ";
539 Fn.printAsOperand(dbgs());
540 dbgs() <<
" - analyzing function\n");
542 SetVector<Function *> KnownCallees;
543 bool HasUnknownIndirectCall =
false;
546 const auto *CB = dyn_cast<CallBase>(&Inst);
547 if (!CB || CB->getCalledFunction())
552 if (CB->isInlineAsm()) {
553 LLVM_DEBUG(dbgs() <<
" found inline assembly\n");
557 if (handleCalleesMD(Inst, KnownCallees))
561 KnownCallees.clear();
565 HasUnknownIndirectCall =
true;
569 if (HasUnknownIndirectCall) {
570 LLVM_DEBUG(dbgs() <<
" indirect call found\n");
571 FnsWithIndirectCalls.push_back(&Fn);
572 }
else if (!KnownCallees.empty())
573 DirectCallees.insert_range(KnownCallees);
577 for (
const auto *Callee : DirectCallees)
578 createEdge(
N,
getNode(Cache, *Callee), EdgeKind::DirectCall);
580 if (canBeIndirectlyCalled(Fn))
581 IndirectlyCallableFns.push_back(&Fn);
585 for (
const Function *Fn : FnsWithIndirectCalls) {
586 for (
const Function *Candidate : IndirectlyCallableFns) {
589 createEdge(Src, Dst, EdgeKind::IndirectCall);
594 SmallVector<Node *, 16> CandidateEntryPoints;
595 BitVector NodesReachableByKernels = createNodesBitVector();
596 for (
Node *
N : Nodes) {
598 if (
N->isEntryFunctionCC()) {
599 N->markAsGraphEntry();
600 N->getDependencies(NodesReachableByKernels);
601 }
else if (!
N->hasAnyIncomingEdgesOfKind(EdgeKind::DirectCall))
602 CandidateEntryPoints.push_back(
N);
605 for (
Node *
N : CandidateEntryPoints) {
611 if (!NodesReachableByKernels.test(
N->getID()))
612 N->markAsGraphEntry();
621bool SplitGraph::verifyGraph()
const {
622 unsigned ExpectedID = 0;
624 DenseSet<const Node *> SeenNodes;
625 DenseSet<const Function *> SeenFunctionNodes;
626 for (
const Node *
N : Nodes) {
627 if (
N->getID() != (ExpectedID++)) {
628 errs() <<
"Node IDs are incorrect!\n";
632 if (!SeenNodes.insert(
N).second) {
633 errs() <<
"Node seen more than once!\n";
638 errs() <<
"getNode doesn't return the right node\n";
642 for (
const Edge *
E :
N->IncomingEdges) {
643 if (!
E->Src || !
E->Dst || (
E->Dst !=
N) ||
644 (
find(
E->Src->OutgoingEdges,
E) ==
E->Src->OutgoingEdges.end())) {
645 errs() <<
"ill-formed incoming edges\n";
650 for (
const Edge *
E :
N->OutgoingEdges) {
651 if (!
E->Src || !
E->Dst || (
E->Src !=
N) ||
652 (
find(
E->Dst->IncomingEdges,
E) ==
E->Dst->IncomingEdges.end())) {
653 errs() <<
"ill-formed outgoing edges\n";
658 const Function &Fn =
N->getFunction();
659 if (AMDGPU::isEntryFunctionCC(Fn.getCallingConv())) {
660 if (
N->hasAnyIncomingEdges()) {
661 errs() <<
"Kernels cannot have incoming edges\n";
666 if (Fn.isDeclaration()) {
667 errs() <<
"declarations shouldn't have nodes!\n";
671 auto [It, Inserted] = SeenFunctionNodes.insert(&Fn);
673 errs() <<
"one function has multiple nodes!\n";
678 if (ExpectedID != Nodes.size()) {
679 errs() <<
"Node IDs out of sync!\n";
683 if (createNodesBitVector().size() != getNumNodes()) {
684 errs() <<
"nodes bit vector doesn't have the right size!\n";
689 BitVector BV = createNodesBitVector();
691 if (
N->isGraphEntryPoint())
692 N->getDependencies(BV);
696 for (
const auto &Fn : M) {
697 if (!Fn.isDeclaration()) {
698 if (!SeenFunctionNodes.contains(&Fn)) {
699 errs() <<
"Fn has no associated node in the graph!\n";
706 errs() <<
"not all nodes are reachable through the graph's entry points!\n";
714CostType SplitGraph::calculateCost(
const BitVector &BV)
const {
716 for (
unsigned NodeID : BV.set_bits())
717 Cost +=
getNode(NodeID).getIndividualCost();
722SplitGraph::getNode(DenseMap<const GlobalValue *, Node *> &Cache,
723 const GlobalValue &GV) {
724 auto &
N = Cache[&GV];
729 bool NonCopyable =
false;
730 if (
const Function *Fn = dyn_cast<Function>(&GV)) {
731 NonCopyable = isNonCopyable(*Fn);
732 Cost = CostMap.at(Fn);
734 N =
new (NodesPool.Allocate())
Node(Nodes.size(), GV, Cost, NonCopyable);
740const SplitGraph::Edge &SplitGraph::createEdge(
Node &Src,
Node &Dst,
742 const Edge *
E =
new (EdgesPool.Allocate<Edge>(1))
Edge(&Src, &Dst, EK);
743 Src.OutgoingEdges.push_back(
E);
744 Dst.IncomingEdges.push_back(
E);
763 SplitProposal(
const SplitGraph &SG,
unsigned MaxPartitions) : SG(&SG) {
764 Partitions.resize(MaxPartitions, {0, SG.createNodesBitVector()});
767 void setName(StringRef NewName) { Name = NewName; }
768 StringRef
getName()
const {
return Name; }
770 const BitVector &operator[](
unsigned PID)
const {
771 return Partitions[PID].second;
774 void add(
unsigned PID,
const BitVector &BV) {
775 Partitions[PID].second |= BV;
779 void print(raw_ostream &OS)
const;
784 unsigned findCheapestPartition()
const;
787 void calculateScores();
790 void verifyCompleteness()
const;
802 double getCodeSizeScore()
const {
return CodeSizeScore; }
816 double getBottleneckScore()
const {
return BottleneckScore; }
819 void updateScore(
unsigned PID) {
821 for (
auto &[PCost, Nodes] : Partitions) {
823 PCost = SG->calculateCost(Nodes);
829 double CodeSizeScore = 0.0;
831 double BottleneckScore = 0.0;
833 CostType TotalCost = 0;
835 const SplitGraph *SG =
nullptr;
838 std::vector<std::pair<CostType, BitVector>> Partitions;
841void SplitProposal::print(raw_ostream &OS)
const {
844 OS <<
"[proposal] " << Name <<
", total cost:" << TotalCost
845 <<
", code size score:" << format(
"%0.3f", CodeSizeScore)
846 <<
", bottleneck score:" << format(
"%0.3f", BottleneckScore) <<
'\n';
847 for (
const auto &[PID, Part] : enumerate(Partitions)) {
848 const auto &[Cost, NodeIDs] = Part;
849 OS <<
" - P" << PID <<
" nodes:" << NodeIDs.count() <<
" cost: " << Cost
850 <<
'|' << formatRatioOf(Cost, SG->getModuleCost()) <<
"%\n";
854unsigned SplitProposal::findCheapestPartition()
const {
855 assert(!Partitions.empty());
856 CostType CurCost = std::numeric_limits<CostType>::max();
857 unsigned CurPID = InvalidPID;
858 for (
const auto &[Idx, Part] : enumerate(Partitions)) {
859 if (Part.first <= CurCost) {
861 CurCost = Part.first;
864 assert(CurPID != InvalidPID);
868void SplitProposal::calculateScores() {
869 if (Partitions.empty())
873 CostType LargestPCost = 0;
874 for (
auto &[PCost, Nodes] : Partitions) {
875 if (PCost > LargestPCost)
876 LargestPCost = PCost;
879 CostType ModuleCost = SG->getModuleCost();
880 CodeSizeScore = double(TotalCost) / ModuleCost;
881 assert(CodeSizeScore >= 0.0);
883 BottleneckScore = double(LargestPCost) / ModuleCost;
885 CodeSizeScore = std::ceil(CodeSizeScore * 100.0) / 100.0;
886 BottleneckScore = std::ceil(BottleneckScore * 100.0) / 100.0;
890void SplitProposal::verifyCompleteness()
const {
891 if (Partitions.empty())
894 BitVector Result = Partitions[0].second;
895 for (
const auto &
P : drop_begin(Partitions))
897 assert(Result.all() &&
"some nodes are missing from this proposal!");
916class RecursiveSearchSplitting {
918 using SubmitProposalFn = function_ref<void(SplitProposal)>;
920 RecursiveSearchSplitting(
const SplitGraph &SG,
unsigned NumParts,
921 SubmitProposalFn SubmitProposal);
926 struct WorkListEntry {
927 WorkListEntry(
const BitVector &BV) : Cluster(BV) {}
929 unsigned NumNonEntryNodes = 0;
930 CostType TotalCost = 0;
931 CostType CostExcludingGraphEntryPoints = 0;
938 void setupWorkList();
949 void pickPartition(
unsigned Depth,
unsigned Idx, SplitProposal SP);
956 std::pair<unsigned, CostType>
957 findMostSimilarPartition(
const WorkListEntry &Entry,
const SplitProposal &SP);
959 const SplitGraph &SG;
961 SubmitProposalFn SubmitProposal;
965 CostType LargeClusterThreshold = 0;
966 unsigned NumProposalsSubmitted = 0;
967 SmallVector<WorkListEntry> WorkList;
970RecursiveSearchSplitting::RecursiveSearchSplitting(
971 const SplitGraph &SG,
unsigned NumParts, SubmitProposalFn SubmitProposal)
972 : SG(SG), NumParts(NumParts), SubmitProposal(SubmitProposal) {
977 report_fatal_error(
"[amdgpu-split-module] search depth of " +
978 Twine(MaxDepth) +
" is too high!");
979 LargeClusterThreshold =
980 (LargeFnFactor != 0.0)
981 ? CostType(((SG.getModuleCost() / NumParts) * LargeFnFactor))
982 : std::numeric_limits<CostType>::max();
983 LLVM_DEBUG(dbgs() <<
"[recursive search] large cluster threshold set at "
984 << LargeClusterThreshold <<
"\n");
987void RecursiveSearchSplitting::run() {
989 SplitModuleTimer SMT(
"recursive_search_prepare",
"preparing worklist");
994 SplitModuleTimer SMT(
"recursive_search_pick",
"partitioning");
995 SplitProposal SP(SG, NumParts);
996 pickPartition(0, 0, std::move(SP));
1000void RecursiveSearchSplitting::setupWorkList() {
1008 EquivalenceClasses<unsigned> NodeEC;
1009 for (
const SplitGraph::Node *
N : SG.nodes()) {
1010 if (!
N->isGraphEntryPoint())
1013 NodeEC.insert(
N->getID());
1014 N->visitAllDependencies([&](
const SplitGraph::Node &Dep) {
1015 if (&Dep !=
N && Dep.isNonCopyable())
1016 NodeEC.unionSets(
N->getID(), Dep.getID());
1020 for (
const auto &
Node : NodeEC) {
1021 if (!
Node->isLeader())
1024 BitVector Cluster = SG.createNodesBitVector();
1025 for (
unsigned M : NodeEC.members(*
Node)) {
1026 const SplitGraph::Node &
N = SG.getNode(M);
1027 if (
N.isGraphEntryPoint())
1028 N.getDependencies(Cluster);
1030 WorkList.emplace_back(std::move(Cluster));
1034 for (WorkListEntry &Entry : WorkList) {
1035 for (
unsigned NodeID : Entry.Cluster.set_bits()) {
1036 const SplitGraph::Node &
N = SG.getNode(NodeID);
1037 const CostType Cost =
N.getIndividualCost();
1039 Entry.TotalCost += Cost;
1040 if (!
N.isGraphEntryPoint()) {
1041 Entry.CostExcludingGraphEntryPoints += Cost;
1042 ++Entry.NumNonEntryNodes;
1047 stable_sort(WorkList, [](
const WorkListEntry &
A,
const WorkListEntry &
B) {
1048 if (
A.TotalCost !=
B.TotalCost)
1049 return A.TotalCost >
B.TotalCost;
1051 if (
A.CostExcludingGraphEntryPoints !=
B.CostExcludingGraphEntryPoints)
1052 return A.CostExcludingGraphEntryPoints >
B.CostExcludingGraphEntryPoints;
1054 if (
A.NumNonEntryNodes !=
B.NumNonEntryNodes)
1055 return A.NumNonEntryNodes >
B.NumNonEntryNodes;
1057 return A.Cluster.count() >
B.Cluster.count();
1061 dbgs() <<
"[recursive search] worklist:\n";
1062 for (
const auto &[Idx, Entry] : enumerate(WorkList)) {
1063 dbgs() <<
" - [" << Idx <<
"]: ";
1064 for (
unsigned NodeID : Entry.Cluster.set_bits())
1065 dbgs() << NodeID <<
" ";
1066 dbgs() <<
"(total_cost:" << Entry.TotalCost
1067 <<
", cost_excl_entries:" << Entry.CostExcludingGraphEntryPoints
1073void RecursiveSearchSplitting::pickPartition(
unsigned Depth,
unsigned Idx,
1075 while (Idx < WorkList.size()) {
1078 const WorkListEntry &Entry = WorkList[Idx];
1079 const BitVector &Cluster = Entry.Cluster;
1083 const unsigned CheapestPID = SP.findCheapestPartition();
1084 assert(CheapestPID != InvalidPID);
1088 const auto [MostSimilarPID, SimilarDepsCost] =
1089 findMostSimilarPartition(Entry, SP);
1093 unsigned SinglePIDToTry = InvalidPID;
1094 if (MostSimilarPID == InvalidPID)
1095 SinglePIDToTry = CheapestPID;
1096 else if (MostSimilarPID == CheapestPID)
1097 SinglePIDToTry = CheapestPID;
1098 else if (Depth >= MaxDepth) {
1101 if (Entry.CostExcludingGraphEntryPoints > LargeClusterThreshold) {
1103 assert(SimilarDepsCost && Entry.CostExcludingGraphEntryPoints);
1104 const double Ratio =
static_cast<double>(SimilarDepsCost) /
1105 Entry.CostExcludingGraphEntryPoints;
1106 assert(Ratio >= 0.0 && Ratio <= 1.0);
1107 if (Ratio > LargeFnOverlapForMerge) {
1112 SinglePIDToTry = MostSimilarPID;
1115 SinglePIDToTry = CheapestPID;
1122 if (SinglePIDToTry != InvalidPID) {
1123 LLVM_DEBUG(dbgs() << Idx <<
"=P" << SinglePIDToTry <<
' ');
1125 SP.add(SinglePIDToTry, Cluster);
1130 assert(MostSimilarPID != InvalidPID);
1139 SplitProposal BranchSP = SP;
1141 <<
" [lb] " << Idx <<
"=P" << CheapestPID <<
"? ");
1142 BranchSP.add(CheapestPID, Cluster);
1143 pickPartition(Depth + 1, Idx + 1, std::move(BranchSP));
1148 SplitProposal BranchSP = SP;
1150 <<
" [ms] " << Idx <<
"=P" << MostSimilarPID <<
"? ");
1151 BranchSP.add(MostSimilarPID, Cluster);
1152 pickPartition(Depth + 1, Idx + 1, std::move(BranchSP));
1160 assert(Idx == WorkList.size());
1161 assert(NumProposalsSubmitted <= (2u << MaxDepth) &&
1162 "Search got out of bounds?");
1163 SP.setName(
"recursive_search (depth=" + std::to_string(Depth) +
") #" +
1164 std::to_string(NumProposalsSubmitted++));
1166 SubmitProposal(std::move(SP));
1169std::pair<unsigned, CostType>
1170RecursiveSearchSplitting::findMostSimilarPartition(
const WorkListEntry &Entry,
1171 const SplitProposal &SP) {
1172 if (!Entry.NumNonEntryNodes)
1173 return {InvalidPID, 0};
1178 unsigned ChosenPID = InvalidPID;
1179 CostType ChosenCost = 0;
1180 for (
unsigned PID = 0; PID < NumParts; ++PID) {
1181 BitVector BV = SP[PID];
1182 BV &= Entry.Cluster;
1187 const CostType Cost = SG.calculateCost(BV);
1189 if (ChosenPID == InvalidPID || ChosenCost < Cost ||
1190 (ChosenCost == Cost && PID > ChosenPID)) {
1196 return {ChosenPID, ChosenCost};
1203const SplitGraph::Node *mapEdgeToDst(
const SplitGraph::Edge *
E) {
1207using SplitGraphEdgeDstIterator =
1208 mapped_iterator<SplitGraph::edges_iterator,
decltype(&mapEdgeToDst)>;
1223 return {
Ref->outgoing_edges().begin(), mapEdgeToDst};
1226 return {
Ref->outgoing_edges().end(), mapEdgeToDst};
1230 return G.nodes().begin();
1233 return G.nodes().end();
1241 return SG.getModule().getName().str();
1245 return N->getName().str();
1249 const SplitGraph &SG) {
1251 if (
N->isEntryFunctionCC())
1252 Result +=
"entry-fn-cc ";
1253 if (
N->isNonCopyable())
1254 Result +=
"non-copyable ";
1255 Result +=
"cost:" + std::to_string(
N->getIndividualCost());
1260 const SplitGraph &SG) {
1261 return N->hasAnyIncomingEdges() ?
"" :
"color=\"red\"";
1265 SplitGraphEdgeDstIterator EI,
1266 const SplitGraph &SG) {
1268 switch ((*EI.getCurrent())->Kind) {
1269 case SplitGraph::EdgeKind::DirectCall:
1271 case SplitGraph::EdgeKind::IndirectCall:
1272 return "style=\"dashed\"";
1287static bool needsConservativeImport(
const GlobalValue *GV) {
1288 if (
const auto *Var = dyn_cast<GlobalVariable>(GV))
1289 return Var->hasLocalLinkage();
1290 if (
const auto *GA = dyn_cast<GlobalAlias>(GV))
1291 return GA->hasLocalLinkage();
1297static void printPartitionSummary(raw_ostream &OS,
unsigned N,
const Module &M,
1298 unsigned PartCost,
unsigned ModuleCost) {
1299 OS <<
"*** Partition P" <<
N <<
" ***\n";
1301 for (
const auto &Fn : M) {
1302 if (!Fn.isDeclaration())
1303 OS <<
" - [function] " << Fn.getName() <<
"\n";
1306 for (
const auto &GV :
M.globals()) {
1307 if (GV.hasInitializer())
1308 OS <<
" - [global] " << GV.getName() <<
"\n";
1311 OS <<
"Partition contains " << formatRatioOf(PartCost, ModuleCost)
1312 <<
"% of the source\n";
1315static void evaluateProposal(SplitProposal &Best, SplitProposal New) {
1316 SplitModuleTimer SMT(
"proposal_evaluation",
"proposal ranking algorithm");
1319 New.verifyCompleteness();
1320 if (DebugProposalSearch)
1324 const double CurBScore = Best.getBottleneckScore();
1325 const double CurCSScore = Best.getCodeSizeScore();
1326 const double NewBScore =
New.getBottleneckScore();
1327 const double NewCSScore =
New.getCodeSizeScore();
1339 bool IsBest =
false;
1340 if (NewBScore < CurBScore)
1342 else if (NewBScore == CurBScore)
1343 IsBest = (NewCSScore < CurCSScore);
1346 Best = std::move(New);
1350 dbgs() <<
"[search] new best proposal!\n";
1352 dbgs() <<
"[search] discarding - not profitable\n";
1357static std::unique_ptr<Module> cloneAll(
const Module &M) {
1359 return CloneModule(M, VMap, [&](
const GlobalValue *GV) {
return true; });
1363static void writeDOTGraph(
const SplitGraph &SG) {
1364 if (ModuleDotCfgOutput.empty())
1368 raw_fd_ostream OS(ModuleDotCfgOutput, EC);
1370 errs() <<
"[" DEBUG_TYPE "]: cannot open '" << ModuleDotCfgOutput
1371 <<
"' - DOTGraph will not be printed\n";
1374 SG.getModule().getName());
1377static void splitAMDGPUModule(
1378 GetTTIFn GetTTI,
Module &M,
unsigned NumParts,
1379 function_ref<
void(std::unique_ptr<Module> MPart)> ModuleCallback) {
1399 if (!NoExternalizeOnAddrTaken) {
1400 for (
auto &Fn : M) {
1401 if (Fn.hasLocalLinkage() && Fn.hasAddressTaken()) {
1403 dbgs() <<
" because its address is taken\n");
1411 if (!NoExternalizeGlobals) {
1412 for (
auto &GV :
M.globals()) {
1413 if (GV.hasLocalLinkage())
1414 LLVM_DEBUG(
dbgs() <<
"[externalize] GV " << GV.getName() <<
'\n');
1419 for (
auto &GA :
M.aliases()) {
1420 if (GA.hasLocalLinkage()) {
1421 LLVM_DEBUG(
dbgs() <<
"[externalize] alias " << GA.getName() <<
'\n');
1428 FunctionsCostMap FnCosts;
1429 const CostType ModuleCost = calculateFunctionCosts(GetTTI, M, FnCosts);
1433 SplitGraph SG(M, FnCosts, ModuleCost);
1439 <<
"[!] no nodes in graph, input is empty - no splitting possible\n");
1440 ModuleCallback(cloneAll(M));
1445 dbgs() <<
"[graph] nodes:\n";
1446 for (
const SplitGraph::Node *
N : SG.nodes()) {
1447 dbgs() <<
" - [" <<
N->getID() <<
"]: " <<
N->getName() <<
" "
1448 << (
N->isGraphEntryPoint() ?
"(entry)" :
"") <<
" "
1449 << (
N->isNonCopyable() ?
"(noncopyable)" :
"") <<
"\n";
1457 std::optional<SplitProposal> Proposal;
1458 const auto EvaluateProposal = [&](SplitProposal
SP) {
1459 SP.calculateScores();
1461 Proposal = std::move(SP);
1463 evaluateProposal(*Proposal, std::move(SP));
1468 RecursiveSearchSplitting(SG, NumParts, EvaluateProposal).run();
1469 LLVM_DEBUG(
if (Proposal)
dbgs() <<
"[search done] selected proposal: "
1470 << Proposal->getName() <<
"\n";);
1473 LLVM_DEBUG(
dbgs() <<
"[!] no proposal made, no splitting possible!\n");
1474 ModuleCallback(cloneAll(M));
1480 std::optional<raw_fd_ostream> SummariesOS;
1481 if (!PartitionSummariesOutput.empty()) {
1483 SummariesOS.emplace(PartitionSummariesOutput, EC);
1485 errs() <<
"[" DEBUG_TYPE "]: cannot open '" << PartitionSummariesOutput
1486 <<
"' - Partition summaries will not be printed\n";
1491 bool ImportAllGVs =
true;
1493 for (
unsigned PID = 0; PID < NumParts; ++PID) {
1494 SplitModuleTimer SMT2(
"modules_creation",
1495 "creating modules for each partition");
1498 DenseSet<const Function *> FnsInPart;
1499 for (
unsigned NodeID : (*Proposal)[PID].set_bits())
1500 FnsInPart.insert(&SG.getNode(NodeID).getFunction());
1503 if (FnsInPart.empty()) {
1505 <<
" is empty, not creating module\n");
1510 CostType PartCost = 0;
1511 std::unique_ptr<Module> MPart(
1514 if (
const auto *Fn = dyn_cast<Function>(GV)) {
1515 if (FnsInPart.contains(Fn)) {
1516 PartCost += SG.getCost(*Fn);
1523 if (
const auto *GA = dyn_cast<GlobalAlias>(GV)) {
1524 if (
const auto *Fn = dyn_cast<Function>(GA->getAliaseeObject()))
1525 return FnsInPart.contains(Fn);
1529 return ImportAllGVs || needsConservativeImport(GV);
1532 ImportAllGVs =
false;
1536 if (needsConservativeImport(&GV) && GV.use_empty())
1537 GV.eraseFromParent();
1541 printPartitionSummary(*SummariesOS, PID, *MPart, PartCost, ModuleCost);
1544 printPartitionSummary(
dbgs(), PID, *MPart, PartCost, ModuleCost));
1546 ModuleCallback(std::move(MPart));
1553 SplitModuleTimer SMT(
1554 "total",
"total pass runtime (incl. potentially waiting for lockfile)");
1577 dbgs() <<
"[amdgpu-split-module] unable to acquire lockfile, debug "
1578 "output may be mangled by other processes\n");
1579 }
else if (!Owned) {
1588 <<
"[amdgpu-split-module] unable to acquire lockfile, debug "
1589 "output may be mangled by other processes\n");
1595 splitAMDGPUModule(TTIGetter, M, N, ModuleCallback);
1603 splitAMDGPUModule(TTIGetter, M, N, ModuleCallback);
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static msgpack::DocNode getNode(msgpack::DocNode DN, msgpack::Type Type, MCValue Val)
The AMDGPU TargetMachine interface definition for hw codegen targets.
Unify divergent function exit nodes
This file defines the BumpPtrAllocator interface.
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
Expand Atomic instructions
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
This file provides interfaces used to build and manipulate a call graph, which is a very useful tool ...
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
static InstructionCost getCost(Instruction &Inst, TTI::TargetCostKind CostKind, TargetTransformInfo &TTI)
This file defines the DenseMap class.
Generic implementation of equivalence classes through the use Tarjan's efficient union-find algorithm...
This file defines the little GraphTraits<X> template class that should be specialized by classes that...
Module.h This file contains the declarations for the Module class.
Machine Check Debug Module
FunctionAnalysisManager FAM
ModuleAnalysisManager MAM
This header defines classes/functions to handle pass execution timing information with interfaces for...
static StringRef getName(Value *V)
std::pair< BasicBlock *, BasicBlock * > Edge
This file defines the SmallVector class.
static void externalize(GlobalValue *GV)
static const BasicSubtargetSubTypeKV * find(StringRef S, ArrayRef< BasicSubtargetSubTypeKV > A)
Find KV in array using binary search.
static Function * getFunction(FunctionType *Ty, const Twine &Name, Module *M)
PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM)
Lightweight error class with error context and mandatory checking.
@ HiddenVisibility
The GV is hidden.
@ ExternalLinkage
Externally visible function.
static InstructionCost getMax()
Class that manages the creation of a lock file to aid implicit coordination between different process...
std::error_code unsafeUnlock() override
Remove the lock file.
WaitForUnlockResult waitForUnlockFor(std::chrono::seconds MaxSeconds) override
For a shared lock, wait until the owner releases the lock.
Expected< bool > tryLock() override
Tries to acquire the lock without blocking.
A Module instance is used to store all the information related to an LLVM module.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
StringRef str() const
Explicit conversion to StringRef.
Analysis pass providing the TargetTransformInfo.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
template class LLVM_TEMPLATE_ABI opt< bool >
template class LLVM_TEMPLATE_ABI opt< unsigned >
initializer< Ty > init(const Ty &Val)
template class LLVM_TEMPLATE_ABI opt< std::string >
LLVM_ABI void system_temp_directory(bool erasedOnReboot, SmallVectorImpl< char > &result)
Get the typical temporary directory for the system, e.g., "/var/tmp" or "C:/TEMP".
LLVM_ABI void append(SmallVectorImpl< char > &path, const Twine &a, const Twine &b="", const Twine &c="", const Twine &d="")
Append to path.
This is an optimization pass for GlobalISel generic memory operations.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
InnerAnalysisManagerProxy< FunctionAnalysisManager, Module > FunctionAnalysisManagerModuleProxy
Provide the FunctionAnalysisManager to Module proxy.
LLVM_ABI bool TimePassesIsEnabled
If the user specifies the -time-passes argument on an LLVM tool command line then the value of this b...
raw_ostream & WriteGraph(raw_ostream &O, const GraphType &G, bool ShortNames=false, const Twine &Title="")
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
@ Success
The lock was released successfully.
@ OwnerDied
Owner died while holding the lock.
@ Timeout
Reached timeout while waiting for the owner to release the lock.
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Ref
The access may reference the value stored in memory.
ValueMap< const Value *, WeakTrackingVH > ValueToValueMapTy
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
BumpPtrAllocatorImpl<> BumpPtrAllocator
The standard BumpPtrAllocator which just uses the default template parameters.
LLVM_ABI std::unique_ptr< Module > CloneModule(const Module &M)
Return an exact copy of the specified module.
void consumeError(Error Err)
Consume a Error without doing anything.
AnalysisManager< Module > ModuleAnalysisManager
Convenience typedef for the Module analysis manager.
static std::string getEdgeAttributes(const SplitGraph::Node *N, SplitGraphEdgeDstIterator EI, const SplitGraph &SG)
static std::string getGraphName(const SplitGraph &SG)
DOTGraphTraits(bool IsSimple=false)
static std::string getNodeAttributes(const SplitGraph::Node *N, const SplitGraph &SG)
static std::string getNodeDescription(const SplitGraph::Node *N, const SplitGraph &SG)
std::string getNodeLabel(const SplitGraph::Node *N, const SplitGraph &SG)
DefaultDOTGraphTraits(bool simple=false)
const SplitGraph::Edge * EdgeRef
static NodeRef getEntryNode(NodeRef N)
SplitGraph::nodes_iterator nodes_iterator
SplitGraph::edges_iterator ChildEdgeIteratorType
SplitGraphEdgeDstIterator ChildIteratorType
static nodes_iterator nodes_end(const SplitGraph &G)
static ChildIteratorType child_begin(NodeRef Ref)
static nodes_iterator nodes_begin(const SplitGraph &G)
const SplitGraph::Node * NodeRef
static ChildIteratorType child_end(NodeRef Ref)