//===- AMDGPUSplitModule.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file Implements a module splitting algorithm designed to support the
/// FullLTO --lto-partitions option for parallel codegen.
///
/// The role of this module splitting pass is the same as
/// lib/Transforms/Utils/SplitModule.cpp: load-balance the module's functions
/// across a set of N partitions to allow for parallel codegen.
///
/// The similarities mostly end here, as this pass achieves load-balancing in a
/// more elaborate fashion which is targeted towards AMDGPU modules. It can take
/// advantage of the structure of AMDGPU modules (which are mostly
/// self-contained) to allow for more efficient splitting without affecting
/// codegen negatively, or causing inaccurate resource usage analysis.
///
/// High-level pass overview:
///   - SplitGraph & associated classes
///     - Graph representation of the module and of the dependencies that
///       matter for splitting.
///   - RecursiveSearchSplitting
///     - Core splitting algorithm.
///   - SplitProposal
///     - Represents a suggested solution for splitting the input module. These
///       solutions can be scored to determine the best one when multiple
///       solutions are available.
///   - Driver/pass "run" function glues everything together.

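// A minimal usage sketch. This is illustrative only: the pass interface is
// declared in AMDGPUSplitModule.h, and the exact constructor signature below
// is an assumption for the example.
//
//   SmallVector<std::unique_ptr<Module>> Parts;
//   auto Callback = [&](std::unique_ptr<Module> MPart) {
//     Parts.push_back(std::move(MPart)); // One call per non-empty partition.
//   };
//   ModulePassManager MPM;
//   MPM.addPass(AMDGPUSplitModulePass(/*N=*/4, Callback));
//   MPM.run(M, MAM);
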
#include "AMDGPUSplitModule.h"
#include "AMDGPUTargetMachine.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/GraphTraits.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/DOTGraphTraits.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include <cassert>
#include <cmath>
#include <memory>
#include <utility>
#include <vector>

#ifndef NDEBUG
#include "llvm/Support/LockFileManager.h"
#endif

#define DEBUG_TYPE "amdgpu-split-module"

namespace llvm {
namespace {

static cl::opt<unsigned> MaxDepth(
    "amdgpu-module-splitting-max-depth",
    cl::desc(
        "maximum search depth. 0 forces a greedy approach. "
        "warning: the algorithm is up to O(2^N), where N is the max depth."),
    cl::init(8));

static cl::opt<float> LargeFnFactor(
    "amdgpu-module-splitting-large-threshold", cl::init(2.0f), cl::Hidden,
    cl::desc(
        "when max depth is reached and we can no longer branch out, this "
        "value determines if a function is worth merging into an already "
        "existing partition to reduce code duplication. This is a factor "
        "of the ideal partition size, e.g. 2.0 means we consider the "
        "function for merging if its cost (including its callees) is 2x the "
        "size of an ideal partition."));

static cl::opt<float> LargeFnOverlapForMerge(
    "amdgpu-module-splitting-merge-threshold", cl::init(0.7f), cl::Hidden,
    cl::desc("when a function is considered for merging into a partition that "
             "already contains some of its callees, do the merge if at least "
             "n% of the code it can reach is already present inside the "
             "partition; e.g. 0.7 means only merge >70%"));

static cl::opt<bool> NoExternalizeGlobals(
    "amdgpu-module-splitting-no-externalize-globals", cl::Hidden,
    cl::desc("disables externalization of global variables with local "
             "linkage; may cause globals to be duplicated which increases "
             "binary size"));

static cl::opt<bool> NoExternalizeOnAddrTaken(
    "amdgpu-module-splitting-no-externalize-address-taken", cl::Hidden,
    cl::desc(
        "disables externalization of functions whose addresses are taken"));

static cl::opt<std::string>
    ModuleDotCfgOutput("amdgpu-module-splitting-print-module-dotcfg",
                       cl::Hidden,
                       cl::desc("output file to write out the dotgraph "
                                "representation of the input module"));

static cl::opt<std::string> PartitionSummariesOutput(
    "amdgpu-module-splitting-print-partition-summaries", cl::Hidden,
    cl::desc("output file to write out a summary of "
             "the partitions created for each module"));

#ifndef NDEBUG
static cl::opt<bool>
    UseLockFile("amdgpu-module-splitting-serial-execution", cl::Hidden,
                cl::desc("use a lock file so only one process in the system "
                         "can run this pass at once. useful to avoid mangled "
                         "debug output in multithreaded environments."));

static cl::opt<bool>
    DebugProposalSearch("amdgpu-module-splitting-debug-proposal-search",
                        cl::Hidden,
                        cl::desc("print all proposals received and whether "
                                 "they were rejected or accepted"));
#endif

struct SplitModuleTimer : NamedRegionTimer {
  SplitModuleTimer(StringRef Name, StringRef Desc)
      : NamedRegionTimer(Name, Desc, DEBUG_TYPE, "AMDGPU Module Splitting",
                         TimePassesIsEnabled) {}
};

//===----------------------------------------------------------------------===//
// Utils
//===----------------------------------------------------------------------===//

using CostType = InstructionCost::CostType;
using FunctionsCostMap = DenseMap<const Function *, CostType>;
using GetTTIFn = function_ref<const TargetTransformInfo &(Function &)>;
static constexpr unsigned InvalidPID = -1;

/// \param Num numerator
/// \param Dem denominator
/// \returns a printable object that prints (Num/Dem) as a percentage, using
/// "%0.2f".
static auto formatRatioOf(CostType Num, CostType Dem) {
  CostType DemOr1 = Dem ? Dem : 1;
  return format("%0.2f", (static_cast<double>(Num) / DemOr1) * 100);
}
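
// For instance (illustrative), "dbgs() << formatRatioOf(50, 200)" prints
// "25.00": the ratio is scaled by 100 so callers can append a '%' sign, and a
// zero denominator is clamped to 1 to avoid dividing by zero.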

/// Checks whether a given function is non-copyable.
///
/// Non-copyable functions cannot be cloned into multiple partitions, and only
/// one copy of the function can be present across all partitions.
///
/// Kernel functions and external functions fall into this category. If we were
/// to clone them, we would end up with multiple symbol definitions and a very
/// unhappy linker.
static bool isNonCopyable(const Function &F) {
  return F.hasExternalLinkage() || !F.isDefinitionExact() ||
         AMDGPU::isEntryFunctionCC(F.getCallingConv());
}

/// If \p GV has local linkage, make it external + hidden.
static void externalize(GlobalValue &GV) {
  if (GV.hasLocalLinkage()) {
    GV.setLinkage(GlobalValue::ExternalLinkage);
    GV.setVisibility(GlobalValue::HiddenVisibility);
  }

  // Unnamed entities must be named consistently between modules. setName will
  // give a distinct name to each such entity.
  if (!GV.hasName())
    GV.setName("__llvmsplit_unnamed");
}
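
// In IR terms (illustrative): "@gv = internal global i32 0" becomes
// "@gv = hidden global i32 0", so every partition can reference the single
// externalized definition instead of each partition receiving a private copy.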

/// Cost analysis function. Calculates the cost of each function in \p M.
///
/// \param GetTTI Abstract getter for TargetTransformInfo.
/// \param M Module to analyze.
/// \param [out] CostMap Resulting Function -> Cost map.
/// \return The module's total cost.
static CostType calculateFunctionCosts(GetTTIFn GetTTI, Module &M,
                                       FunctionsCostMap &CostMap) {
  SplitModuleTimer SMT("calculateFunctionCosts", "cost analysis");

  LLVM_DEBUG(dbgs() << "[cost analysis] calculating function costs\n");
  CostType ModuleCost = 0;
  [[maybe_unused]] CostType KernelCost = 0;

  for (auto &Fn : M) {
    if (Fn.isDeclaration())
      continue;

    CostType FnCost = 0;
    const auto &TTI = GetTTI(Fn);
    for (const auto &BB : Fn) {
      for (const auto &I : BB) {
        auto Cost =
            TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
        assert(Cost != InstructionCost::getMax());
        // Assume expensive if we can't tell the cost of an instruction.
        CostType CostVal = Cost.isValid()
                               ? Cost.getValue()
                               : TargetTransformInfo::TCC_Expensive;
        assert((FnCost + CostVal) >= FnCost && "Overflow!");
        FnCost += CostVal;
      }
    }

    assert(FnCost != 0);

    CostMap[&Fn] = FnCost;
    assert((ModuleCost + FnCost) >= ModuleCost && "Overflow!");
    ModuleCost += FnCost;

    if (AMDGPU::isEntryFunctionCC(Fn.getCallingConv()))
      KernelCost += FnCost;
  }

  if (CostMap.empty())
    return 0;

  assert(ModuleCost);
  LLVM_DEBUG({
    const CostType FnCost = ModuleCost - KernelCost;
    dbgs() << " - total module cost is " << ModuleCost << ". kernels cost "
           << "" << KernelCost << " ("
           << format("%0.2f", (float(KernelCost) / ModuleCost) * 100)
           << "% of the module), functions cost " << FnCost << " ("
           << format("%0.2f", (float(FnCost) / ModuleCost) * 100)
           << "% of the module)\n";
  });

  return ModuleCost;
}
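
// As a worked example (illustrative numbers): a module whose kernels total a
// cost of 300 and whose device functions total 100 has ModuleCost = 400, so
// the debug output above reports kernels at 75.00% and functions at 25.00%.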

/// \return true if \p F can be indirectly called
static bool canBeIndirectlyCalled(const Function &F) {
  if (F.isDeclaration() || AMDGPU::isEntryFunctionCC(F.getCallingConv()))
    return false;
  return !F.hasLocalLinkage() ||
         F.hasAddressTaken(/*PutOffender=*/nullptr,
                           /*IgnoreCallbackUses=*/false,
                           /*IgnoreAssumeLikeCalls=*/true,
                           /*IgnoreLLVMUsed=*/true,
                           /*IgnoreARCAttachedCall=*/false,
                           /*IgnoreCastedDirectCall=*/true);
}
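
// In IR terms (illustrative): given
//   @table = internal global [1 x ptr] [ptr @cb]
//   define internal void @cb() { ret void }
// @cb has its address taken through @table, so it is treated as a potential
// indirect call target even though it has local linkage.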

//===----------------------------------------------------------------------===//
// Graph-based Module Representation
//===----------------------------------------------------------------------===//

/// AMDGPUSplitModule's view of the source Module, as a graph of all components
/// that can be split into different modules.
///
/// The most trivial instance of this graph is just the CallGraph of the module,
/// but it is not guaranteed that the graph is strictly equal to the CG. It
/// currently always is but it's designed in a way that would eventually allow
/// us to create abstract nodes, or nodes for different entities such as global
/// variables or any other meaningful constraint we must consider.
///
/// The graph is only mutable by this class, and is generally not modified
/// after \ref SplitGraph::buildGraph runs. No consumers of the graph can
/// mutate it.
class SplitGraph {
public:
  class Node;

  enum class EdgeKind : uint8_t {
    /// The nodes are related through a direct call. This is a "strong" edge as
    /// it means the Src will directly reference the Dst.
    DirectCall,
    /// The nodes are related through an indirect call.
    /// This is a "weaker" edge and is only considered when traversing the graph
    /// starting from a kernel. We need this edge for resource usage analysis.
    ///
    /// The reason why we have this edge in the first place is due to how
    /// AMDGPUResourceUsageAnalysis works. In the presence of an indirect call,
    /// the resource usage of the kernel containing the indirect call is the
    /// max resource usage of all functions that can be indirectly called.
    IndirectCall,
  };

  /// An edge between two nodes. Edges are directional, and tagged with a
  /// "kind".
  struct Edge {
    Edge(Node *Src, Node *Dst, EdgeKind Kind)
        : Src(Src), Dst(Dst), Kind(Kind) {}

    Node *Src; ///< Source
    Node *Dst; ///< Destination
    EdgeKind Kind;
  };

  using EdgesVec = SmallVector<const Edge *, 0>;
  using edges_iterator = EdgesVec::const_iterator;
  using nodes_iterator = const Node *const *;

  SplitGraph(const Module &M, const FunctionsCostMap &CostMap,
             CostType ModuleCost)
      : M(M), CostMap(CostMap), ModuleCost(ModuleCost) {}

  void buildGraph(CallGraph &CG);

#ifndef NDEBUG
  bool verifyGraph() const;
#endif

  bool empty() const { return Nodes.empty(); }
  iterator_range<nodes_iterator> nodes() const { return Nodes; }
  const Node &getNode(unsigned ID) const { return *Nodes[ID]; }

  unsigned getNumNodes() const { return Nodes.size(); }
  BitVector createNodesBitVector() const { return BitVector(Nodes.size()); }

  const Module &getModule() const { return M; }

  CostType getModuleCost() const { return ModuleCost; }
  CostType getCost(const Function &F) const { return CostMap.at(&F); }

  /// \returns the aggregated cost of all nodes in \p BV (bits set to 1 = node
  /// IDs).
  CostType calculateCost(const BitVector &BV) const;

private:
  /// Retrieves the node for \p GV in \p Cache, or creates a new node for it
  /// and updates \p Cache.
  Node &getNode(DenseMap<const GlobalValue *, Node *> &Cache,
                const GlobalValue &GV);

  // Create a new edge between two nodes and add it to both nodes.
  const Edge &createEdge(Node &Src, Node &Dst, EdgeKind EK);

  const Module &M;
  const FunctionsCostMap &CostMap;
  CostType ModuleCost;

  // Final list of nodes with stable ordering.
  SmallVector<Node *> Nodes;

  SpecificBumpPtrAllocator<Node> NodesPool;

  // Edges are trivially destructible objects, so as a small optimization we
  // use a BumpPtrAllocator which avoids destructor calls but also makes
  // allocation faster.
  static_assert(
      std::is_trivially_destructible_v<Edge>,
      "Edge must be trivially destructible to use the BumpPtrAllocator");
  BumpPtrAllocator EdgesPool;
};

/// Nodes in the SplitGraph contain both incoming and outgoing edges.
/// Incoming edges have this node as their Dst, and Outgoing ones have this node
/// as their Src.
///
/// Edge objects are shared by both nodes in Src/Dst. They provide immediate
/// feedback on how two nodes are related, and in which direction they are
/// related, which is valuable information to make splitting decisions.
///
/// Nodes are fundamentally abstract, and any consumers of the graph should
/// treat them as such. While a node will be a function most of the time, we
/// could also create nodes for any other reason. In the future, we could have
/// single nodes for multiple functions, or nodes for GVs, etc.
class SplitGraph::Node {
  friend class SplitGraph;

public:
  Node(unsigned ID, const GlobalValue &GV, CostType IndividualCost,
       bool IsNonCopyable)
      : ID(ID), GV(GV), IndividualCost(IndividualCost),
        IsNonCopyable(IsNonCopyable), IsEntryFnCC(false), IsGraphEntry(false) {
    if (auto *Fn = dyn_cast<Function>(&GV))
      IsEntryFnCC = AMDGPU::isEntryFunctionCC(Fn->getCallingConv());
  }

  /// A 0-indexed ID for the node. The maximum ID (exclusive) is the number of
  /// nodes in the graph. This ID can be used as an index in a BitVector.
  unsigned getID() const { return ID; }

  const Function &getFunction() const { return cast<Function>(GV); }

  /// \returns the cost to import this component into a given module, not
  /// accounting for any dependencies that may need to be imported as well.
  CostType getIndividualCost() const { return IndividualCost; }

  bool isNonCopyable() const { return IsNonCopyable; }
  bool isEntryFunctionCC() const { return IsEntryFnCC; }

  /// \returns whether this is an entry point in the graph. Entry points are
  /// defined as follows: if you take all entry points in the graph, and iterate
  /// their dependencies, you are guaranteed to visit all nodes in the graph at
  /// least once.
  bool isGraphEntryPoint() const { return IsGraphEntry; }

  StringRef getName() const { return GV.getName(); }

  bool hasAnyIncomingEdges() const { return IncomingEdges.size(); }
  bool hasAnyIncomingEdgesOfKind(EdgeKind EK) const {
    return any_of(IncomingEdges, [&](const auto *E) { return E->Kind == EK; });
  }

  bool hasAnyOutgoingEdges() const { return OutgoingEdges.size(); }
  bool hasAnyOutgoingEdgesOfKind(EdgeKind EK) const {
    return any_of(OutgoingEdges, [&](const auto *E) { return E->Kind == EK; });
  }

  iterator_range<edges_iterator> incoming_edges() const {
    return IncomingEdges;
  }

  iterator_range<edges_iterator> outgoing_edges() const {
    return OutgoingEdges;
  }

  bool shouldFollowIndirectCalls() const { return isEntryFunctionCC(); }

  /// Visit all children of this node in a recursive fashion. Also visits Self.
  /// If \ref shouldFollowIndirectCalls returns false, then this only follows
  /// DirectCall edges.
  ///
  /// \param Visitor Visitor function.
  void visitAllDependencies(std::function<void(const Node &)> Visitor) const;

  /// Adds the dependencies of this node to \p BV by setting the bit
  /// corresponding to each node.
  ///
  /// Implemented using \ref visitAllDependencies, hence it follows the same
  /// rules regarding dependencies traversal.
  ///
  /// \param[out] BV The bitvector where the bits should be set.
  void getDependencies(BitVector &BV) const {
    visitAllDependencies([&](const Node &N) { BV.set(N.getID()); });
  }

private:
  void markAsGraphEntry() { IsGraphEntry = true; }

  unsigned ID;
  const GlobalValue &GV;
  CostType IndividualCost;
  bool IsNonCopyable : 1;
  bool IsEntryFnCC : 1;
  bool IsGraphEntry : 1;

  // TODO: Use a single sorted vector (with all incoming/outgoing edges grouped
  // together)
  EdgesVec IncomingEdges;
  EdgesVec OutgoingEdges;
};

void SplitGraph::Node::visitAllDependencies(
    std::function<void(const Node &)> Visitor) const {
  const bool FollowIndirect = shouldFollowIndirectCalls();
  // FIXME: If this can access SplitGraph in the future, use a BitVector
  // instead.
  DenseSet<const Node *> Seen;
  SmallVector<const Node *, 8> WorkList({this});
  while (!WorkList.empty()) {
    const Node *CurN = WorkList.pop_back_val();
    if (auto [It, Inserted] = Seen.insert(CurN); !Inserted)
      continue;

    Visitor(*CurN);

    for (const Edge *E : CurN->outgoing_edges()) {
      if (!FollowIndirect && E->Kind == EdgeKind::IndirectCall)
        continue;
      WorkList.push_back(E->Dst);
    }
  }
}
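
// A typical use, mirroring what buildGraph and verifyGraph do below: collect
// the transitive dependencies of every graph entry point into one BitVector
// to check that the whole graph is covered.
//
//   BitVector Reachable = SG.createNodesBitVector();
//   for (const SplitGraph::Node *N : SG.nodes())
//     if (N->isGraphEntryPoint())
//       N->getDependencies(Reachable);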

/// Checks if \p I has MD_callees metadata and, if it does, parses it and puts
/// the functions in \p Callees.
///
/// \returns true if there was metadata and it was parsed correctly. false if
/// there was no MD or if it contained unknown entries and parsing failed.
/// If this returns false, \p Callees will contain incomplete information
/// and must not be used.
static bool handleCalleesMD(const Instruction &I,
                            SetVector<Function *> &Callees) {
  auto *MD = I.getMetadata(LLVMContext::MD_callees);
  if (!MD)
    return false;

  for (const auto &Op : MD->operands()) {
    Function *Callee = mdconst::extract_or_null<Function>(Op);
    if (!Callee)
      return false;
    Callees.insert(Callee);
  }

  return true;
}
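
// In IR terms (illustrative), for a call site annotated with !callees:
//
//   call void %fptr(), !callees !0
//   !0 = !{ptr @a, ptr @b}
//
// this inserts @a and @b into Callees and returns true, letting the graph use
// precise edges instead of the conservative "calls everything" fallback.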

void SplitGraph::buildGraph(CallGraph &CG) {
  SplitModuleTimer SMT("buildGraph", "graph construction");
  LLVM_DEBUG(
      dbgs()
      << "[build graph] constructing graph representation of the input\n");

  // FIXME(?): Is the callgraph really worth using if we have to iterate the
  // function again whenever it fails to give us enough information?

  // We build the graph by just iterating all functions in the module and
  // working on their direct callees. At the end, all nodes should be linked
  // together as expected.
  DenseMap<const GlobalValue *, Node *> Cache;
  SmallVector<const Function *> FnsWithIndirectCalls, IndirectlyCallableFns;
  for (const Function &Fn : M) {
    if (Fn.isDeclaration())
      continue;

    // Look at direct callees and create the necessary edges in the graph.
    SetVector<const Function *> DirectCallees;
    bool CallsExternal = false;
    for (auto &CGEntry : *CG[&Fn]) {
      auto *CGNode = CGEntry.second;
      if (auto *Callee = CGNode->getFunction()) {
        if (!Callee->isDeclaration())
          DirectCallees.insert(Callee);
      } else if (CGNode == CG.getCallsExternalNode())
        CallsExternal = true;
    }

    // Keep track of this function if it contains an indirect call and/or if it
    // can be indirectly called.
    if (CallsExternal) {
      LLVM_DEBUG(dbgs() << "  [!] callgraph is incomplete for ";
                 Fn.printAsOperand(dbgs());
                 dbgs() << " - analyzing function\n");

      SetVector<Function *> KnownCallees;
      bool HasUnknownIndirectCall = false;
      for (const auto &Inst : instructions(Fn)) {
        // look at all calls without a direct callee.
        const auto *CB = dyn_cast<CallBase>(&Inst);
        if (!CB || CB->getCalledFunction())
          continue;

        // inline assembly can be ignored, unless InlineAsmIsIndirectCall is
        // true.
        if (CB->isInlineAsm()) {
          LLVM_DEBUG(dbgs() << "    found inline assembly\n");
          continue;
        }

        if (handleCalleesMD(Inst, KnownCallees))
          continue;
        // If we failed to parse any !callees MD, or some was missing,
        // the entire KnownCallees list is now unreliable.
        KnownCallees.clear();

        // Everything else is handled conservatively. If we fall into the
        // conservative case don't bother analyzing further.
        HasUnknownIndirectCall = true;
        break;
      }

      if (HasUnknownIndirectCall) {
        LLVM_DEBUG(dbgs() << "    indirect call found\n");
        FnsWithIndirectCalls.push_back(&Fn);
      } else if (!KnownCallees.empty())
        DirectCallees.insert_range(KnownCallees);
    }

    Node &N = getNode(Cache, Fn);
    for (const auto *Callee : DirectCallees)
      createEdge(N, getNode(Cache, *Callee), EdgeKind::DirectCall);

    if (canBeIndirectlyCalled(Fn))
      IndirectlyCallableFns.push_back(&Fn);
  }

  // Post-process functions with indirect calls.
  for (const Function *Fn : FnsWithIndirectCalls) {
    for (const Function *Candidate : IndirectlyCallableFns) {
      Node &Src = getNode(Cache, *Fn);
      Node &Dst = getNode(Cache, *Candidate);
      createEdge(Src, Dst, EdgeKind::IndirectCall);
    }
  }

  // Now, find all entry points.
  SmallVector<Node *, 16> CandidateEntryPoints;
  BitVector NodesReachableByKernels = createNodesBitVector();
  for (Node *N : Nodes) {
    // Functions with an Entry CC are always graph entry points too.
    if (N->isEntryFunctionCC()) {
      N->markAsGraphEntry();
      N->getDependencies(NodesReachableByKernels);
    } else if (!N->hasAnyIncomingEdgesOfKind(EdgeKind::DirectCall))
      CandidateEntryPoints.push_back(N);
  }

  for (Node *N : CandidateEntryPoints) {
    // This can be another entry point if it's not reachable by a kernel
    // TODO: We could sort all of the possible new entries in a stable order
    // (e.g. by cost), then consume them one by one until
    // NodesReachableByKernels is all 1s. It'd allow us to avoid
    // considering some nodes as non-entries in some specific cases.
    if (!NodesReachableByKernels.test(N->getID()))
      N->markAsGraphEntry();
  }

#ifndef NDEBUG
  assert(verifyGraph());
#endif
}

#ifndef NDEBUG
bool SplitGraph::verifyGraph() const {
  unsigned ExpectedID = 0;
  // Exceptionally using a set here in case IDs are messed up.
  DenseSet<const Node *> SeenNodes;
  DenseSet<const Function *> SeenFunctionNodes;
  for (const Node *N : Nodes) {
    if (N->getID() != (ExpectedID++)) {
      errs() << "Node IDs are incorrect!\n";
      return false;
    }

    if (!SeenNodes.insert(N).second) {
      errs() << "Node seen more than once!\n";
      return false;
    }

    if (&getNode(N->getID()) != N) {
      errs() << "getNode doesn't return the right node\n";
      return false;
    }

    for (const Edge *E : N->IncomingEdges) {
      if (!E->Src || !E->Dst || (E->Dst != N) ||
          (find(E->Src->OutgoingEdges, E) == E->Src->OutgoingEdges.end())) {
        errs() << "ill-formed incoming edges\n";
        return false;
      }
    }

    for (const Edge *E : N->OutgoingEdges) {
      if (!E->Src || !E->Dst || (E->Src != N) ||
          (find(E->Dst->IncomingEdges, E) == E->Dst->IncomingEdges.end())) {
        errs() << "ill-formed outgoing edges\n";
        return false;
      }
    }

    const Function &Fn = N->getFunction();
    if (AMDGPU::isEntryFunctionCC(Fn.getCallingConv())) {
      if (N->hasAnyIncomingEdges()) {
        errs() << "Kernels cannot have incoming edges\n";
        return false;
      }
    }

    if (Fn.isDeclaration()) {
      errs() << "declarations shouldn't have nodes!\n";
      return false;
    }

    auto [It, Inserted] = SeenFunctionNodes.insert(&Fn);
    if (!Inserted) {
      errs() << "one function has multiple nodes!\n";
      return false;
    }
  }

  if (ExpectedID != Nodes.size()) {
    errs() << "Node IDs out of sync!\n";
    return false;
  }

  if (createNodesBitVector().size() != getNumNodes()) {
    errs() << "nodes bit vector doesn't have the right size!\n";
    return false;
  }

  // Check we respect the promise of Node::isGraphEntryPoint: all entry points
  // together must reach every node in the graph.
  BitVector BV = createNodesBitVector();
  for (const Node *N : nodes()) {
    if (N->isGraphEntryPoint())
      N->getDependencies(BV);
  }

  // Ensure each function in the module has an associated node.
  for (const auto &Fn : M) {
    if (!Fn.isDeclaration()) {
      if (!SeenFunctionNodes.contains(&Fn)) {
        errs() << "Fn has no associated node in the graph!\n";
        return false;
      }
    }
  }

  if (!BV.all()) {
    errs() << "not all nodes are reachable through the graph's entry points!\n";
    return false;
  }

  return true;
}
#endif

CostType SplitGraph::calculateCost(const BitVector &BV) const {
  CostType Cost = 0;
  for (unsigned NodeID : BV.set_bits())
    Cost += getNode(NodeID).getIndividualCost();
  return Cost;
}

SplitGraph::Node &
SplitGraph::getNode(DenseMap<const GlobalValue *, Node *> &Cache,
                    const GlobalValue &GV) {
  auto &N = Cache[&GV];
  if (N)
    return *N;

  CostType Cost = 0;
  bool NonCopyable = false;
  if (const Function *Fn = dyn_cast<Function>(&GV)) {
    NonCopyable = isNonCopyable(*Fn);
    Cost = CostMap.at(Fn);
  }
  N = new (NodesPool.Allocate()) Node(Nodes.size(), GV, Cost, NonCopyable);
  Nodes.push_back(N);
  assert(&getNode(N->getID()) == N);
  return *N;
}

const SplitGraph::Edge &SplitGraph::createEdge(Node &Src, Node &Dst,
                                               EdgeKind EK) {
  const Edge *E = new (EdgesPool.Allocate<Edge>(1)) Edge(&Src, &Dst, EK);
  Src.OutgoingEdges.push_back(E);
  Dst.IncomingEdges.push_back(E);
  return *E;
}

//===----------------------------------------------------------------------===//
// Split Proposals
//===----------------------------------------------------------------------===//

/// Represents a module splitting proposal.
///
/// Proposals are made of N BitVectors, one for each partition, where each bit
/// set indicates that the node is present and should be copied inside that
/// partition.
///
/// Proposals have several metrics attached so they can be compared/sorted,
/// which lets the driver try multiple strategies, resulting in multiple
/// proposals, and choose the best one out of them.
class SplitProposal {
public:
  SplitProposal(const SplitGraph &SG, unsigned MaxPartitions) : SG(&SG) {
    Partitions.resize(MaxPartitions, {0, SG.createNodesBitVector()});
  }

  void setName(StringRef NewName) { Name = NewName; }
  StringRef getName() const { return Name; }

  const BitVector &operator[](unsigned PID) const {
    return Partitions[PID].second;
  }

  void add(unsigned PID, const BitVector &BV) {
    Partitions[PID].second |= BV;
    updateScore(PID);
  }

  void print(raw_ostream &OS) const;
  LLVM_DUMP_METHOD void dump() const { print(dbgs()); }

  // Find the cheapest partition (lowest cost). In case of ties, always returns
  // the highest partition number.
  unsigned findCheapestPartition() const;

  /// Calculate the CodeSize and Bottleneck scores.
  void calculateScores();

#ifndef NDEBUG
  void verifyCompleteness() const;
#endif

  /// Only available after \ref calculateScores is called.
  ///
  /// A number indicating how much code duplication this proposal creates, as
  /// a ratio of the original module's cost; e.g. 1.2 means the aggregated
  /// cost of all partitions is 20% bigger than the input module's cost
  /// because some functions were duplicated across partitions.
  ///
  /// Value is always rounded up to 2 decimal places.
  ///
  /// A perfect score would be 1.0 (no duplication); the higher the value,
  /// the more code is duplicated.
  double getCodeSizeScore() const { return CodeSizeScore; }

  /// Only available after \ref calculateScores is called.
  ///
  /// A number between [0, 1] which indicates how big of a bottleneck is
  /// expected from the largest partition.
  ///
  /// A score of 1.0 means the biggest partition is as big as the source module,
  /// so build time will be equal to or greater than the build time of the
  /// initial input.
  ///
  /// Value is always rounded up to 2 decimal places.
  ///
  /// This is one of the metrics used to estimate this proposal's build time.
  double getBottleneckScore() const { return BottleneckScore; }

private:
  void updateScore(unsigned PID) {
    assert(SG);
    for (auto &[PCost, Nodes] : Partitions) {
      TotalCost -= PCost;
      PCost = SG->calculateCost(Nodes);
      TotalCost += PCost;
    }
  }

  /// \see getCodeSizeScore
  double CodeSizeScore = 0.0;
  /// \see getBottleneckScore
  double BottleneckScore = 0.0;
  /// Aggregated cost of all partitions
  CostType TotalCost = 0;

  const SplitGraph *SG = nullptr;
  std::string Name;

  std::vector<std::pair<CostType, BitVector>> Partitions;
};

void SplitProposal::print(raw_ostream &OS) const {
  assert(SG);

  OS << "[proposal] " << Name << ", total cost:" << TotalCost
     << ", code size score:" << format("%0.3f", CodeSizeScore)
     << ", bottleneck score:" << format("%0.3f", BottleneckScore) << '\n';
  for (const auto &[PID, Part] : enumerate(Partitions)) {
    const auto &[Cost, NodeIDs] = Part;
    OS << "  - P" << PID << " nodes:" << NodeIDs.count() << " cost: " << Cost
       << '|' << formatRatioOf(Cost, SG->getModuleCost()) << "%\n";
  }
}

unsigned SplitProposal::findCheapestPartition() const {
  assert(!Partitions.empty());
  CostType CurCost = std::numeric_limits<CostType>::max();
  unsigned CurPID = InvalidPID;
  for (const auto &[Idx, Part] : enumerate(Partitions)) {
    if (Part.first <= CurCost) {
      CurPID = Idx;
      CurCost = Part.first;
    }
  }
  assert(CurPID != InvalidPID);
  return CurPID;
}

void SplitProposal::calculateScores() {
  if (Partitions.empty())
    return;

  assert(SG);
  CostType LargestPCost = 0;
  for (auto &[PCost, Nodes] : Partitions) {
    if (PCost > LargestPCost)
      LargestPCost = PCost;
  }

  CostType ModuleCost = SG->getModuleCost();
  CodeSizeScore = double(TotalCost) / ModuleCost;
  assert(CodeSizeScore >= 0.0);

  BottleneckScore = double(LargestPCost) / ModuleCost;

  CodeSizeScore = std::ceil(CodeSizeScore * 100.0) / 100.0;
  BottleneckScore = std::ceil(BottleneckScore * 100.0) / 100.0;
}
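
// A worked example (illustrative numbers): a module of cost 100 split into
// three partitions of cost 40/40/40 has TotalCost = 120, so CodeSizeScore is
// 1.2 (about 20% duplicated code) and BottleneckScore is 0.4 (the largest
// partition is 40% of the module).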

#ifndef NDEBUG
void SplitProposal::verifyCompleteness() const {
  if (Partitions.empty())
    return;

  BitVector Result = Partitions[0].second;
  for (const auto &P : drop_begin(Partitions))
    Result |= P.second;
  assert(Result.all() && "some nodes are missing from this proposal!");
}
#endif

//===-- RecursiveSearchStrategy -------------------------------------------===//

/// Partitioning algorithm.
///
/// This is a recursive search algorithm that can explore multiple possibilities.
///
/// When a cluster of nodes can go into more than one partition, and we haven't
/// reached maximum search depth, we recurse and explore both options and their
/// consequences. Both branches will yield a proposal, and the driver will grade
/// both and choose the best one.
///
/// If max depth is reached, we will use some heuristics to make a choice. Most
/// of the time we will just use the least-pressured (cheapest) partition, but
/// if a cluster is particularly big and there is a good amount of overlap with
/// an existing partition, we will choose that partition instead.
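///
/// For example (illustrative): whenever the cheapest partition and the most
/// similar partition differ for a cluster and the current depth is below
/// MaxDepth, the search forks and explores both assignments. Every fork
/// doubles the number of explored proposals, which is why the worst case is
/// O(2^N) in the maximum depth.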
class RecursiveSearchSplitting {
public:
  using SubmitProposalFn = function_ref<void(SplitProposal)>;

  RecursiveSearchSplitting(const SplitGraph &SG, unsigned NumParts,
                           SubmitProposalFn SubmitProposal);

  void run();

private:
  struct WorkListEntry {
    WorkListEntry(const BitVector &BV) : Cluster(BV) {}

    unsigned NumNonEntryNodes = 0;
    CostType TotalCost = 0;
    CostType CostExcludingGraphEntryPoints = 0;
    BitVector Cluster;
  };

  /// Collects all graph entry points' clusters and sorts them so the most
  /// expensive clusters are viewed first. This will merge clusters together if
  /// they share a non-copyable dependency.
  void setupWorkList();

  /// Recursive function that assigns the worklist item at \p Idx into a
  /// partition of \p SP.
  ///
  /// \p Depth is the current search depth. When this value is equal to
  /// \ref MaxDepth, we can no longer recurse.
  ///
  /// This function only recurses if there is more than one possible assignment,
  /// otherwise it is iterative to avoid creating a call stack that is as big as
  /// \ref WorkList.
  void pickPartition(unsigned Depth, unsigned Idx, SplitProposal SP);

  /// \return A pair: first element is the PID of the partition that has the
  /// most similarities with \p Entry, or \ref InvalidPID if no partition was
  /// found with at least one element in common. The second element is the
  /// aggregated cost of all dependencies in common between \p Entry and that
  /// partition.
  std::pair<unsigned, CostType>
  findMostSimilarPartition(const WorkListEntry &Entry, const SplitProposal &SP);

  const SplitGraph &SG;
  unsigned NumParts;
  SubmitProposalFn SubmitProposal;

  // A Cluster is considered large when its cost, excluding entry points,
  // exceeds this value.
  CostType LargeClusterThreshold = 0;
  unsigned NumProposalsSubmitted = 0;
  SmallVector<WorkListEntry> WorkList;
};

RecursiveSearchSplitting::RecursiveSearchSplitting(
    const SplitGraph &SG, unsigned NumParts, SubmitProposalFn SubmitProposal)
    : SG(SG), NumParts(NumParts), SubmitProposal(SubmitProposal) {
  // arbitrary max value as a safeguard. Anything above 10 will already be
  // slow, this is just a max value to prevent extreme resource exhaustion or
  // unbounded run time.
  if (MaxDepth > 16)
    report_fatal_error("[amdgpu-split-module] search depth of " +
                       Twine(MaxDepth) + " is too high!");
  LargeClusterThreshold =
      (LargeFnFactor != 0.0)
          ? CostType(((SG.getModuleCost() / NumParts) * LargeFnFactor))
          : std::numeric_limits<CostType>::max();
  LLVM_DEBUG(dbgs() << "[recursive search] large cluster threshold set at "
                    << LargeClusterThreshold << "\n");
}
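
// As a concrete instance (illustrative numbers): with a module cost of 1000,
// NumParts = 4 and the default LargeFnFactor of 2.0, the ideal partition size
// is 250 and LargeClusterThreshold is 500. Only clusters whose cost
// (excluding graph entry points) exceeds 500 are considered for the
// overlap-based merge heuristic in pickPartition.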

void RecursiveSearchSplitting::run() {
  {
    SplitModuleTimer SMT("recursive_search_prepare", "preparing worklist");
    setupWorkList();
  }

  {
    SplitModuleTimer SMT("recursive_search_pick", "partitioning");
    SplitProposal SP(SG, NumParts);
    pickPartition(/*BranchDepth=*/0, /*Idx=*/0, SP);
  }
}

void RecursiveSearchSplitting::setupWorkList() {
  // e.g. if A and B are two worklist items, and they both call a non-copyable
  // dependency C, this does:
  //    A=C
  //    B=C
  //    => NodeEC will create a single group (A, B, C) and we create a new
  //    WorkList entry for that group.

  EquivalenceClasses<unsigned> NodeEC;
  for (const SplitGraph::Node *N : SG.nodes()) {
    if (!N->isGraphEntryPoint())
      continue;

    NodeEC.insert(N->getID());
    N->visitAllDependencies([&](const SplitGraph::Node &Dep) {
      if (&Dep != N && Dep.isNonCopyable())
        NodeEC.unionSets(N->getID(), Dep.getID());
    });
  }

  for (const auto &Node : NodeEC) {
    if (!Node->isLeader())
      continue;

    BitVector Cluster = SG.createNodesBitVector();
    for (unsigned M : NodeEC.members(*Node)) {
      const SplitGraph::Node &N = SG.getNode(M);
      if (N.isGraphEntryPoint())
        N.getDependencies(Cluster);
    }
    WorkList.emplace_back(std::move(Cluster));
  }

  // Calculate costs and other useful information.
  for (WorkListEntry &Entry : WorkList) {
    for (unsigned NodeID : Entry.Cluster.set_bits()) {
      const SplitGraph::Node &N = SG.getNode(NodeID);
      const CostType Cost = N.getIndividualCost();

      Entry.TotalCost += Cost;
      if (!N.isGraphEntryPoint()) {
        Entry.CostExcludingGraphEntryPoints += Cost;
        ++Entry.NumNonEntryNodes;
      }
    }
  }

  stable_sort(WorkList, [](const WorkListEntry &A, const WorkListEntry &B) {
    if (A.TotalCost != B.TotalCost)
      return A.TotalCost > B.TotalCost;

    if (A.CostExcludingGraphEntryPoints != B.CostExcludingGraphEntryPoints)
      return A.CostExcludingGraphEntryPoints > B.CostExcludingGraphEntryPoints;

    if (A.NumNonEntryNodes != B.NumNonEntryNodes)
      return A.NumNonEntryNodes > B.NumNonEntryNodes;

    return A.Cluster.count() > B.Cluster.count();
  });

  LLVM_DEBUG({
    dbgs() << "[recursive search] worklist:\n";
    for (const auto &[Idx, Entry] : enumerate(WorkList)) {
      dbgs() << "  - [" << Idx << "]: ";
      for (unsigned NodeID : Entry.Cluster.set_bits())
        dbgs() << NodeID << " ";
      dbgs() << "(total_cost:" << Entry.TotalCost
             << ", cost_excl_entries:" << Entry.CostExcludingGraphEntryPoints
             << ")\n";
    }
  });
}

void RecursiveSearchSplitting::pickPartition(unsigned Depth, unsigned Idx,
                                             SplitProposal SP) {
  while (Idx < WorkList.size()) {
    // Step 1: Determine candidate PIDs.
    const WorkListEntry &Entry = WorkList[Idx];
    const BitVector &Cluster = Entry.Cluster;

    // Default option is to do load-balancing, AKA assign to the least
    // pressured partition.
    const unsigned CheapestPID = SP.findCheapestPartition();
    assert(CheapestPID != InvalidPID);

    // Explore assigning to the kernel that contains the most dependencies in
    // common.
    const auto [MostSimilarPID, SimilarDepsCost] =
        findMostSimilarPartition(Entry, SP);

    // We can choose to explore only one path if we only have one valid path,
    // or if we reached maximum search depth and can no longer branch out.
    unsigned SinglePIDToTry = InvalidPID;
    if (MostSimilarPID == InvalidPID) // no similar PID found
      SinglePIDToTry = CheapestPID;
    else if (MostSimilarPID == CheapestPID) // both landed on the same PID
      SinglePIDToTry = CheapestPID;
    else if (Depth >= MaxDepth) {
      // We have to choose one path. Use a heuristic to guess which one will be
      // more appropriate.
      if (Entry.CostExcludingGraphEntryPoints > LargeClusterThreshold) {
        // Check if the amount of code in common makes it worth it.
        assert(SimilarDepsCost && Entry.CostExcludingGraphEntryPoints);
        const double Ratio = static_cast<double>(SimilarDepsCost) /
                             Entry.CostExcludingGraphEntryPoints;
        assert(Ratio >= 0.0 && Ratio <= 1.0);
        if (Ratio > LargeFnOverlapForMerge) {
          // For debug, just print "L", so we'll see "L3=P3" for instance, which
          // will mean we reached max depth and chose P3 based on this
          // heuristic.
          LLVM_DEBUG(dbgs() << 'L');
          SinglePIDToTry = MostSimilarPID;
        }
      } else
        SinglePIDToTry = CheapestPID;
    }

    // Step 2: Explore candidates.

    // When we only explore one possible path, and thus branch depth doesn't
    // increase, do not recurse; iterate instead.
    if (SinglePIDToTry != InvalidPID) {
      LLVM_DEBUG(dbgs() << Idx << "=P" << SinglePIDToTry << ' ');
      // Only one path to explore, don't clone SP, don't increase depth.
      SP.add(SinglePIDToTry, Cluster);
      ++Idx;
      continue;
    }

    assert(MostSimilarPID != InvalidPID);

    // We explore multiple paths: recurse at increased depth, then stop this
    // function.

    LLVM_DEBUG(dbgs() << '\n');

    // lb = load balancing = put in cheapest partition
    {
      SplitProposal BranchSP = SP;
      LLVM_DEBUG(dbgs().indent(Depth)
                 << " [lb] " << Idx << "=P" << CheapestPID << "? ");
      BranchSP.add(CheapestPID, Cluster);
      pickPartition(Depth + 1, Idx + 1, BranchSP);
    }

    // ms = most similar = put in partition with the most in common
    {
      SplitProposal BranchSP = SP;
      LLVM_DEBUG(dbgs().indent(Depth)
                 << " [ms] " << Idx << "=P" << MostSimilarPID << "? ");
      BranchSP.add(MostSimilarPID, Cluster);
      pickPartition(Depth + 1, Idx + 1, BranchSP);
    }

    return;
  }

  // Step 3: If we assigned all WorkList items, submit the proposal.

  assert(Idx == WorkList.size());
  assert(NumProposalsSubmitted <= (2u << MaxDepth) &&
         "Search got out of bounds?");
  SP.setName("recursive_search (depth=" + std::to_string(Depth) + ") #" +
             std::to_string(NumProposalsSubmitted++));
  LLVM_DEBUG(dbgs() << '\n');
  SubmitProposal(SP);
}

std::pair<unsigned, CostType>
RecursiveSearchSplitting::findMostSimilarPartition(const WorkListEntry &Entry,
                                                   const SplitProposal &SP) {
  if (!Entry.NumNonEntryNodes)
    return {InvalidPID, 0};

  // We take the partition that is the most similar using Cost as a metric.
  // So we take the set of nodes in common, compute their aggregated cost, and
  // pick the partition with the highest cost in common.
  unsigned ChosenPID = InvalidPID;
  CostType ChosenCost = 0;
  for (unsigned PID = 0; PID < NumParts; ++PID) {
    BitVector BV = SP[PID];
    BV &= Entry.Cluster; // FIXME: & doesn't work between BVs?!

    if (BV.none())
      continue;

    const CostType Cost = SG.calculateCost(BV);

    if (ChosenPID == InvalidPID || ChosenCost < Cost ||
        (ChosenCost == Cost && PID > ChosenPID)) {
      ChosenPID = PID;
      ChosenCost = Cost;
    }
  }

  return {ChosenPID, ChosenCost};
}

//===----------------------------------------------------------------------===//
// DOTGraph Printing Support
//===----------------------------------------------------------------------===//

const SplitGraph::Node *mapEdgeToDst(const SplitGraph::Edge *E) {
  return E->Dst;
}

using SplitGraphEdgeDstIterator =
    mapped_iterator<SplitGraph::edges_iterator, decltype(&mapEdgeToDst)>;

} // namespace

template <> struct GraphTraits<SplitGraph> {
  using NodeRef = const SplitGraph::Node *;
  using nodes_iterator = SplitGraph::nodes_iterator;
  using ChildIteratorType = SplitGraphEdgeDstIterator;

  using EdgeRef = const SplitGraph::Edge *;
  using ChildEdgeIteratorType = SplitGraph::edges_iterator;

  static NodeRef getEntryNode(NodeRef N) { return N; }

  static ChildIteratorType child_begin(NodeRef Ref) {
    return {Ref->outgoing_edges().begin(), mapEdgeToDst};
  }
  static ChildIteratorType child_end(NodeRef Ref) {
    return {Ref->outgoing_edges().end(), mapEdgeToDst};
  }

  static nodes_iterator nodes_begin(const SplitGraph &G) {
    return G.nodes().begin();
  }
  static nodes_iterator nodes_end(const SplitGraph &G) {
    return G.nodes().end();
  }
};

template <> struct DOTGraphTraits<SplitGraph> : public DefaultDOTGraphTraits {
  DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}

  static std::string getGraphName(const SplitGraph &SG) {
    return SG.getModule().getName().str();
  }

  std::string getNodeLabel(const SplitGraph::Node *N, const SplitGraph &SG) {
    return N->getName().str();
  }

  static std::string getNodeDescription(const SplitGraph::Node *N,
                                        const SplitGraph &SG) {
    std::string Result;
    if (N->isEntryFunctionCC())
      Result += "entry-fn-cc ";
    if (N->isNonCopyable())
      Result += "non-copyable ";
    Result += "cost:" + std::to_string(N->getIndividualCost());
    return Result;
  }

  static std::string getNodeAttributes(const SplitGraph::Node *N,
                                       const SplitGraph &SG) {
    return N->hasAnyIncomingEdges() ? "" : "color=\"red\"";
  }

  static std::string getEdgeAttributes(const SplitGraph::Node *N,
                                       SplitGraphEdgeDstIterator EI,
                                       const SplitGraph &SG) {
    switch ((*EI.getCurrent())->Kind) {
    case SplitGraph::EdgeKind::DirectCall:
      return "";
    case SplitGraph::EdgeKind::IndirectCall:
      return "style=\"dashed\"";
    }
    llvm_unreachable("Unknown SplitGraph::EdgeKind enum");
  }
};

//===----------------------------------------------------------------------===//
// Driver
//===----------------------------------------------------------------------===//

namespace {

// If we didn't externalize GVs, then local GVs need to be conservatively
// imported into every module (including their initializers), and then cleaned
// up afterwards.
static bool needsConservativeImport(const GlobalValue *GV) {
  if (const auto *Var = dyn_cast<GlobalVariable>(GV))
    return Var->hasLocalLinkage();
  return isa<GlobalAlias>(GV);
}
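
// In IR terms (illustrative): when externalization is disabled via
// -amdgpu-module-splitting-no-externalize-globals, a local GV such as
//   @lut = internal constant [4 x i32] zeroinitializer
// is cloned with its initializer into every partition, and the clones that
// end up unused are erased afterwards (see the cleanup loop further below).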

/// Prints a summary of the partition \p N, represented by module \p M, to \p
/// OS.
static void printPartitionSummary(raw_ostream &OS, unsigned N, const Module &M,
                                  unsigned PartCost, unsigned ModuleCost) {
  OS << "*** Partition P" << N << " ***\n";

  for (const auto &Fn : M) {
    if (!Fn.isDeclaration())
      OS << " - [function] " << Fn.getName() << "\n";
  }

  for (const auto &GV : M.globals()) {
    if (GV.hasInitializer())
      OS << " - [global] " << GV.getName() << "\n";
  }

  OS << "Partition contains " << formatRatioOf(PartCost, ModuleCost)
     << "% of the source\n";
}

static void evaluateProposal(SplitProposal &Best, SplitProposal New) {
  SplitModuleTimer SMT("proposal_evaluation", "proposal ranking algorithm");

  LLVM_DEBUG({
    New.verifyCompleteness();
    if (DebugProposalSearch)
      New.print(dbgs());
  });

  const double CurBScore = Best.getBottleneckScore();
  const double CurCSScore = Best.getCodeSizeScore();
  const double NewBScore = New.getBottleneckScore();
  const double NewCSScore = New.getCodeSizeScore();

  // TODO: Improve this
  //   We can probably lower the precision of the comparison at first,
  //   e.g. if we have
  //     - (Current): BScore: 0.489 CSCore 1.105
  //     - (New):     BScore: 0.475 CSCore 1.305
  //   Currently we'd choose the new one because the bottleneck score is
  //   lower, but the new one duplicates more code. It may be worth it to
  //   discard the new proposal as the impact on build time is negligible.

  // Compare them
  bool IsBest = false;
  if (NewBScore < CurBScore)
    IsBest = true;
  else if (NewBScore == CurBScore)
    IsBest = (NewCSScore < CurCSScore); // Use code size as tie breaker.

  if (IsBest)
    Best = std::move(New);

  LLVM_DEBUG(if (DebugProposalSearch) {
    if (IsBest)
      dbgs() << "[search] new best proposal!\n";
    else
      dbgs() << "[search] discarding - not profitable\n";
  });
}

/// Trivial helper to create an identical copy of \p M.
static std::unique_ptr<Module> cloneAll(const Module &M) {
  ValueToValueMapTy VMap;
  return CloneModule(M, VMap, [&](const GlobalValue *GV) { return true; });
}

/// Writes \p SG as a DOTGraph to \ref ModuleDotCfgOutput if requested.
static void writeDOTGraph(const SplitGraph &SG) {
  if (ModuleDotCfgOutput.empty())
    return;

  std::error_code EC;
  raw_fd_ostream OS(ModuleDotCfgOutput, EC);
  if (EC) {
    errs() << "[" DEBUG_TYPE "]: cannot open '" << ModuleDotCfgOutput
           << "' - DOTGraph will not be printed\n";
    return;
  }
  WriteGraph(OS, SG, /*ShortName=*/false,
             /*Title=*/SG.getModule().getName());
}

static void splitAMDGPUModule(
    GetTTIFn GetTTI, Module &M, unsigned NumParts,
    function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
  CallGraph CG(M);

  // Externalize functions whose addresses are taken.
  //
  // This is needed because partitioning is purely based on calls, but sometimes
  // a kernel/function may just look at the address of another local function
  // and not do anything (no calls). After partitioning, that local function may
  // end up in a different module (so it's just a declaration in the module
  // where its address is taken), which emits an "undefined hidden symbol"
  // linker error.
  //
  // Additionally, it guides partitioning to not duplicate this function if it's
  // called directly at some point.
  //
  // TODO: Could we be smarter about this? This makes all functions whose
  // addresses are taken non-copyable. We should probably model this type of
  // constraint in the graph and use it to guide splitting, instead of
  // externalizing like this. Maybe non-copyable should really mean "keep one
  // visible copy, then internalize all other copies" for some functions?
  if (!NoExternalizeOnAddrTaken) {
    for (auto &Fn : M) {
      // TODO: Should aliases count? Probably not but they're so rare I'm not
      // sure it's worth fixing.
      if (Fn.hasLocalLinkage() && Fn.hasAddressTaken()) {
        LLVM_DEBUG(dbgs() << "[externalize] "; Fn.printAsOperand(dbgs());
                   dbgs() << " because its address is taken\n");
        externalize(Fn);
      }
    }
  }

  // Externalize local GVs, which avoids duplicating their initializers, which
  // in turn helps keep code size in check.
  if (!NoExternalizeGlobals) {
    for (auto &GV : M.globals()) {
      if (GV.hasLocalLinkage())
        LLVM_DEBUG(dbgs() << "[externalize] GV " << GV.getName() << '\n');
      externalize(GV);
    }
  }

  // Start by calculating the cost of every function in the module, as well as
  // the module's overall cost.
  FunctionsCostMap FnCosts;
  const CostType ModuleCost = calculateFunctionCosts(GetTTI, M, FnCosts);

  // Build the SplitGraph, which represents the module's functions and models
  // their dependencies accurately.
  SplitGraph SG(M, FnCosts, ModuleCost);
  SG.buildGraph(CG);

  if (SG.empty()) {
    LLVM_DEBUG(
        dbgs()
        << "[!] no nodes in graph, input is empty - no splitting possible\n");
    ModuleCallback(cloneAll(M));
    return;
  }

  LLVM_DEBUG({
    dbgs() << "[graph] nodes:\n";
    for (const SplitGraph::Node *N : SG.nodes()) {
      dbgs() << "  - [" << N->getID() << "]: " << N->getName() << " "
             << (N->isGraphEntryPoint() ? "(entry)" : "") << " "
             << (N->isNonCopyable() ? "(noncopyable)" : "") << "\n";
    }
  });

  writeDOTGraph(SG);

  LLVM_DEBUG(dbgs() << "[search] testing splitting strategies\n");

  std::optional<SplitProposal> Proposal;
  const auto EvaluateProposal = [&](SplitProposal SP) {
    SP.calculateScores();
    if (!Proposal)
      Proposal = std::move(SP);
    else
      evaluateProposal(*Proposal, std::move(SP));
  };

  // TODO: It would be very easy to create new strategies by just adding a base
  // class to RecursiveSearchSplitting and abstracting it away.
  RecursiveSearchSplitting(SG, NumParts, EvaluateProposal).run();
  LLVM_DEBUG(if (Proposal) dbgs() << "[search done] selected proposal: "
                                  << Proposal->getName() << "\n";);

  if (!Proposal) {
    LLVM_DEBUG(dbgs() << "[!] no proposal made, no splitting possible!\n");
    ModuleCallback(cloneAll(M));
    return;
  }

  LLVM_DEBUG(Proposal->print(dbgs()););

  std::optional<raw_fd_ostream> SummariesOS;
  if (!PartitionSummariesOutput.empty()) {
    std::error_code EC;
    SummariesOS.emplace(PartitionSummariesOutput, EC);
    if (EC)
      errs() << "[" DEBUG_TYPE "]: cannot open '" << PartitionSummariesOutput
             << "' - Partition summaries will not be printed\n";
  }

  // One module will import all GlobalValues that are not Functions
  // and are not subject to conservative import.
  bool ImportAllGVs = true;

  for (unsigned PID = 0; PID < NumParts; ++PID) {
    SplitModuleTimer SMT2("modules_creation",
                          "creating modules for each partition");
    LLVM_DEBUG(dbgs() << "[split] creating new modules\n");

    DenseSet<const Function *> FnsInPart;
    for (unsigned NodeID : (*Proposal)[PID].set_bits())
      FnsInPart.insert(&SG.getNode(NodeID).getFunction());

    // Don't create empty modules.
    if (FnsInPart.empty()) {
      LLVM_DEBUG(dbgs() << "[split] P" << PID
                        << " is empty, not creating module\n");
      continue;
    }

    ValueToValueMapTy VMap;
    CostType PartCost = 0;
    std::unique_ptr<Module> MPart(
        CloneModule(M, VMap, [&](const GlobalValue *GV) {
          // Functions go in their assigned partition.
          if (const auto *Fn = dyn_cast<Function>(GV)) {
            if (FnsInPart.contains(Fn)) {
              PartCost += SG.getCost(*Fn);
              return true;
            }
            return false;
          }

          // Everything else goes in the first non-empty module we create.
          return ImportAllGVs || needsConservativeImport(GV);
        }));

    ImportAllGVs = false;

    // FIXME: Aliases aren't seen often, and their handling isn't perfect so
    // bugs are possible.

    // Clean-up conservatively imported GVs without any users.
    for (auto &GV : make_early_inc_range(MPart->global_values())) {
      if (needsConservativeImport(&GV) && GV.use_empty())
        GV.eraseFromParent();
    }

    if (SummariesOS)
      printPartitionSummary(*SummariesOS, PID, *MPart, PartCost, ModuleCost);

    LLVM_DEBUG(
        printPartitionSummary(dbgs(), PID, *MPart, PartCost, ModuleCost));

    ModuleCallback(std::move(MPart));
  }
}
} // namespace

PreservedAnalyses AMDGPUSplitModulePass::run(Module &M,
                                             ModuleAnalysisManager &MAM) {
  SplitModuleTimer SMT(
      "total", "total pass runtime (incl. potentially waiting for lockfile)");

  FunctionAnalysisManager &FAM =
      MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
  const auto TTIGetter = [&FAM](Function &F) -> const TargetTransformInfo & {
    return FAM.getResult<TargetIRAnalysis>(F);
  };

  bool Done = false;
#ifndef NDEBUG
  if (UseLockFile) {
    SmallString<128> LockFilePath;
    sys::path::system_temp_directory(/*ErasedOnReboot=*/true, LockFilePath);
    sys::path::append(LockFilePath, "amdgpu-split-module-debug");
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " using lockfile '" << LockFilePath
                      << "'\n");

    while (true) {
      llvm::LockFileManager Lock(LockFilePath.str());
      bool Owned;
      if (Error Err = Lock.tryLock().moveInto(Owned)) {
        consumeError(std::move(Err));
        LLVM_DEBUG(
            dbgs() << "[amdgpu-split-module] unable to acquire lockfile, debug "
                      "output may be mangled by other processes\n");
      } else if (!Owned) {
        switch (Lock.waitForUnlockFor(std::chrono::seconds(90))) {
        case WaitForUnlockResult::Success:
          break;
        case WaitForUnlockResult::OwnerDied:
          continue; // try again to get the lock.
        case WaitForUnlockResult::Timeout:
          LLVM_DEBUG(
              dbgs()
              << "[amdgpu-split-module] unable to acquire lockfile, debug "
                 "output may be mangled by other processes\n");
          Lock.unsafeMaybeUnlock();
          break; // give up
        }
      }

      splitAMDGPUModule(TTIGetter, M, N, ModuleCallback);
      Done = true;
      break;
    }
  }
#endif

  if (!Done)
    splitAMDGPUModule(TTIGetter, M, N, ModuleCallback);

  // We can change linkage/visibilities in the input, consider that nothing is
  // preserved just to be safe. This pass runs last anyway.
  return PreservedAnalyses::none();
}
} // namespace llvm