1//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements support for context disambiguation of allocation
10// calls for profile guided heap optimization. Specifically, it uses Memprof
11// profiles which indicate context specific allocation behavior (currently
12// distinguishing cold vs not cold memory allocations). Cloning is performed to
13// expose the cold allocation call contexts, and the allocation calls are
14// subsequently annotated with an attribute for later transformation.
15//
16// The transformations can be performed either directly on IR (regular LTO), or
17// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
18// Both types of LTO operate on the same base graph representation, which
19// uses CRTP to support either IR or Index formats.
20//
21//===----------------------------------------------------------------------===//
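//
// For illustration, a rough sketch of how this pass is typically driven (the
// surrounding new-pass-manager setup is assumed and omitted; the pass name
// below matches DEBUG_TYPE further down in this file):
//
//   ModulePassManager MPM;
//   MPM.addPass(MemProfContextDisambiguation());
//   MPM.run(M, MAM);
//
// or, roughly, from the command line:
//   opt -passes=memprof-context-disambiguation module.bc
//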
22
23#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
24#include "llvm/ADT/DenseMap.h"
25#include "llvm/ADT/DenseSet.h"
26#include "llvm/ADT/MapVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
38#include "llvm/IR/Module.h"
40#include "llvm/Pass.h"
44#include "llvm/Support/SHA1.h"
46#include "llvm/Transforms/IPO.h"
50#include <deque>
51#include <sstream>
52#include <unordered_map>
53#include <vector>
54using namespace llvm;
55using namespace llvm::memprof;
56
57#define DEBUG_TYPE "memprof-context-disambiguation"
58
59STATISTIC(FunctionClonesAnalysis,
60 "Number of function clones created during whole program analysis");
61STATISTIC(FunctionClonesThinBackend,
62 "Number of function clones created during ThinLTO backend");
63STATISTIC(FunctionsClonedThinBackend,
64 "Number of functions that had clones created during ThinLTO backend");
65STATISTIC(
66 FunctionCloneDuplicatesThinBackend,
67 "Number of function clone duplicates detected during ThinLTO backend");
68STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
69 "cloned) during whole program analysis");
70STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
71 "during whole program analysis");
72STATISTIC(AllocTypeNotColdThinBackend,
73 "Number of not cold static allocations (possibly cloned) during "
74 "ThinLTO backend");
75STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
76 "(possibly cloned) during ThinLTO backend");
77STATISTIC(OrigAllocsThinBackend,
78 "Number of original (not cloned) allocations with memprof profiles "
79 "during ThinLTO backend");
80STATISTIC(
81 AllocVersionsThinBackend,
82 "Number of allocation versions (including clones) during ThinLTO backend");
83STATISTIC(MaxAllocVersionsThinBackend,
84 "Maximum number of allocation versions created for an original "
85 "allocation during ThinLTO backend");
86STATISTIC(UnclonableAllocsThinBackend,
87 "Number of unclonable ambigous allocations during ThinLTO backend");
88STATISTIC(RemovedEdgesWithMismatchedCallees,
89 "Number of edges removed due to mismatched callees (profiled vs IR)");
90STATISTIC(FoundProfiledCalleeCount,
91 "Number of profiled callees found via tail calls");
92STATISTIC(FoundProfiledCalleeDepth,
93 "Aggregate depth of profiled callees found via tail calls");
94STATISTIC(FoundProfiledCalleeMaxDepth,
95 "Maximum depth of profiled callees found via tail calls");
96STATISTIC(FoundProfiledCalleeNonUniquelyCount,
97 "Number of profiled callees found via multiple tail call chains");
98STATISTIC(DeferredBackedges, "Number of backedges with deferred cloning");
99STATISTIC(NewMergedNodes, "Number of new nodes created during merging");
100STATISTIC(NonNewMergedNodes, "Number of non new nodes used during merging");
101STATISTIC(MissingAllocForContextId,
102 "Number of missing alloc nodes for context ids");
103STATISTIC(SkippedCallsCloning,
104 "Number of calls skipped during cloning due to unexpected operand");
105STATISTIC(MismatchedCloneAssignments,
106 "Number of callsites assigned to call multiple non-matching clones");
107STATISTIC(TotalMergeInvokes, "Number of merge invocations for nodes");
108STATISTIC(TotalMergeIters, "Number of merge iterations for nodes");
109STATISTIC(MaxMergeIters, "Max merge iterations for nodes");
110
112 "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
113 cl::value_desc("filename"),
114 cl::desc("Specify the path prefix of the MemProf dot files."));
115
116static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
117 cl::Hidden,
118 cl::desc("Export graph to dot files."));
119
120// TODO: Remove this option once new handling is validated more widely.
122 "memprof-merge-iteration", cl::init(true), cl::Hidden,
123 cl::desc("Iteratively apply merging on a node to catch new callers"));
124
125// How much of the graph to export to dot.
126enum DotScope {
127 All, // The full CCG graph.
128 Alloc, // Only contexts for the specified allocation.
129 Context, // Only the specified context.
130};
131
133 "memprof-dot-scope", cl::desc("Scope of graph to export to dot"),
136 clEnumValN(DotScope::All, "all", "Export full callsite graph"),
138 "Export only nodes with contexts feeding given "
139 "-memprof-dot-alloc-id"),
140 clEnumValN(DotScope::Context, "context",
141 "Export only nodes with given -memprof-dot-context-id")));
142
143static cl::opt<unsigned>
144 AllocIdForDot("memprof-dot-alloc-id", cl::init(0), cl::Hidden,
145 cl::desc("Id of alloc to export if -memprof-dot-scope=alloc "
146 "or to highlight if -memprof-dot-scope=all"));
147
149 "memprof-dot-context-id", cl::init(0), cl::Hidden,
150 cl::desc("Id of context to export if -memprof-dot-scope=context or to "
151 "highlight otherwise"));
152
153static cl::opt<bool>
154 DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
155 cl::desc("Dump CallingContextGraph to stdout after each stage."));
156
157static cl::opt<bool>
158 VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
159 cl::desc("Perform verification checks on CallingContextGraph."));
160
161static cl::opt<bool>
162 VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
163 cl::desc("Perform frequent verification checks on nodes."));
164
166 "memprof-import-summary",
167 cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
168 cl::Hidden);
169
170static cl::opt<unsigned>
171 TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
172 cl::Hidden,
173 cl::desc("Max depth to recursively search for missing "
174 "frames through tail calls."));
175
176// Optionally enable cloning of callsites involved with recursive cycles
178 "memprof-allow-recursive-callsites", cl::init(true), cl::Hidden,
179 cl::desc("Allow cloning of callsites involved in recursive cycles"));
180
182 "memprof-clone-recursive-contexts", cl::init(true), cl::Hidden,
183 cl::desc("Allow cloning of contexts through recursive cycles"));
184
185// Generally this is needed for correct assignment of allocation clones to
186// function clones; however, allow it to be disabled for debugging while the
187// functionality is new and being tested more widely.
188static cl::opt<bool>
189 MergeClones("memprof-merge-clones", cl::init(true), cl::Hidden,
190 cl::desc("Merge clones before assigning functions"));
191
192// When disabled, try to detect and prevent cloning of recursive contexts.
193// This is only necessary until we support cloning through recursive cycles.
194// Leave on by default for now, as disabling requires a little bit of compile
195// time overhead and doesn't affect correctness; it will just inflate the cold
196// hinted bytes reporting a bit when -memprof-report-hinted-sizes is enabled.
198 "memprof-allow-recursive-contexts", cl::init(true), cl::Hidden,
199 cl::desc("Allow cloning of contexts having recursive cycles"));
200
201// Set the minimum absolute count threshold for allowing inlining of indirect
202// calls promoted during cloning.
204 "memprof-icp-noinline-threshold", cl::init(2), cl::Hidden,
205 cl::desc("Minimum absolute count for promoted target to be inlinable"));
206
207namespace llvm {
209 "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
210 cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));
211
212// Indicate we are linking with an allocator that supports hot/cold operator
213// new interfaces.
215 "supports-hot-cold-new", cl::init(false), cl::Hidden,
216 cl::desc("Linking with hot/cold operator new interfaces"));
217
219 "memprof-require-definition-for-promotion", cl::init(false), cl::Hidden,
220 cl::desc(
221 "Require target function definition when promoting indirect calls"));
222
223extern cl::opt<bool> MemProfReportHintedSizes;
224extern cl::opt<unsigned> MinClonedColdBytePercent;
225
226} // namespace llvm
227
228namespace {
229/// CRTP base for graphs built from either IR or ThinLTO summary index.
230///
231/// The graph represents the call contexts in all memprof metadata on allocation
232/// calls, with nodes for the allocations themselves, as well as for the calls
233/// in each context. The graph is initially built from the allocation memprof
234/// metadata (or summary) MIBs. It is then updated to match calls with callsite
235/// metadata onto the nodes, updating it to reflect any inlining performed on
236/// those calls.
237///
238/// Each MIB (representing an allocation's call context with allocation
239/// behavior) is assigned a unique context id during the graph build. The edges
240/// and nodes in the graph are decorated with the context ids they carry. This
241/// is used to correctly update the graph when cloning is performed so that we
242/// can uniquify the context for a single (possibly cloned) allocation.
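/// As a minimal sketch of the CRTP dispatch used throughout this class (the
/// GraphBase/IRGraph names below are illustrative only, not types in this
/// file), the base forwards to the derived class without virtual calls:
///
///   template <typename Derived> struct GraphBase {
///     uint64_t getStackId(uint64_t Id) const {
///       return static_cast<const Derived *>(this)->getStackId(Id);
///     }
///   };
///   struct IRGraph : GraphBase<IRGraph> {
///     // For IR the value already is the full stack id.
///     uint64_t getStackId(uint64_t Id) const { return Id; }
///   };
///
/// ModuleCallsiteContextGraph and IndexCallsiteContextGraph below play this
/// derived-class role for the IR and summary-index cases respectively.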
243template <typename DerivedCCG, typename FuncTy, typename CallTy>
244class CallsiteContextGraph {
245public:
246 CallsiteContextGraph() = default;
247 CallsiteContextGraph(const CallsiteContextGraph &) = default;
248 CallsiteContextGraph(CallsiteContextGraph &&) = default;
249
250 /// Main entry point to perform analysis and transformations on graph.
251 bool process();
252
253 /// Perform cloning on the graph necessary to uniquely identify the allocation
254 /// behavior of an allocation based on its context.
255 void identifyClones();
256
257 /// Assign callsite clones to functions, cloning functions as needed to
258 /// accommodate the combinations of their callsite clones reached by callers.
259 /// For regular LTO this clones functions and callsites in the IR, but for
260 /// ThinLTO the cloning decisions are noted in the summaries and later applied
261 /// in applyImport.
262 bool assignFunctions();
263
264 void dump() const;
265 void print(raw_ostream &OS) const;
266 void printTotalSizes(raw_ostream &OS) const;
267
268 friend raw_ostream &operator<<(raw_ostream &OS,
269 const CallsiteContextGraph &CCG) {
270 CCG.print(OS);
271 return OS;
272 }
273
274 friend struct GraphTraits<
275 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
276 friend struct DOTGraphTraits<
277 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
278
279 void exportToDot(std::string Label) const;
280
281 /// Represents a function clone via FuncTy pointer and clone number pair.
282 struct FuncInfo final
283 : public std::pair<FuncTy *, unsigned /*Clone number*/> {
284 using Base = std::pair<FuncTy *, unsigned>;
285 FuncInfo(const Base &B) : Base(B) {}
286 FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
287 explicit operator bool() const { return this->first != nullptr; }
288 FuncTy *func() const { return this->first; }
289 unsigned cloneNo() const { return this->second; }
290 };
291
292 /// Represents a callsite clone via CallTy and clone number pair.
293 struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
294 using Base = std::pair<CallTy, unsigned>;
295 CallInfo(const Base &B) : Base(B) {}
296 CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
297 : Base(Call, CloneNo) {}
298 explicit operator bool() const { return (bool)this->first; }
299 CallTy call() const { return this->first; }
300 unsigned cloneNo() const { return this->second; }
301 void setCloneNo(unsigned N) { this->second = N; }
302 void print(raw_ostream &OS) const {
303 if (!operator bool()) {
304 assert(!cloneNo());
305 OS << "null Call";
306 return;
307 }
308 call()->print(OS);
309 OS << "\t(clone " << cloneNo() << ")";
310 }
311 void dump() const {
312 print(dbgs());
313 dbgs() << "\n";
314 }
315 friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
316 Call.print(OS);
317 return OS;
318 }
319 };
320
321 struct ContextEdge;
322
323 /// Node in the Callsite Context Graph
324 struct ContextNode {
325 // Assigned to nodes as they are created, useful for debugging.
326 unsigned NodeId = 0;
327
328 // Keep this for now since in the IR case where we have an Instruction* it
329 // is not as immediately discoverable. Used for printing richer information
330 // when dumping graph.
331 bool IsAllocation;
332
333 // Keeps track of when the Call was reset to null because there was
334 // recursion.
335 bool Recursive = false;
336
337 // This will be formed by ORing together the AllocationType enum values
338 // for contexts including this node.
339 uint8_t AllocTypes = 0;
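    // For example, a node reached by one NotCold and one Cold context ends up
    // with AllocTypes == ((uint8_t)AllocationType::NotCold |
    //                     (uint8_t)AllocationType::Cold); allocTypeToUse()
    // further below maps that ambiguous value back to NotCold for nodes that
    // cannot be cloned apart.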
340
341 // The corresponding allocation or interior call. This is the primary call
342 // for which we have created this node.
343 CallInfo Call;
344
345 // List of other calls that can be treated the same as the primary call
346 // through cloning. I.e. located in the same function and have the same
347 // (possibly pruned) stack ids. They will be updated the same way as the
348 // primary call when assigning to function clones.
349 SmallVector<CallInfo, 0> MatchingCalls;
350
351 // For alloc nodes this is a unique id assigned when constructed, and for
352 // callsite stack nodes it is the original stack id when the node is
353 // constructed from the memprof MIB metadata on the alloc nodes. Note that
354 // this is only used when matching callsite metadata onto the stack nodes
355 // created when processing the allocation memprof MIBs, and for labeling
356 // nodes in the dot graph. Therefore we don't bother to assign a value for
357 // clones.
358 uint64_t OrigStackOrAllocId = 0;
359
360 // Edges to all callees in the profiled call stacks.
361 // TODO: Should this be a map (from Callee node) for more efficient lookup?
362 std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;
363
364 // Edges to all callers in the profiled call stacks.
365 // TODO: Should this be a map (from Caller node) for more efficient lookup?
366 std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
367
368 // Returns true if we need to look at the callee edges for determining the
369 // node context ids and allocation type.
370 bool useCallerEdgesForContextInfo() const {
371 // Typically if the callee edges are empty either the caller edges are
372 // also empty, or this is an allocation (leaf node). However, if we are
373 // allowing recursive callsites and contexts this will be violated for
374 // incompletely cloned recursive cycles.
375 assert(!CalleeEdges.empty() || CallerEdges.empty() || IsAllocation ||
376 (AllowRecursiveCallsites && AllowRecursiveContexts));
377 // When cloning for a recursive context, during cloning we might be in the
378 // midst of cloning for a recurrence and have moved context ids off of a
379 // caller edge onto the clone but not yet off of the incoming caller
380 // (back) edge. If we don't look at those we miss the fact that this node
381 // still has context ids of interest.
382 return IsAllocation || CloneRecursiveContexts;
383 }
384
385 // Compute the context ids for this node from the union of its edge context
386 // ids.
387 DenseSet<uint32_t> getContextIds() const {
388 unsigned Count = 0;
389 // Compute the number of ids for reserve below. In general we only need to
390 // look at one set of edges, typically the callee edges, since other than
391 // allocations and in some cases during recursion cloning, all the context
392 // ids on the callers should also flow out via callee edges.
393 for (auto &Edge : CalleeEdges.empty() ? CallerEdges : CalleeEdges)
394 Count += Edge->getContextIds().size();
395 DenseSet<uint32_t> ContextIds;
396 ContextIds.reserve(Count);
397 auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
398 CalleeEdges, useCallerEdgesForContextInfo()
399 ? CallerEdges
400 : std::vector<std::shared_ptr<ContextEdge>>());
401 for (const auto &Edge : Edges)
402 ContextIds.insert_range(Edge->getContextIds());
403 return ContextIds;
404 }
405
406 // Compute the allocation type for this node from the OR of its edge
407 // allocation types.
408 uint8_t computeAllocType() const {
409 uint8_t BothTypes =
410 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
411 uint8_t AllocType = (uint8_t)AllocationType::None;
412 auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
413 CalleeEdges, useCallerEdgesForContextInfo()
414 ? CallerEdges
415 : std::vector<std::shared_ptr<ContextEdge>>());
416 for (const auto &Edge : Edges) {
417 AllocType |= Edge->AllocTypes;
418 // Bail early if alloc type reached both, no further refinement.
419 if (AllocType == BothTypes)
420 return AllocType;
421 }
422 return AllocType;
423 }
424
425 // The context ids set for this node is empty if its edge context ids are
426 // also all empty.
427 bool emptyContextIds() const {
428 auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
429 CalleeEdges, useCallerEdgesForContextInfo()
430 ? CallerEdges
431 : std::vector<std::shared_ptr<ContextEdge>>());
432 for (const auto &Edge : Edges) {
433 if (!Edge->getContextIds().empty())
434 return false;
435 }
436 return true;
437 }
438
439 // List of clones of this ContextNode, initially empty.
440 std::vector<ContextNode *> Clones;
441
442 // If a clone, points to the original uncloned node.
443 ContextNode *CloneOf = nullptr;
444
445 ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}
446
447 ContextNode(bool IsAllocation, CallInfo C)
448 : IsAllocation(IsAllocation), Call(C) {}
449
450 void addClone(ContextNode *Clone) {
451 if (CloneOf) {
452 CloneOf->Clones.push_back(Clone);
453 Clone->CloneOf = CloneOf;
454 } else {
455 Clones.push_back(Clone);
456 assert(!Clone->CloneOf);
457 Clone->CloneOf = this;
458 }
459 }
460
461 ContextNode *getOrigNode() {
462 if (!CloneOf)
463 return this;
464 return CloneOf;
465 }
466
467 void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
468 unsigned int ContextId);
469
470 ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
471 ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
472 void eraseCalleeEdge(const ContextEdge *Edge);
473 void eraseCallerEdge(const ContextEdge *Edge);
474
475 void setCall(CallInfo C) { Call = C; }
476
477 bool hasCall() const { return (bool)Call.call(); }
478
479 void printCall(raw_ostream &OS) const { Call.print(OS); }
480
481 // True if this node was effectively removed from the graph, in which case
482 // it should have an allocation type of None and empty context ids.
483 bool isRemoved() const {
484 // Typically if the callee edges are empty either the caller edges are
485 // also empty, or this is an allocation (leaf node). However, if we are
486 // allowing recursive callsites and contexts this will be violated for
487 // incompletely cloned recursive cycles.
488 assert(
489 (AllocTypes == (uint8_t)AllocationType::None) ==
490 emptyContextIds());
491 return AllocTypes == (uint8_t)AllocationType::None;
492 }
493
494 void dump() const;
495 void print(raw_ostream &OS) const;
496
497 friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
498 Node.print(OS);
499 return OS;
500 }
501 };
502
503 /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
504 /// callee.
505 struct ContextEdge {
506 ContextNode *Callee;
507 ContextNode *Caller;
508
509 // This will be formed by ORing together the AllocationType enum values
510 // for contexts including this edge.
511 uint8_t AllocTypes = 0;
512
513 // Set just before initiating cloning when cloning of recursive contexts is
514 // enabled. Used to defer cloning of backedges until we have done cloning of
515 // the callee node for non-backedge caller edges. This exposes cloning
516 // opportunities through the backedge of the cycle.
517 // TODO: Note that this is not updated during cloning, and it is unclear
518 // whether that would be needed.
519 bool IsBackedge = false;
520
521 // The set of IDs for contexts including this edge.
522 DenseSet<uint32_t> ContextIds;
523
524 ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
525 DenseSet<uint32_t> ContextIds)
526 : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
527 ContextIds(std::move(ContextIds)) {}
528
529 DenseSet<uint32_t> &getContextIds() { return ContextIds; }
530
531 // Helper to clear the fields of this edge when we are removing it from the
532 // graph.
533 inline void clear() {
534 ContextIds.clear();
535 AllocTypes = (uint8_t)AllocationType::None;
536 Caller = nullptr;
537 Callee = nullptr;
538 }
539
540 // Check if edge was removed from the graph. This is useful while iterating
541 // over a copy of edge lists when performing operations that mutate the
542 // graph in ways that might remove one of the edges.
543 inline bool isRemoved() const {
544 if (Callee || Caller)
545 return false;
546 // Any edges that have been removed from the graph but are still in a
547 // shared_ptr somewhere should have all fields null'ed out by clear()
548 // above.
549 assert(AllocTypes == (uint8_t)AllocationType::None);
550 assert(ContextIds.empty());
551 return true;
552 }
553
554 void dump() const;
555 void print(raw_ostream &OS) const;
556
557 friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
558 Edge.print(OS);
559 return OS;
560 }
561 };
562
563 /// Helpers to remove edges that have allocation type None (due to not
564 /// carrying any context ids) after transformations.
565 void removeNoneTypeCalleeEdges(ContextNode *Node);
566 void removeNoneTypeCallerEdges(ContextNode *Node);
567 void
568 recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
569 DenseSet<const ContextNode *> &Visited);
570
571protected:
572 /// Get a list of nodes corresponding to the stack ids in the given callsite
573 /// context.
574 template <class NodeT, class IteratorT>
575 std::vector<uint64_t>
576 getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);
577
578 /// Adds nodes for the given allocation and any stack ids on its memprof MIB
579 /// metadata (or summary).
580 ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);
581
582 /// Adds nodes for the given MIB stack ids.
583 template <class NodeT, class IteratorT>
584 void addStackNodesForMIB(ContextNode *AllocNode,
585 CallStack<NodeT, IteratorT> &StackContext,
586 CallStack<NodeT, IteratorT> &CallsiteContext,
587 AllocationType AllocType,
588 ArrayRef<ContextTotalSize> ContextSizeInfo);
589
590 /// Matches all callsite metadata (or summary) to the nodes created for
591 /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
592 /// inlining performed on those callsite instructions.
593 void updateStackNodes();
594
595 /// Update graph to conservatively handle any callsite stack nodes that target
596 /// multiple different callee target functions.
597 void handleCallsitesWithMultipleTargets();
598
599 /// Mark backedges via the standard DFS based backedge algorithm.
600 void markBackedges();
601
602 /// Merge clones generated during cloning for different allocations but that
603 /// are called by the same caller node, to ensure proper function assignment.
604 void mergeClones();
605
606 // Try to partition calls on the given node (already placed into the AllCalls
607 // array) by callee function, creating new copies of Node as needed to hold
608 // calls with different callees, and moving the callee edges appropriately.
609 // Returns true if partitioning was successful.
610 bool partitionCallsByCallee(
611 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
612 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode);
613
614 /// Save lists of calls with MemProf metadata in each function, for faster
615 /// iteration.
616 MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
617
618 /// Map from callsite node to the enclosing caller function.
619 std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
620
621 // When exporting to dot, and an allocation id is specified, contains the
622 // context ids on that allocation.
623 DenseSet<uint32_t> DotAllocContextIds;
624
625private:
626 using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;
627
628 // Structure to keep track of information for each call as we are matching
629 // non-allocation callsites onto context nodes created from the allocation
630 // call metadata / summary contexts.
631 struct CallContextInfo {
632 // The callsite we're trying to match.
633 CallTy Call;
634 // The callsites stack ids that have a context node in the graph.
635 std::vector<uint64_t> StackIds;
636 // The function containing this callsite.
637 const FuncTy *Func;
638 // Initially empty; if needed, this will be updated to contain the context
639 // ids for use in a new context node created for this callsite.
640 DenseSet<uint32_t> ContextIds;
641 };
642
643 /// Helper to remove edge from graph, updating edge iterator if it is provided
644 /// (in which case CalleeIter indicates which edge list is being iterated).
645 /// This will also perform the necessary clearing of the ContextEdge members
646 /// to enable later checking if the edge has been removed (since we may have
647 /// other copies of the shared_ptr in existence, and in fact rely on this to
648 /// enable removal while iterating over a copy of a node's edge list).
649 void removeEdgeFromGraph(ContextEdge *Edge, EdgeIter *EI = nullptr,
650 bool CalleeIter = true);
651
652 /// Assigns the given Node to calls at or inlined into the location with
653 /// the Node's stack id, after post order traversing and processing its
654 /// caller nodes. Uses the call information recorded in the given
655 /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
656 /// as needed. Called by updateStackNodes which sets up the given
657 /// StackIdToMatchingCalls map.
658 void assignStackNodesPostOrder(
659 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
660 DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls,
661 DenseMap<CallInfo, CallInfo> &CallToMatchingCall);
662
663 /// Duplicates the given set of context ids, updating the provided
664 /// map from each original id with the newly generated context ids,
665 /// and returning the new duplicated id set.
666 DenseSet<uint32_t> duplicateContextIds(
667 const DenseSet<uint32_t> &StackSequenceContextIds,
668 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
669
670 /// Propagates all duplicated context ids across the graph.
671 void propagateDuplicateContextIds(
672 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
673
674 /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
675 /// else to its callers. Also updates OrigNode's edges to remove any context
676 /// ids moved to the newly created edge.
677 void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
678 bool TowardsCallee,
679 DenseSet<uint32_t> RemainingContextIds);
680
681 /// Get the stack id corresponding to the given Id or Index (for IR this will
682 /// return itself, for a summary index this will return the id recorded in the
683 /// index for that stack id index value).
684 uint64_t getStackId(uint64_t IdOrIndex) const {
685 return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
686 }
687
688 /// Returns true if the given call targets the callee of the given edge, or if
689 /// we were able to identify the call chain through intermediate tail calls.
690 /// In the latter case new context nodes are added to the graph for the
691 /// identified tail calls, and their synthesized nodes are added to
692 /// TailCallToContextNodeMap. The EdgeIter is updated in the latter case for
693 /// the updated edges and to prepare it for an increment in the caller.
694 bool
695 calleesMatch(CallTy Call, EdgeIter &EI,
696 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);
697
698 // Return the callee function of the given call, or nullptr if it can't be
699 // determined
700 const FuncTy *getCalleeFunc(CallTy Call) {
701 return static_cast<DerivedCCG *>(this)->getCalleeFunc(Call);
702 }
703
704 /// Returns true if the given call targets the given function, or if we were
705 /// able to identify the call chain through intermediate tail calls (in which
706 /// case FoundCalleeChain will be populated).
707 bool calleeMatchesFunc(
708 CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
709 std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
710 return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
711 Call, Func, CallerFunc, FoundCalleeChain);
712 }
713
714 /// Returns true if both call instructions have the same callee.
715 bool sameCallee(CallTy Call1, CallTy Call2) {
716 return static_cast<DerivedCCG *>(this)->sameCallee(Call1, Call2);
717 }
718
719 /// Get a list of nodes corresponding to the stack ids in the given
720 /// callsite's context.
721 std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
722 return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
723 Call);
724 }
725
726 /// Get the last stack id in the context for callsite.
727 uint64_t getLastStackId(CallTy Call) {
728 return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
729 }
730
731 /// Update the allocation call to record type of allocated memory.
732 void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
733 AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
734 static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
735 }
736
737 /// Get the AllocationType assigned to the given allocation instruction clone.
738 AllocationType getAllocationCallType(const CallInfo &Call) const {
739 return static_cast<const DerivedCCG *>(this)->getAllocationCallType(Call);
740 }
741
742 /// Update non-allocation call to invoke (possibly cloned) function
743 /// CalleeFunc.
744 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
745 static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
746 }
747
748 /// Clone the given function for the given callsite, recording mapping of all
749 /// of the functions tracked calls to their new versions in the CallMap.
750 /// Assigns new clones to clone number CloneNo.
751 FuncInfo cloneFunctionForCallsite(
752 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
753 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
754 return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
755 Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
756 }
757
758 /// Gets a label to use in the dot graph for the given call clone in the given
759 /// function.
760 std::string getLabel(const FuncTy *Func, const CallTy Call,
761 unsigned CloneNo) const {
762 return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
763 }
764
765 // Create and return a new ContextNode.
766 ContextNode *createNewNode(bool IsAllocation, const FuncTy *F = nullptr,
767 CallInfo C = CallInfo()) {
768 NodeOwner.push_back(std::make_unique<ContextNode>(IsAllocation, C));
769 auto *NewNode = NodeOwner.back().get();
770 if (F)
771 NodeToCallingFunc[NewNode] = F;
772 NewNode->NodeId = NodeOwner.size();
773 return NewNode;
774 }
775
776 /// Helpers to find the node corresponding to the given call or stackid.
777 ContextNode *getNodeForInst(const CallInfo &C);
778 ContextNode *getNodeForAlloc(const CallInfo &C);
779 ContextNode *getNodeForStackId(uint64_t StackId);
780
781 /// Computes the alloc type corresponding to the given context ids, by
782 /// unioning their recorded alloc types.
783 uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds) const;
784
785 /// Returns the allocation type of the intersection of the contexts of two
786 /// nodes (based on their provided context id sets), optimized for the case
787 /// when Node1Ids is smaller than Node2Ids.
788 uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
789 const DenseSet<uint32_t> &Node2Ids) const;
790
791 /// Returns the allocation type of the intersection of the contexts of two
792 /// nodes (based on their provided context id sets).
793 uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
794 const DenseSet<uint32_t> &Node2Ids) const;
795
796 /// Create a clone of Edge's callee and move Edge to that new callee node,
797 /// performing the necessary context id and allocation type updates.
798 /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
799 /// moved to an edge to the new callee.
800 ContextNode *
801 moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
802 DenseSet<uint32_t> ContextIdsToMove = {});
803
804 /// Change the callee of Edge to existing callee clone NewCallee, performing
805 /// the necessary context id and allocation type updates.
806 /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
807 /// moved to an edge to the new callee.
808 void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
809 ContextNode *NewCallee,
810 bool NewClone = false,
811 DenseSet<uint32_t> ContextIdsToMove = {});
812
813 /// Change the caller of the edge at the given callee edge iterator to be
814 /// NewCaller, performing the necessary context id and allocation type
815 /// updates. This is similar to the above moveEdgeToExistingCalleeClone, but
816 /// a simplified version of it as we always move the given edge and all of its
817 /// context ids.
818 void moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
819 ContextNode *NewCaller);
820
821 /// Recursive helper for marking backedges via DFS.
822 void markBackedges(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
823 DenseSet<const ContextNode *> &CurrentStack);
824
825 /// Recursive helper for merging clones.
826 void
827 mergeClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
828 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode);
829 /// Main worker for merging callee clones for a given node.
830 void mergeNodeCalleeClones(
831 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
832 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode);
833 /// Helper to find other callers of the given set of callee edges that can
834 /// share the same callee merge node.
835 void findOtherCallersToShareMerge(
836 ContextNode *Node, std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges,
837 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode,
838 DenseSet<ContextNode *> &OtherCallersToShareMerge);
839
840 /// Recursively perform cloning on the graph for the given Node and its
841 /// callers, in order to uniquely identify the allocation behavior of an
842 /// allocation given its context. The context ids of the allocation being
843 /// processed are given in AllocContextIds.
844 void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
845 const DenseSet<uint32_t> &AllocContextIds);
846
847 /// Map from each context ID to the AllocationType assigned to that context.
848 DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;
849
850 /// Map from each contextID to the profiled full contexts and their total
851 /// sizes (there may be more than one due to context trimming),
852 /// optionally populated when requested (via MemProfReportHintedSizes or
853 /// MinClonedColdBytePercent).
854 DenseMap<uint32_t, std::vector<ContextTotalSize>> ContextIdToContextSizeInfos;
855
856 /// Identifies the context node created for a stack id when adding the MIB
857 /// contexts to the graph. This is used to locate the context nodes when
858 /// trying to assign the corresponding callsites with those stack ids to these
859 /// nodes.
860 DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;
861
862 /// Maps to track the calls to their corresponding nodes in the graph.
863 MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
864 MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;
865
866 /// Owner of all ContextNode unique_ptrs.
867 std::vector<std::unique_ptr<ContextNode>> NodeOwner;
868
869 /// Perform sanity checks on graph when requested.
870 void check() const;
871
872 /// Keeps track of the last unique context id assigned.
873 unsigned int LastContextId = 0;
874};
875
876template <typename DerivedCCG, typename FuncTy, typename CallTy>
877using ContextNode =
878 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
879template <typename DerivedCCG, typename FuncTy, typename CallTy>
880using ContextEdge =
881 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
882template <typename DerivedCCG, typename FuncTy, typename CallTy>
883using FuncInfo =
884 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
885template <typename DerivedCCG, typename FuncTy, typename CallTy>
886using CallInfo =
887 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;
888
889/// CRTP derived class for graphs built from IR (regular LTO).
890class ModuleCallsiteContextGraph
891 : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
892 Instruction *> {
893public:
894 ModuleCallsiteContextGraph(
895 Module &M,
896 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
897
898private:
899 friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
900 Instruction *>;
901
902 uint64_t getStackId(uint64_t IdOrIndex) const;
903 const Function *getCalleeFunc(Instruction *Call);
904 bool calleeMatchesFunc(
905 Instruction *Call, const Function *Func, const Function *CallerFunc,
906 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
907 bool sameCallee(Instruction *Call1, Instruction *Call2);
908 bool findProfiledCalleeThroughTailCalls(
909 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
910 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
911 bool &FoundMultipleCalleeChains);
912 uint64_t getLastStackId(Instruction *Call);
913 std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
914 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
915 AllocationType getAllocationCallType(const CallInfo &Call) const;
916 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
917 CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
918 Instruction *>::FuncInfo
919 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
920 DenseMap<CallInfo, CallInfo> &CallMap,
921 std::vector<CallInfo> &CallsWithMetadataInFunc,
922 unsigned CloneNo);
923 std::string getLabel(const Function *Func, const Instruction *Call,
924 unsigned CloneNo) const;
925
926 const Module &Mod;
927 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
928};
929
930/// Represents a call in the summary index graph, which can either be an
931/// allocation or an interior callsite node in an allocation's context.
932/// Holds a pointer to the corresponding data structure in the index.
933struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
934 IndexCall() : PointerUnion() {}
935 IndexCall(std::nullptr_t) : IndexCall() {}
936 IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
937 IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
938 IndexCall(PointerUnion PT) : PointerUnion(PT) {}
939
940 IndexCall *operator->() { return this; }
941
942 void print(raw_ostream &OS) const {
943 PointerUnion<CallsiteInfo *, AllocInfo *> Base = *this;
944 if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(Base)) {
945 OS << *AI;
946 } else {
947 auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(Base);
948 assert(CI);
949 OS << *CI;
950 }
951 }
952};
953} // namespace
954
955namespace llvm {
956template <> struct simplify_type<IndexCall> {
957 using SimpleType = PointerUnion<CallsiteInfo *, AllocInfo *>;
958 static SimpleType getSimplifiedValue(IndexCall &Val) { return Val; }
959};
960template <> struct simplify_type<const IndexCall> {
961 using SimpleType = const PointerUnion<CallsiteInfo *, AllocInfo *>;
962 static SimpleType getSimplifiedValue(const IndexCall &Val) { return Val; }
963};
964} // namespace llvm
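// With the simplify_type specializations above, the cast machinery can look
// through an IndexCall to the underlying PointerUnion, roughly as
// IndexCall::print() above distinguishes the two cases (a sketch;
// visitIndexCall is not a function in this file):
//
//   void visitIndexCall(IndexCall &Call) {
//     if (auto *AI = dyn_cast_if_present<AllocInfo *>(Call))
//       ; // allocation summary node
//     else if (auto *CI = dyn_cast_if_present<CallsiteInfo *>(Call))
//       ; // interior callsite summary node
//   }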
965
966namespace {
967/// CRTP derived class for graphs built from summary index (ThinLTO).
968class IndexCallsiteContextGraph
969 : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
970 IndexCall> {
971public:
972 IndexCallsiteContextGraph(
973 ModuleSummaryIndex &Index,
974 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
975 isPrevailing);
976
977 ~IndexCallsiteContextGraph() {
978 // Now that we are done with the graph it is safe to add the new
979 // CallsiteInfo structs to the function summary vectors. The graph nodes
980 // point into locations within these vectors, so we don't want to add them
981 // any earlier.
982 for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
983 auto *FS = I.first;
984 for (auto &Callsite : I.second)
985 FS->addCallsite(*Callsite.second);
986 }
987 }
988
989private:
990 friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
991 IndexCall>;
992
993 uint64_t getStackId(uint64_t IdOrIndex) const;
994 const FunctionSummary *getCalleeFunc(IndexCall &Call);
995 bool calleeMatchesFunc(
996 IndexCall &Call, const FunctionSummary *Func,
997 const FunctionSummary *CallerFunc,
998 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
999 bool sameCallee(IndexCall &Call1, IndexCall &Call2);
1000 bool findProfiledCalleeThroughTailCalls(
1001 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
1002 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
1003 bool &FoundMultipleCalleeChains);
1004 uint64_t getLastStackId(IndexCall &Call);
1005 std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
1006 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
1007 AllocationType getAllocationCallType(const CallInfo &Call) const;
1008 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
1009 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
1010 IndexCall>::FuncInfo
1011 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
1012 DenseMap<CallInfo, CallInfo> &CallMap,
1013 std::vector<CallInfo> &CallsWithMetadataInFunc,
1014 unsigned CloneNo);
1015 std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
1016 unsigned CloneNo) const;
1017
1018 // Saves mapping from function summaries containing memprof records back to
1019 // their VI, for use in checking and debugging.
1020 std::map<const FunctionSummary *, ValueInfo> FSToVIMap;
1021
1022 const ModuleSummaryIndex &Index;
1023 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
1024 isPrevailing;
1025
1026 // Saves/owns the callsite info structures synthesized for missing tail call
1027 // frames that we discover while building the graph.
1028 // It maps from the summary of the function making the tail call, to a map
1029 // of callee ValueInfo to corresponding synthesized callsite info.
1030 std::unordered_map<FunctionSummary *,
1031 std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
1032 FunctionCalleesToSynthesizedCallsiteInfos;
1033};
1034} // namespace
1035
1036template <>
1037struct llvm::DenseMapInfo<typename CallsiteContextGraph<
1038 ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
1039 : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
1040template <>
1041struct llvm::DenseMapInfo<typename CallsiteContextGraph<
1042 IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
1043 : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
1044template <>
1045struct llvm::DenseMapInfo<IndexCall>
1046 : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
1047
1048namespace {
1049
1050// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
1051// type we should actually use on the corresponding allocation.
1052// If we can't clone a node that has NotCold+Cold alloc type, we will fall
1053// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
1054// from NotCold.
1055AllocationType allocTypeToUse(uint8_t AllocTypes) {
1056 assert(AllocTypes != (uint8_t)AllocationType::None);
1057 if (AllocTypes ==
1058 ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold))
1059 return AllocationType::NotCold;
1060 else
1061 return (AllocationType)AllocTypes;
1062}
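// For example (per the comment above): allocTypeToUse((uint8_t)AllocationType::Cold)
// yields AllocationType::Cold, while the ambiguous value
// (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold falls back
// to AllocationType::NotCold.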
1063
1064// Helper to check if the alloc types for all edges recorded in the
1065// InAllocTypes vector match the alloc types for all edges in the Edges
1066// vector.
1067template <typename DerivedCCG, typename FuncTy, typename CallTy>
1068bool allocTypesMatch(
1069 const std::vector<uint8_t> &InAllocTypes,
1070 const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
1071 &Edges) {
1072 // This should be called only when the InAllocTypes vector was computed for
1073 // this set of Edges. Make sure the sizes are the same.
1074 assert(InAllocTypes.size() == Edges.size());
1075 return std::equal(
1076 InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(), Edges.end(),
1077 [](const uint8_t &l,
1078 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
1079 // Can share if one of the edges is None type - don't
1080 // care about the type along that edge as it doesn't
1081 // exist for those context ids.
1082 if (l == (uint8_t)AllocationType::None ||
1083 r->AllocTypes == (uint8_t)AllocationType::None)
1084 return true;
1085 return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
1086 });
1087}
1088
1089// Helper to check if the alloc types for all edges recorded in the
1090// InAllocTypes vector match the alloc types for callee edges in the given
1091// clone. Because the InAllocTypes were computed from the original node's callee
1092// edges, and other cloning could have happened after this clone was created, we
1093// need to find the matching clone callee edge, which may or may not exist.
1094template <typename DerivedCCG, typename FuncTy, typename CallTy>
1095bool allocTypesMatchClone(
1096 const std::vector<uint8_t> &InAllocTypes,
1097 const ContextNode<DerivedCCG, FuncTy, CallTy> *Clone) {
1098 const ContextNode<DerivedCCG, FuncTy, CallTy> *Node = Clone->CloneOf;
1099 assert(Node);
1100 // InAllocTypes should have been computed for the original node's callee
1101 // edges.
1102 assert(InAllocTypes.size() == Node->CalleeEdges.size());
1103 // First create a map of the clone callee edge callees to the edge alloc type.
1104 DenseMap<const ContextNode<DerivedCCG, FuncTy, CallTy> *, uint8_t>
1105 EdgeCalleeMap;
1106 for (const auto &E : Clone->CalleeEdges) {
1107 assert(!EdgeCalleeMap.contains(E->Callee));
1108 EdgeCalleeMap[E->Callee] = E->AllocTypes;
1109 }
1110 // Next, walk the original node's callees, and look for the corresponding
1111 // clone edge to that callee.
1112 for (unsigned I = 0; I < Node->CalleeEdges.size(); I++) {
1113 auto Iter = EdgeCalleeMap.find(Node->CalleeEdges[I]->Callee);
1114 // Not found is ok, we will simply add an edge if we use this clone.
1115 if (Iter == EdgeCalleeMap.end())
1116 continue;
1117 // Can share if one of the edges is None type - don't
1118 // care about the type along that edge as it doesn't
1119 // exist for those context ids.
1120 if (InAllocTypes[I] == (uint8_t)AllocationType::None ||
1121 Iter->second == (uint8_t)AllocationType::None)
1122 continue;
1123 if (allocTypeToUse(Iter->second) != allocTypeToUse(InAllocTypes[I]))
1124 return false;
1125 }
1126 return true;
1127}
1128
1129} // end anonymous namespace
1130
1131template <typename DerivedCCG, typename FuncTy, typename CallTy>
1132typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1133CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
1134 const CallInfo &C) {
1135 ContextNode *Node = getNodeForAlloc(C);
1136 if (Node)
1137 return Node;
1138
1139 return NonAllocationCallToContextNodeMap.lookup(C);
1140}
1141
1142template <typename DerivedCCG, typename FuncTy, typename CallTy>
1143typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1144CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
1145 const CallInfo &C) {
1146 return AllocationCallToContextNodeMap.lookup(C);
1147}
1148
1149template <typename DerivedCCG, typename FuncTy, typename CallTy>
1150typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1151CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
1152 uint64_t StackId) {
1153 auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
1154 if (StackEntryNode != StackEntryIdToContextNodeMap.end())
1155 return StackEntryNode->second;
1156 return nullptr;
1157}
1158
1159template <typename DerivedCCG, typename FuncTy, typename CallTy>
1160void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1161 addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
1162 unsigned int ContextId) {
1163 for (auto &Edge : CallerEdges) {
1164 if (Edge->Caller == Caller) {
1165 Edge->AllocTypes |= (uint8_t)AllocType;
1166 Edge->getContextIds().insert(ContextId);
1167 return;
1168 }
1169 }
1170 std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
1171 this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
1172 CallerEdges.push_back(Edge);
1173 Caller->CalleeEdges.push_back(Edge);
1174}
1175
1176template <typename DerivedCCG, typename FuncTy, typename CallTy>
1177void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::removeEdgeFromGraph(
1178 ContextEdge *Edge, EdgeIter *EI, bool CalleeIter) {
1179 assert(!EI || (*EI)->get() == Edge);
1180 assert(!Edge->isRemoved());
1181 // Save the Caller and Callee pointers so we can erase Edge from their edge
1182 // lists after clearing Edge below. We do the clearing first in case it is
1183 // destructed after removing from the edge lists (if those were the last
1184 // shared_ptr references to Edge).
1185 auto *Callee = Edge->Callee;
1186 auto *Caller = Edge->Caller;
1187
1188 // Make sure the edge fields are cleared out so we can properly detect
1189 // removed edges if Edge is not destructed because there is still a shared_ptr
1190 // reference.
1191 Edge->clear();
1192
1193#ifndef NDEBUG
1194 auto CalleeCallerCount = Callee->CallerEdges.size();
1195 auto CallerCalleeCount = Caller->CalleeEdges.size();
1196#endif
1197 if (!EI) {
1198 Callee->eraseCallerEdge(Edge);
1199 Caller->eraseCalleeEdge(Edge);
1200 } else if (CalleeIter) {
1201 Callee->eraseCallerEdge(Edge);
1202 *EI = Caller->CalleeEdges.erase(*EI);
1203 } else {
1204 Caller->eraseCalleeEdge(Edge);
1205 *EI = Callee->CallerEdges.erase(*EI);
1206 }
1207 assert(Callee->CallerEdges.size() < CalleeCallerCount);
1208 assert(Caller->CalleeEdges.size() < CallerCalleeCount);
1209}
1210
1211template <typename DerivedCCG, typename FuncTy, typename CallTy>
1212void CallsiteContextGraph<
1213 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
1214 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
1215 auto Edge = *EI;
1216 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1217 assert(Edge->ContextIds.empty());
1218 removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
1219 } else
1220 ++EI;
1221 }
1222}
1223
1224template <typename DerivedCCG, typename FuncTy, typename CallTy>
1225void CallsiteContextGraph<
1226 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCallerEdges(ContextNode *Node) {
1227 for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
1228 auto Edge = *EI;
1229 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1230 assert(Edge->ContextIds.empty());
1231 Edge->Caller->eraseCalleeEdge(Edge.get());
1232 EI = Node->CallerEdges.erase(EI);
1233 } else
1234 ++EI;
1235 }
1236}
1237
1238template <typename DerivedCCG, typename FuncTy, typename CallTy>
1239typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1240CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1241 findEdgeFromCallee(const ContextNode *Callee) {
1242 for (const auto &Edge : CalleeEdges)
1243 if (Edge->Callee == Callee)
1244 return Edge.get();
1245 return nullptr;
1246}
1247
1248template <typename DerivedCCG, typename FuncTy, typename CallTy>
1249typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1250CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1251 findEdgeFromCaller(const ContextNode *Caller) {
1252 for (const auto &Edge : CallerEdges)
1253 if (Edge->Caller == Caller)
1254 return Edge.get();
1255 return nullptr;
1256}
1257
1258template <typename DerivedCCG, typename FuncTy, typename CallTy>
1259void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1260 eraseCalleeEdge(const ContextEdge *Edge) {
1261 auto EI = llvm::find_if(
1262 CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
1263 return CalleeEdge.get() == Edge;
1264 });
1265 assert(EI != CalleeEdges.end());
1266 CalleeEdges.erase(EI);
1267}
1268
1269template <typename DerivedCCG, typename FuncTy, typename CallTy>
1270void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1271 eraseCallerEdge(const ContextEdge *Edge) {
1272 auto EI = llvm::find_if(
1273 CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
1274 return CallerEdge.get() == Edge;
1275 });
1276 assert(EI != CallerEdges.end());
1277 CallerEdges.erase(EI);
1278}
1279
1280template <typename DerivedCCG, typename FuncTy, typename CallTy>
1281uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
1282 DenseSet<uint32_t> &ContextIds) const {
1283 uint8_t BothTypes =
1284 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1285 uint8_t AllocType = (uint8_t)AllocationType::None;
1286 for (auto Id : ContextIds) {
1287 AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
1288 // Bail early if alloc type reached both, no further refinement.
1289 if (AllocType == BothTypes)
1290 return AllocType;
1291 }
1292 return AllocType;
1293}
1294
1295template <typename DerivedCCG, typename FuncTy, typename CallTy>
1296uint8_t
1297CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
1298 const DenseSet<uint32_t> &Node1Ids,
1299 const DenseSet<uint32_t> &Node2Ids) const {
1300 uint8_t BothTypes =
1301 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1302 uint8_t AllocType = (uint8_t)AllocationType::None;
1303 for (auto Id : Node1Ids) {
1304 if (!Node2Ids.count(Id))
1305 continue;
1306 AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
1307 // Bail early if alloc type reached both, no further refinement.
1308 if (AllocType == BothTypes)
1309 return AllocType;
1310 }
1311 return AllocType;
1312}
1313
1314template <typename DerivedCCG, typename FuncTy, typename CallTy>
1315uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
1316 const DenseSet<uint32_t> &Node1Ids,
1317 const DenseSet<uint32_t> &Node2Ids) const {
1318 if (Node1Ids.size() < Node2Ids.size())
1319 return intersectAllocTypesImpl(Node1Ids, Node2Ids);
1320 else
1321 return intersectAllocTypesImpl(Node2Ids, Node1Ids);
1322}
1323
1324template <typename DerivedCCG, typename FuncTy, typename CallTy>
1325typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1326CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
1327 CallInfo Call, const FuncTy *F) {
1328 assert(!getNodeForAlloc(Call));
1329 ContextNode *AllocNode = createNewNode(/*IsAllocation=*/true, F, Call);
1330 AllocationCallToContextNodeMap[Call] = AllocNode;
1331 // Use LastContextId as a unique id for MIB allocation nodes.
1332 AllocNode->OrigStackOrAllocId = LastContextId;
1333 // Alloc type should be updated as we add in the MIBs. We should assert
1334 // afterwards that it is not still None.
1335 AllocNode->AllocTypes = (uint8_t)AllocationType::None;
1336
1337 return AllocNode;
1338}
1339
1340static std::string getAllocTypeString(uint8_t AllocTypes) {
1341 if (!AllocTypes)
1342 return "None";
1343 std::string Str;
1344 if (AllocTypes & (uint8_t)AllocationType::NotCold)
1345 Str += "NotCold";
1346 if (AllocTypes & (uint8_t)AllocationType::Cold)
1347 Str += "Cold";
1348 return Str;
1349}
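// E.g. getAllocTypeString((uint8_t)AllocationType::NotCold |
//                         (uint8_t)AllocationType::Cold) returns "NotColdCold",
// the form that shows up for ambiguous nodes in dumps and dot output.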
1350
1351template <typename DerivedCCG, typename FuncTy, typename CallTy>
1352template <class NodeT, class IteratorT>
1353void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
1354 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
1355 CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
1356 ArrayRef<ContextTotalSize> ContextSizeInfo) {
1357 // Treat the hot alloc type as NotCold until the disambiguation for "hot"
1358 // is done.
1359 if (AllocType == AllocationType::Hot)
1360 AllocType = AllocationType::NotCold;
1361
1362 ContextIdToAllocationType[++LastContextId] = AllocType;
1363
1364 if (!ContextSizeInfo.empty()) {
1365 auto &Entry = ContextIdToContextSizeInfos[LastContextId];
1366 Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end());
1367 }
1368
1369 // Update alloc type and context ids for this MIB.
1370 AllocNode->AllocTypes |= (uint8_t)AllocType;
1371
1372 // Now add or update nodes for each stack id in alloc's context.
1373 // Later when processing the stack ids on non-alloc callsites we will adjust
1374 // for any inlining in the context.
1375 ContextNode *PrevNode = AllocNode;
1376 // Look for recursion (direct recursion should have been collapsed by
1377 // module summary analysis, here we should just be detecting mutual
1378 // recursion). Mark these nodes so we don't try to clone.
1379 SmallSet<uint64_t, 8> StackIdSet;
1380 // Skip any on the allocation call (inlining).
1381 for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
1382 ContextIter != StackContext.end(); ++ContextIter) {
1383 auto StackId = getStackId(*ContextIter);
1384 ContextNode *StackNode = getNodeForStackId(StackId);
1385 if (!StackNode) {
1386 StackNode = createNewNode(/*IsAllocation=*/false);
1387 StackEntryIdToContextNodeMap[StackId] = StackNode;
1388 StackNode->OrigStackOrAllocId = StackId;
1389 }
1390 // Marking a node recursive will prevent its cloning completely, even for
1391 // non-recursive contexts flowing through it.
1392 if (!AllowRecursiveCallsites) {
1393 auto Ins = StackIdSet.insert(StackId);
1394 if (!Ins.second)
1395 StackNode->Recursive = true;
1396 }
1397 StackNode->AllocTypes |= (uint8_t)AllocType;
1398 PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
1399 PrevNode = StackNode;
1400 }
1401}
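// Editor's note: a small sketch (not part of the pass) of the mutual-recursion
// detection in addStackNodesForMIB above: a stack id that repeats within one
// MIB context marks the corresponding node Recursive, excluding it from
// cloning. Ids are invented for illustration.
[[maybe_unused]] static void recursionDetectionSketch() {
  SmallSet<uint64_t, 8> StackIdSet;
  const uint64_t Context[] = {0x1234, 0x5678, 0x1234}; // 0x1234 repeats.
  bool SawRepeatedId = false;
  for (uint64_t StackId : Context)
    if (!StackIdSet.insert(StackId).second)
      SawRepeatedId = true; // Would set StackNode->Recursive = true above.
  assert(SawRepeatedId);
}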
1402
1403template <typename DerivedCCG, typename FuncTy, typename CallTy>
1404DenseSet<uint32_t>
1405CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
1406 const DenseSet<uint32_t> &StackSequenceContextIds,
1407 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1408 DenseSet<uint32_t> NewContextIds;
1409 for (auto OldId : StackSequenceContextIds) {
1410 NewContextIds.insert(++LastContextId);
1411 OldToNewContextIds[OldId].insert(LastContextId);
1412 assert(ContextIdToAllocationType.count(OldId));
1413 // The new context has the same allocation type as original.
1414 ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
1415 if (DotAllocContextIds.contains(OldId))
1416 DotAllocContextIds.insert(LastContextId);
1417 }
1418 return NewContextIds;
1419}
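// Editor's note: a minimal sketch (not part of the pass) of the bookkeeping
// duplicateContextIds performs, with invented ids and a local counter standing
// in for LastContextId: each old id gets a fresh id, and the old->new mapping
// is recorded so propagateDuplicateContextIds can replay it along caller edges.
[[maybe_unused]] static void duplicateContextIdsSketch() {
  uint32_t NextContextId = 10;
  DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
  DenseSet<uint32_t> NewContextIds;
  for (uint32_t OldId : {5u, 9u}) {
    NewContextIds.insert(++NextContextId);
    OldToNewContextIds[OldId].insert(NextContextId);
  }
  assert(NewContextIds.size() == 2);
  assert(OldToNewContextIds[5].contains(11) && OldToNewContextIds[9].contains(12));
}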
1420
1421template <typename DerivedCCG, typename FuncTy, typename CallTy>
1422void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1423 propagateDuplicateContextIds(
1424 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1425 // Build a set of duplicated context ids corresponding to the input id set.
1426 auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
1427 DenseSet<uint32_t> NewIds;
1428 for (auto Id : ContextIds)
1429 if (auto NewId = OldToNewContextIds.find(Id);
1430 NewId != OldToNewContextIds.end())
1431 NewIds.insert_range(NewId->second);
1432 return NewIds;
1433 };
1434
1435 // Recursively update context ids sets along caller edges.
1436 auto UpdateCallers = [&](ContextNode *Node,
1437 DenseSet<const ContextEdge *> &Visited,
1438 auto &&UpdateCallers) -> void {
1439 for (const auto &Edge : Node->CallerEdges) {
1440 auto Inserted = Visited.insert(Edge.get());
1441 if (!Inserted.second)
1442 continue;
1443 ContextNode *NextNode = Edge->Caller;
1444 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
1445 // Only need to recursively iterate to NextNode via this caller edge if
1446 // it resulted in any added ids to NextNode.
1447 if (!NewIdsToAdd.empty()) {
1448 Edge->getContextIds().insert_range(NewIdsToAdd);
1449 UpdateCallers(NextNode, Visited, UpdateCallers);
1450 }
1451 }
1452 };
1453
1454 DenseSet<const ContextEdge *> Visited;
1455 for (auto &Entry : AllocationCallToContextNodeMap) {
1456 auto *Node = Entry.second;
1457 UpdateCallers(Node, Visited, UpdateCallers);
1458 }
1459}
1460
1461template <typename DerivedCCG, typename FuncTy, typename CallTy>
1462void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
1463 ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee,
1464 // This must be passed by value to make a copy since it will be adjusted
1465 // as ids are moved.
1466 DenseSet<uint32_t> RemainingContextIds) {
1467 auto &OrigEdges =
1468 TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
1469 DenseSet<uint32_t> RecursiveContextIds;
1470 DenseSet<uint32_t> AllCallerContextIds;
1471 if (AllowRecursiveCallsites) {
1472 // Identify which context ids are recursive, which is needed to properly
1473 // update the RemainingContextIds set. The relevant recursive context ids
1474 // are those that are in multiple edges.
1475 for (auto &CE : OrigEdges) {
1476 AllCallerContextIds.reserve(CE->getContextIds().size());
1477 for (auto Id : CE->getContextIds())
1478 if (!AllCallerContextIds.insert(Id).second)
1479 RecursiveContextIds.insert(Id);
1480 }
1481 }
1482 // Increment iterator in loop so that we can remove edges as needed.
1483 for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
1484 auto Edge = *EI;
1485 DenseSet<uint32_t> NewEdgeContextIds;
1486 DenseSet<uint32_t> NotFoundContextIds;
1487 // Remove any matching context ids from Edge, returning the set that were
1488 // found and removed; these become the new edge's context ids. Also update
1489 // the remaining (not found) ids.
1490 set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
1491 NotFoundContextIds);
1492 // Update the remaining context ids set for the later edges. This is a
1493 // compile time optimization.
1494 if (RecursiveContextIds.empty()) {
1495 // No recursive ids, so all of the previously remaining context ids that
1496 // were not seen on this edge are the new remaining set.
1497 RemainingContextIds.swap(NotFoundContextIds);
1498 } else {
1499 // Keep the recursive ids in the remaining set as we expect to see those
1500 // on another edge. We can remove the non-recursive remaining ids that
1501 // were seen on this edge, however. We already have the set of remaining
1502 // ids that were on this edge (in NewEdgeContextIds). Figure out which are
1503 // non-recursive and only remove those. Note that despite the higher
1504 // overhead of updating the remaining context ids set when recursion
1505 // handling is enabled, it was found to be at worst performance neutral
1506 // and in one case a clear win.
1507 DenseSet<uint32_t> NonRecursiveRemainingCurEdgeIds =
1508 set_difference(NewEdgeContextIds, RecursiveContextIds);
1509 set_subtract(RemainingContextIds, NonRecursiveRemainingCurEdgeIds);
1510 }
1511 // If no matching context ids for this edge, skip it.
1512 if (NewEdgeContextIds.empty()) {
1513 ++EI;
1514 continue;
1515 }
1516 if (TowardsCallee) {
1517 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1518 auto NewEdge = std::make_shared<ContextEdge>(
1519 Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds));
1520 NewNode->CalleeEdges.push_back(NewEdge);
1521 NewEdge->Callee->CallerEdges.push_back(NewEdge);
1522 } else {
1523 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1524 auto NewEdge = std::make_shared<ContextEdge>(
1525 NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds));
1526 NewNode->CallerEdges.push_back(NewEdge);
1527 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
1528 }
1529 // Remove old edge if context ids empty.
1530 if (Edge->getContextIds().empty()) {
1531 removeEdgeFromGraph(Edge.get(), &EI, TowardsCallee);
1532 continue;
1533 }
1534 ++EI;
1535 }
1536}
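// Editor's note: a minimal sketch (not part of the pass) of the four-argument
// set_subtract used in connectNewNode above, with invented context ids. Given
// edge ids {1,2,3} and remaining ids {2,3,4}: the original edge keeps {1}, the
// new edge receives the intersection {2,3}, and {4} stays "not found" so later
// edges can still claim it.
[[maybe_unused]] static void connectNewNodeSetSubtractSketch() {
  DenseSet<uint32_t> EdgeContextIds = {1, 2, 3};
  DenseSet<uint32_t> RemainingContextIds = {2, 3, 4};
  DenseSet<uint32_t> NewEdgeContextIds, NotFoundContextIds;
  set_subtract(EdgeContextIds, RemainingContextIds, NewEdgeContextIds,
               NotFoundContextIds);
  assert(EdgeContextIds.size() == 1 && EdgeContextIds.contains(1));
  assert(NewEdgeContextIds.size() == 2);
  assert(NotFoundContextIds.size() == 1 && NotFoundContextIds.contains(4));
}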
1537
1538template <typename DerivedCCG, typename FuncTy, typename CallTy>
1539static void checkEdge(
1540 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
1541 // Confirm that alloc type is not None and that we have at least one context
1542 // id.
1543 assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
1544 assert(!Edge->ContextIds.empty());
1545}
1546
1547template <typename DerivedCCG, typename FuncTy, typename CallTy>
1548static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
1549 bool CheckEdges = true) {
1550 if (Node->isRemoved())
1551 return;
1552#ifndef NDEBUG
1553 // Compute node's context ids once for use in asserts.
1554 auto NodeContextIds = Node->getContextIds();
1555#endif
1556 // Node's context ids should be the union of both its callee and caller edge
1557 // context ids.
1558 if (Node->CallerEdges.size()) {
1559 DenseSet<uint32_t> CallerEdgeContextIds(
1560 Node->CallerEdges.front()->ContextIds);
1561 for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) {
1562 if (CheckEdges)
1563 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1564 set_union(CallerEdgeContextIds, Edge->ContextIds);
1565 }
1566 // Node can have more context ids than callers if some contexts terminate at
1567 // node and some are longer. If we are allowing recursive callsites and
1568 // contexts this will be violated for incompletely cloned recursive cycles,
1569 // so skip the checking in that case.
1570 assert(AllowRecursiveCallsites || AllowRecursiveContexts ||
1571 NodeContextIds == CallerEdgeContextIds ||
1572 set_is_subset(CallerEdgeContextIds, NodeContextIds));
1573 }
1574 if (Node->CalleeEdges.size()) {
1575 DenseSet<uint32_t> CalleeEdgeContextIds(
1576 Node->CalleeEdges.front()->ContextIds);
1577 for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) {
1578 if (CheckEdges)
1579 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1580 set_union(CalleeEdgeContextIds, Edge->getContextIds());
1581 }
1582 // If we are allowing recursive callsites and contexts this will be violated
1583 // for incompletely cloned recursive cycles, so skip the checking in that
1584 // case.
1585 assert(AllowRecursiveCallsites || AllowRecursiveContexts ||
1586 NodeContextIds == CalleeEdgeContextIds);
1587 }
1588 // FIXME: Since this checking is only invoked under an option, we should
1589 // change the error checking from using assert to something that will trigger
1590 // an error on a release build.
1591#ifndef NDEBUG
1592 // Make sure we don't end up with duplicate edges between the same caller and
1593 // callee.
1594 DenseSet<const ContextNode<DerivedCCG, FuncTy, CallTy> *> NodeSet;
1595 for (const auto &E : Node->CalleeEdges)
1596 NodeSet.insert(E->Callee);
1597 assert(NodeSet.size() == Node->CalleeEdges.size());
1598#endif
1599}
1600
1601template <typename DerivedCCG, typename FuncTy, typename CallTy>
1602void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1603 assignStackNodesPostOrder(
1604 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
1605 DenseMap<uint64_t, std::vector<CallContextInfo>>
1606 &StackIdToMatchingCalls,
1607 DenseMap<CallInfo, CallInfo> &CallToMatchingCall) {
1608 auto Inserted = Visited.insert(Node);
1609 if (!Inserted.second)
1610 return;
1611 // Post order traversal. Iterate over a copy since we may add nodes and
1612 // therefore new callers during the recursive call, invalidating any
1613 // iterator over the original edge vector. We don't need to process these
1614 // new nodes as they were already processed on creation.
1615 auto CallerEdges = Node->CallerEdges;
1616 for (auto &Edge : CallerEdges) {
1617 // Skip any that have been removed during the recursion.
1618 if (Edge->isRemoved()) {
1619 assert(!is_contained(Node->CallerEdges, Edge));
1620 continue;
1621 }
1622 assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls,
1623 CallToMatchingCall);
1624 }
1625
1626 // If this node's stack id is in the map, update the graph to contain new
1627 // nodes representing any inlining at interior callsites. Note we move the
1628 // associated context ids over to the new nodes.
1629
1630 // Ignore this node if it is for an allocation or we didn't record any
1631 // stack id lists ending at it.
1632 if (Node->IsAllocation ||
1633 !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
1634 return;
1635
1636 auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
1637 // Handle the simple case first. A single call with a single stack id.
1638 // In this case there is no need to create any new context nodes, simply
1639 // assign the context node for stack id to this Call.
1640 if (Calls.size() == 1) {
1641 auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
1642 if (Ids.size() == 1) {
1643 assert(SavedContextIds.empty());
1644 // It should be this Node
1645 assert(Node == getNodeForStackId(Ids[0]));
1646 if (Node->Recursive)
1647 return;
1648 Node->setCall(Call);
1649 NonAllocationCallToContextNodeMap[Call] = Node;
1650 NodeToCallingFunc[Node] = Func;
1651 return;
1652 }
1653 }
1654
1655#ifndef NDEBUG
1656 // Find the node for the last stack id, which should be the same
1657 // across all calls recorded for this id, and is this node's id.
1658 uint64_t LastId = Node->OrigStackOrAllocId;
1659 ContextNode *LastNode = getNodeForStackId(LastId);
1660 // We should only have kept stack ids that had nodes.
1661 assert(LastNode);
1662 assert(LastNode == Node);
1663#else
1664 ContextNode *LastNode = Node;
1665#endif
1666
1667 // Compute the last node's context ids once, as it is shared by all calls in
1668 // this entry.
1669 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1670
1671 [[maybe_unused]] bool PrevIterCreatedNode = false;
1672 bool CreatedNode = false;
1673 for (unsigned I = 0; I < Calls.size();
1674 I++, PrevIterCreatedNode = CreatedNode) {
1675 CreatedNode = false;
1676 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1677 // Skip any for which we didn't assign any ids, these don't get a node in
1678 // the graph.
1679 if (SavedContextIds.empty()) {
1680 // If this call has a matching call (located in the same function and
1681 // having the same stack ids), simply add it to the context node created
1682 // for its matching call earlier. These can be treated the same through
1683 // cloning and get updated at the same time.
1684 if (!CallToMatchingCall.contains(Call))
1685 continue;
1686 auto MatchingCall = CallToMatchingCall[Call];
1687 if (!NonAllocationCallToContextNodeMap.contains(MatchingCall)) {
1688 // This should only happen if we had a prior iteration, and it didn't
1689 // create a node because of the below recomputation of context ids
1690 // finding none remaining and continuing early.
1691 assert(I > 0 && !PrevIterCreatedNode);
1692 continue;
1693 }
1694 NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back(
1695 Call);
1696 continue;
1697 }
1698
1699 assert(LastId == Ids.back());
1700
1701 // Recompute the context ids for this stack id sequence (the
1702 // intersection of the context ids of the corresponding nodes).
1703 // Start with the ids we saved in the map for this call, which could be
1704 // duplicated context ids. We have to recompute as we might have overlap
1705 // between the saved context ids for different last nodes, and
1706 // removed them already during the post order traversal.
1707 set_intersect(SavedContextIds, LastNodeContextIds);
1708 ContextNode *PrevNode = LastNode;
1709 bool Skip = false;
1710 // Iterate backwards through the stack Ids, starting after the last Id
1711 // in the list, which was handled once outside for all Calls.
1712 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1713 auto Id = *IdIter;
1714 ContextNode *CurNode = getNodeForStackId(Id);
1715 // We should only have kept stack ids that had nodes and weren't
1716 // recursive.
1717 assert(CurNode);
1718 assert(!CurNode->Recursive);
1719
1720 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1721 if (!Edge) {
1722 Skip = true;
1723 break;
1724 }
1725 PrevNode = CurNode;
1726
1727 // Update the context ids, which is the intersection of the ids along
1728 // all edges in the sequence.
1729 set_intersect(SavedContextIds, Edge->getContextIds());
1730
1731 // If we now have no context ids for clone, skip this call.
1732 if (SavedContextIds.empty()) {
1733 Skip = true;
1734 break;
1735 }
1736 }
1737 if (Skip)
1738 continue;
1739
1740 // Create new context node.
1741 ContextNode *NewNode = createNewNode(/*IsAllocation=*/false, Func, Call);
1742 NonAllocationCallToContextNodeMap[Call] = NewNode;
1743 CreatedNode = true;
1744 NewNode->AllocTypes = computeAllocType(SavedContextIds);
1745
1746 ContextNode *FirstNode = getNodeForStackId(Ids[0]);
1747 assert(FirstNode);
1748
1749 // Connect to callees of innermost stack frame in inlined call chain.
1750 // This updates context ids for FirstNode's callees to reflect those
1751 // moved to NewNode.
1752 connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true, SavedContextIds);
1753
1754 // Connect to callers of outermost stack frame in inlined call chain.
1755 // This updates context ids for LastNode's callers to reflect those
1756 // moved to NewNode.
1757 connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false, SavedContextIds);
1758
1759 // Now we need to remove context ids from edges/nodes between First and
1760 // Last Node.
1761 PrevNode = nullptr;
1762 for (auto Id : Ids) {
1763 ContextNode *CurNode = getNodeForStackId(Id);
1764 // We should only have kept stack ids that had nodes.
1765 assert(CurNode);
1766
1767 // Remove the context ids moved to NewNode from CurNode, and the
1768 // edge from the prior node.
1769 if (PrevNode) {
1770 auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
1771 // If the sequence contained recursion, we might have already removed
1772 // some edges during the connectNewNode calls above.
1773 if (!PrevEdge) {
1774 PrevNode = CurNode;
1775 continue;
1776 }
1777 set_subtract(PrevEdge->getContextIds(), SavedContextIds);
1778 if (PrevEdge->getContextIds().empty())
1779 removeEdgeFromGraph(PrevEdge);
1780 }
1781 // Since we update the edges from leaf to tail, only look at the callee
1782 // edges. This isn't an alloc node, so if there are no callee edges, the
1783 // alloc type is None.
1784 CurNode->AllocTypes = CurNode->CalleeEdges.empty()
1785 ? (uint8_t)AllocationType::None
1786 : CurNode->computeAllocType();
1787 PrevNode = CurNode;
1788 }
1789 if (VerifyNodes) {
1790 checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true);
1791 for (auto Id : Ids) {
1792 ContextNode *CurNode = getNodeForStackId(Id);
1793 // We should only have kept stack ids that had nodes.
1794 assert(CurNode);
1795 checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true);
1796 }
1797 }
1798 }
1799}
1800
1801template <typename DerivedCCG, typename FuncTy, typename CallTy>
1802void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
1803 // Map of stack id to all calls with that as the last (outermost caller)
1804 // callsite id that has a context node (some might not due to pruning
1805 // performed during matching of the allocation profile contexts).
1806 // The CallContextInfo contains the Call and a list of its stack ids with
1807 // ContextNodes, the function containing Call, and the set of context ids
1808 // the analysis will eventually identify for use in any new node created
1809 // for that callsite.
1810 DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
1811 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
1812 for (auto &Call : CallsWithMetadata) {
1813 // Ignore allocations, already handled.
1814 if (AllocationCallToContextNodeMap.count(Call))
1815 continue;
1816 auto StackIdsWithContextNodes =
1817 getStackIdsWithContextNodesForCall(Call.call());
1818 // If there were no nodes created for MIBs on allocs (maybe this was in
1819 // the unambiguous part of the MIB stack that was pruned), ignore.
1820 if (StackIdsWithContextNodes.empty())
1821 continue;
1822 // Otherwise, record this Call along with the list of ids for the last
1823 // (outermost caller) stack id with a node.
1824 StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
1825 {Call.call(), StackIdsWithContextNodes, Func, {}});
1826 }
1827 }
1828
1829 // First make a pass through all stack ids that correspond to a call,
1830 // as identified in the above loop. Compute the context ids corresponding to
1831 // each of these calls when they correspond to multiple stack ids due to
1832 // inlining. Perform any duplication of context ids required when
1833 // there is more than one call with the same stack ids. Their (possibly newly
1834 // duplicated) context ids are saved in the StackIdToMatchingCalls map.
1835 DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
1836 // Save a map from each call to any that are found to match it. I.e. located
1837 // in the same function and have the same (possibly pruned) stack ids. We use
1838 // this to avoid creating extra graph nodes as they can be treated the same.
1839 DenseMap<CallInfo, CallInfo> CallToMatchingCall;
1840 for (auto &It : StackIdToMatchingCalls) {
1841 auto &Calls = It.getSecond();
1842 // Skip single calls with a single stack id. These don't need a new node.
1843 if (Calls.size() == 1) {
1844 auto &Ids = Calls[0].StackIds;
1845 if (Ids.size() == 1)
1846 continue;
1847 }
1848 // In order to do the best and maximal matching of inlined calls to context
1849 // node sequences we will sort the vectors of stack ids in descending order
1850 // of length, and within each length, lexicographically by stack id. The
1851 // latter is so that we can specially handle calls that have identical stack
1852 // id sequences (either due to cloning or artificially because of the MIB
1853 // context pruning). Those with the same Ids are then sorted by function to
1854 // facilitate efficiently mapping them to the same context node.
1855 // Because the functions are pointers, to ensure a stable sort first assign
1856 // each function pointer to its first index in the Calls array, and then use
1857 // that to sort by.
1858 DenseMap<const FuncTy *, unsigned> FuncToIndex;
1859 for (const auto &[Idx, CallCtxInfo] : enumerate(Calls))
1860 FuncToIndex.insert({CallCtxInfo.Func, Idx});
1861 llvm::stable_sort(
1862 Calls,
1863 [&FuncToIndex](const CallContextInfo &A, const CallContextInfo &B) {
1864 return A.StackIds.size() > B.StackIds.size() ||
1865 (A.StackIds.size() == B.StackIds.size() &&
1866 (A.StackIds < B.StackIds ||
1867 (A.StackIds == B.StackIds &&
1868 FuncToIndex[A.Func] < FuncToIndex[B.Func])));
1869 });
1870
1871 // Find the node for the last stack id, which should be the same
1872 // across all calls recorded for this id, and is the id for this
1873 // entry in the StackIdToMatchingCalls map.
1874 uint64_t LastId = It.getFirst();
1875 ContextNode *LastNode = getNodeForStackId(LastId);
1876 // We should only have kept stack ids that had nodes.
1877 assert(LastNode);
1878
1879 if (LastNode->Recursive)
1880 continue;
1881
1882 // Initialize the context ids with the last node's. We will subsequently
1883 // refine the context ids by computing the intersection along all edges.
1884 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1885 assert(!LastNodeContextIds.empty());
1886
1887#ifndef NDEBUG
1888 // Save the set of functions seen for a particular set of the same stack
1889 // ids. This is used to ensure that they have been correctly sorted to be
1890 // adjacent in the Calls list, since we rely on that to efficiently place
1891 // all such matching calls onto the same context node.
1892 DenseSet<const FuncTy *> MatchingIdsFuncSet;
1893#endif
1894
1895 for (unsigned I = 0; I < Calls.size(); I++) {
1896 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1897 assert(SavedContextIds.empty());
1898 assert(LastId == Ids.back());
1899
1900#ifndef NDEBUG
1901 // If this call has a different set of ids than the last one, clear the
1902 // set used to ensure they are sorted properly.
1903 if (I > 0 && Ids != Calls[I - 1].StackIds)
1904 MatchingIdsFuncSet.clear();
1905#endif
1906
1907 // First compute the context ids for this stack id sequence (the
1908 // intersection of the context ids of the corresponding nodes).
1909 // Start with the remaining saved ids for the last node.
1910 assert(!LastNodeContextIds.empty());
1911 DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
1912
1913 ContextNode *PrevNode = LastNode;
1914 ContextNode *CurNode = LastNode;
1915 bool Skip = false;
1916
1917 // Iterate backwards through the stack Ids, starting after the last Id
1918 // in the list, which was handled once outside for all Calls.
1919 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1920 auto Id = *IdIter;
1921 CurNode = getNodeForStackId(Id);
1922 // We should only have kept stack ids that had nodes.
1923 assert(CurNode);
1924
1925 if (CurNode->Recursive) {
1926 Skip = true;
1927 break;
1928 }
1929
1930 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1931 // If there is no edge then the nodes belong to different MIB contexts,
1932 // and we should skip this inlined context sequence. For example, this
1933 // particular inlined context may include stack ids A->B, and we may
1934 // indeed have nodes for both A and B, but it is possible that they were
1935 // never profiled in sequence in a single MIB for any allocation (i.e.
1936 // we might have profiled an allocation that involves the callsite A,
1937 // but through a different one of its callee callsites, and we might
1938 // have profiled an allocation that involves callsite B, but reached
1939 // from a different caller callsite).
1940 if (!Edge) {
1941 Skip = true;
1942 break;
1943 }
1944 PrevNode = CurNode;
1945
1946 // Update the context ids, which is the intersection of the ids along
1947 // all edges in the sequence.
1948 set_intersect(StackSequenceContextIds, Edge->getContextIds());
1949
1950 // If we now have no context ids for clone, skip this call.
1951 if (StackSequenceContextIds.empty()) {
1952 Skip = true;
1953 break;
1954 }
1955 }
1956 if (Skip)
1957 continue;
1958
1959 // If some of this call's stack ids did not have corresponding nodes (due
1960 // to pruning), don't include any context ids for contexts that extend
1961 // beyond these nodes. Otherwise we would be matching part of unrelated /
1962 // not fully matching stack contexts. To do this, subtract any context ids
1963 // found in caller nodes of the last node found above.
1964 if (Ids.back() != getLastStackId(Call)) {
1965 for (const auto &PE : LastNode->CallerEdges) {
1966 set_subtract(StackSequenceContextIds, PE->getContextIds());
1967 if (StackSequenceContextIds.empty())
1968 break;
1969 }
1970 // If we now have no context ids for clone, skip this call.
1971 if (StackSequenceContextIds.empty())
1972 continue;
1973 }
1974
1975#ifndef NDEBUG
1976 // If the prior call had the same stack ids this set would not be empty.
1977 // Check if we already have a call that "matches" because it is located
1978 // in the same function. If the Calls list was sorted properly we should
1979 // not encounter this situation as all such entries should be adjacent
1980 // and processed in bulk further below.
1981 assert(!MatchingIdsFuncSet.contains(Func));
1982
1983 MatchingIdsFuncSet.insert(Func);
1984#endif
1985
1986 // Check if the next set of stack ids is the same (since the Calls vector
1987 // of tuples is sorted by the stack ids we can just look at the next one).
1988 // If so, save them in the CallToMatchingCall map so that they get
1989 // assigned to the same context node, and skip them.
1990 bool DuplicateContextIds = false;
1991 for (unsigned J = I + 1; J < Calls.size(); J++) {
1992 auto &CallCtxInfo = Calls[J];
1993 auto &NextIds = CallCtxInfo.StackIds;
1994 if (NextIds != Ids)
1995 break;
1996 auto *NextFunc = CallCtxInfo.Func;
1997 if (NextFunc != Func) {
1998 // We have another Call with the same ids but that cannot share this
1999 // node, must duplicate ids for it.
2000 DuplicateContextIds = true;
2001 break;
2002 }
2003 auto &NextCall = CallCtxInfo.Call;
2004 CallToMatchingCall[NextCall] = Call;
2005 // Update I so that it gets incremented correctly to skip this call.
2006 I = J;
2007 }
2008
2009 // If we don't have duplicate context ids, then we can assign all the
2010 // context ids computed for the original node sequence to this call.
2011 // If there are duplicate calls with the same stack ids then we synthesize
2012 // new context ids that are duplicates of the originals. These are
2013 // assigned to SavedContextIds, which is a reference into the map entry
2014 // for this call, allowing us to access these ids later on.
2015 OldToNewContextIds.reserve(OldToNewContextIds.size() +
2016 StackSequenceContextIds.size());
2017 SavedContextIds =
2018 DuplicateContextIds
2019 ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
2020 : StackSequenceContextIds;
2021 assert(!SavedContextIds.empty());
2022
2023 if (!DuplicateContextIds) {
2024 // Update saved last node's context ids to remove those that are
2025 // assigned to other calls, so that it is ready for the next call at
2026 // this stack id.
2027 set_subtract(LastNodeContextIds, StackSequenceContextIds);
2028 if (LastNodeContextIds.empty())
2029 break;
2030 }
2031 }
2032 }
2033
2034 // Propagate the duplicate context ids over the graph.
2035 propagateDuplicateContextIds(OldToNewContextIds);
2036
2037 if (VerifyCCG)
2038 check();
2039
2040 // Now perform a post-order traversal over the graph, starting with the
2041 // allocation nodes, essentially processing nodes from callers to callees.
2042 // For any that contains an id in the map, update the graph to contain new
2043 // nodes representing any inlining at interior callsites. Note we move the
2044 // associated context ids over to the new nodes.
2045 DenseSet<const ContextNode *> Visited;
2046 for (auto &Entry : AllocationCallToContextNodeMap)
2047 assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls,
2048 CallToMatchingCall);
2049 if (VerifyCCG)
2050 check();
2051}
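// Editor's note: an illustrative sketch (not part of the pass) of the ordering
// the llvm::stable_sort call in updateStackNodes imposes, using plain vectors
// of invented stack ids instead of CallContextInfo entries: longer inlined
// sequences sort first, and equal-length sequences sort lexicographically so
// that identical sequences become adjacent and can share one context node.
[[maybe_unused]] static void callSortOrderSketch() {
  std::vector<std::vector<uint64_t>> StackIdLists = {{7}, {3, 7}, {2, 7}, {3, 7}};
  llvm::stable_sort(StackIdLists, [](const std::vector<uint64_t> &A,
                                     const std::vector<uint64_t> &B) {
    return A.size() > B.size() || (A.size() == B.size() && A < B);
  });
  // Sorted order: {2,7}, {3,7}, {3,7}, {7}. The two identical {3,7} entries
  // are adjacent, mirroring how calls with identical stack ids are handled.
  assert(StackIdLists.front().front() == 2 && StackIdLists.back().size() == 1);
}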
2052
2053uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
2054 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
2055 Call->getMetadata(LLVMContext::MD_callsite));
2056 return CallsiteContext.back();
2057}
2058
2059uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
2061 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
2062 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
2063 // Need to convert index into stack id.
2064 return Index.getStackIdAtIndex(CallsiteContext.back());
2065}
2066
2067static const std::string MemProfCloneSuffix = ".memprof.";
2068
2069static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
2070 // We use CloneNo == 0 to refer to the original version, which doesn't get
2071 // renamed with a suffix.
2072 if (!CloneNo)
2073 return Base.str();
2074 return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
2075}
2076
2077static bool isMemProfClone(const Function &F) {
2078 return F.getName().contains(MemProfCloneSuffix);
2079}
2080
2081// Return the clone number of the given function by extracting it from the
2082// memprof suffix. Assumes the caller has already confirmed it is a memprof
2083// clone.
2084static unsigned getMemProfCloneNum(const Function &F) {
2086 auto Pos = F.getName().find_last_of('.');
2087 assert(Pos > 0);
2088 unsigned CloneNo;
2089 bool Err = F.getName().drop_front(Pos + 1).getAsInteger(10, CloneNo);
2090 assert(!Err);
2091 (void)Err;
2092 return CloneNo;
2093}
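// Editor's note: a short usage sketch (not part of the pass) of the clone
// naming helpers above. Clone number 0 keeps the original name, while later
// clones get the ".memprof.<N>" suffix that isMemProfClone and
// getMemProfCloneNum recognize. The name "foo" is invented for illustration.
[[maybe_unused]] static void cloneNamingSketch() {
  assert(getMemProfFuncName("foo", 0) == "foo");
  assert(getMemProfFuncName("foo", 2) == "foo.memprof.2");
}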
2094
2095std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
2096 const Instruction *Call,
2097 unsigned CloneNo) const {
2098 return (Twine(Call->getFunction()->getName()) + " -> " +
2099 cast<CallBase>(Call)->getCalledFunction()->getName())
2100 .str();
2101}
2102
2103std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
2104 const IndexCall &Call,
2105 unsigned CloneNo) const {
2106 auto VI = FSToVIMap.find(Func);
2107 assert(VI != FSToVIMap.end());
2108 std::string CallerName = getMemProfFuncName(VI->second.name(), CloneNo);
2109 if (isa<AllocInfo *>(Call))
2110 return CallerName + " -> alloc";
2111 else {
2112 auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call);
2113 return CallerName + " -> " +
2114 getMemProfFuncName(Callsite->Callee.name(),
2115 Callsite->Clones[CloneNo]);
2116 }
2117}
2118
2119std::vector<uint64_t>
2120ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
2121 Instruction *Call) {
2122 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
2123 Call->getMetadata(LLVMContext::MD_callsite));
2124 return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
2125 CallsiteContext);
2126}
2127
2128std::vector<uint64_t>
2129IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
2131 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
2132 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
2133 return getStackIdsWithContextNodes<CallsiteInfo,
2134 SmallVector<unsigned>::const_iterator>(
2135 CallsiteContext);
2136}
2137
2138template <typename DerivedCCG, typename FuncTy, typename CallTy>
2139template <class NodeT, class IteratorT>
2140std::vector<uint64_t>
2141CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
2142 CallStack<NodeT, IteratorT> &CallsiteContext) {
2143 std::vector<uint64_t> StackIds;
2144 for (auto IdOrIndex : CallsiteContext) {
2145 auto StackId = getStackId(IdOrIndex);
2146 ContextNode *Node = getNodeForStackId(StackId);
2147 if (!Node)
2148 break;
2149 StackIds.push_back(StackId);
2150 }
2151 return StackIds;
2152}
2153
2154ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
2155 Module &M,
2156 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
2157 : Mod(M), OREGetter(OREGetter) {
2158 for (auto &F : M) {
2159 std::vector<CallInfo> CallsWithMetadata;
2160 for (auto &BB : F) {
2161 for (auto &I : BB) {
2162 if (!isa<CallBase>(I))
2163 continue;
2164 if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
2165 CallsWithMetadata.push_back(&I);
2166 auto *AllocNode = addAllocNode(&I, &F);
2167 auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
2168 assert(CallsiteMD);
2169 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
2170 // Add all of the MIBs and their stack nodes.
2171 for (auto &MDOp : MemProfMD->operands()) {
2172 auto *MIBMD = cast<const MDNode>(MDOp);
2173 std::vector<ContextTotalSize> ContextSizeInfo;
2174 // Collect the context size information if it exists.
2175 if (MIBMD->getNumOperands() > 2) {
2176 for (unsigned I = 2; I < MIBMD->getNumOperands(); I++) {
2177 MDNode *ContextSizePair =
2178 dyn_cast<MDNode>(MIBMD->getOperand(I));
2179 assert(ContextSizePair->getNumOperands() == 2);
2180 uint64_t FullStackId = mdconst::dyn_extract<ConstantInt>(
2181 ContextSizePair->getOperand(0))
2182 ->getZExtValue();
2183 uint64_t TotalSize = mdconst::dyn_extract<ConstantInt>(
2184 ContextSizePair->getOperand(1))
2185 ->getZExtValue();
2186 ContextSizeInfo.push_back({FullStackId, TotalSize});
2187 }
2188 }
2189 MDNode *StackNode = getMIBStackNode(MIBMD);
2190 assert(StackNode);
2191 CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
2192 addStackNodesForMIB<MDNode, MDNode::op_iterator>(
2193 AllocNode, StackContext, CallsiteContext,
2194 getMIBAllocType(MIBMD), ContextSizeInfo);
2195 }
2196 // If exporting the graph to dot and an allocation id of interest was
2197 // specified, record all the context ids for this allocation node.
2198 if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
2199 DotAllocContextIds = AllocNode->getContextIds();
2200 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2201 // Memprof and callsite metadata on memory allocations no longer
2202 // needed.
2203 I.setMetadata(LLVMContext::MD_memprof, nullptr);
2204 I.setMetadata(LLVMContext::MD_callsite, nullptr);
2205 }
2206 // For callsite metadata, add to list for this function for later use.
2207 else if (I.getMetadata(LLVMContext::MD_callsite)) {
2208 CallsWithMetadata.push_back(&I);
2209 }
2210 }
2211 }
2212 if (!CallsWithMetadata.empty())
2213 FuncToCallsWithMetadata[&F] = CallsWithMetadata;
2214 }
2215
2216 if (DumpCCG) {
2217 dbgs() << "CCG before updating call stack chains:\n";
2218 dbgs() << *this;
2219 }
2220
2221 if (ExportToDot)
2222 exportToDot("prestackupdate");
2223
2224 updateStackNodes();
2225
2226 if (ExportToDot)
2227 exportToDot("poststackupdate");
2228
2229 handleCallsitesWithMultipleTargets();
2230
2231 markBackedges();
2232
2233 // Strip off remaining callsite metadata, no longer needed.
2234 for (auto &FuncEntry : FuncToCallsWithMetadata)
2235 for (auto &Call : FuncEntry.second)
2236 Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
2237}
2238
2239IndexCallsiteContextGraph::IndexCallsiteContextGraph(
2240 ModuleSummaryIndex &Index,
2241 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
2242 isPrevailing)
2243 : Index(Index), isPrevailing(isPrevailing) {
2244 for (auto &I : Index) {
2245 auto VI = Index.getValueInfo(I);
2246 for (auto &S : VI.getSummaryList()) {
2247 // We should only add the prevailing nodes. Otherwise we may try to clone
2248 // in a weak copy that won't be linked (and may be different than the
2249 // prevailing version).
2250 // We only keep the memprof summary on the prevailing copy now when
2251 // building the combined index, as a space optimization, however don't
2252 // rely on this optimization. The linker doesn't resolve local linkage
2253 // values so don't check whether those are prevailing.
2254 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2255 !isPrevailing(VI.getGUID(), S.get()))
2256 continue;
2257 auto *FS = dyn_cast<FunctionSummary>(S.get());
2258 if (!FS)
2259 continue;
2260 std::vector<CallInfo> CallsWithMetadata;
2261 if (!FS->allocs().empty()) {
2262 for (auto &AN : FS->mutableAllocs()) {
2263 // This can happen because of recursion elimination handling that
2264 // currently exists in ModuleSummaryAnalysis. Skip these for now.
2265 // We still added them to the summary because we need to be able to
2266 // correlate properly in applyImport in the backends.
2267 if (AN.MIBs.empty())
2268 continue;
2269 IndexCall AllocCall(&AN);
2270 CallsWithMetadata.push_back(AllocCall);
2271 auto *AllocNode = addAllocNode(AllocCall, FS);
2272 // Pass an empty CallStack to the CallsiteContext (second)
2273 // parameter, since for ThinLTO we already collapsed out the inlined
2274 // stack ids on the allocation call during ModuleSummaryAnalysis.
2275 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
2276 EmptyContext;
2277 unsigned I = 0;
2278 assert(AN.ContextSizeInfos.empty() ||
2279 AN.ContextSizeInfos.size() == AN.MIBs.size());
2280 // Now add all of the MIBs and their stack nodes.
2281 for (auto &MIB : AN.MIBs) {
2282 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
2283 StackContext(&MIB);
2284 std::vector<ContextTotalSize> ContextSizeInfo;
2285 if (!AN.ContextSizeInfos.empty()) {
2286 for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I])
2287 ContextSizeInfo.push_back({FullStackId, TotalSize});
2288 }
2289 addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
2290 AllocNode, StackContext, EmptyContext, MIB.AllocType,
2291 ContextSizeInfo);
2292 I++;
2293 }
2294 // If exporting the graph to dot and an allocation id of interest was
2295 // specified, record all the context ids for this allocation node.
2296 if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
2297 DotAllocContextIds = AllocNode->getContextIds();
2298 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2299 // Initialize version 0 on the summary alloc node to the current alloc
2300 // type, unless it has both types, in which case make it the default, so
2301 // that if we aren't able to clone, the original version always ends up
2302 // with the default allocation behavior.
2303 AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
2304 }
2305 }
2306 // For callsite metadata, add to list for this function for later use.
2307 if (!FS->callsites().empty())
2308 for (auto &SN : FS->mutableCallsites()) {
2309 IndexCall StackNodeCall(&SN);
2310 CallsWithMetadata.push_back(StackNodeCall);
2311 }
2312
2313 if (!CallsWithMetadata.empty())
2314 FuncToCallsWithMetadata[FS] = CallsWithMetadata;
2315
2316 if (!FS->allocs().empty() || !FS->callsites().empty())
2317 FSToVIMap[FS] = VI;
2318 }
2319 }
2320
2321 if (DumpCCG) {
2322 dbgs() << "CCG before updating call stack chains:\n";
2323 dbgs() << *this;
2324 }
2325
2326 if (ExportToDot)
2327 exportToDot("prestackupdate");
2328
2329 updateStackNodes();
2330
2331 if (ExportToDot)
2332 exportToDot("poststackupdate");
2333
2334 handleCallsitesWithMultipleTargets();
2335
2336 markBackedges();
2337}
2338
2339template <typename DerivedCCG, typename FuncTy, typename CallTy>
2340void CallsiteContextGraph<DerivedCCG, FuncTy,
2341 CallTy>::handleCallsitesWithMultipleTargets() {
2342 // Look for and workaround callsites that call multiple functions.
2343 // This can happen for indirect calls, which need better handling, and in
2344 // rarer cases (e.g. macro expansion).
2345 // TODO: To fix this for indirect calls we will want to perform speculative
2346 // devirtualization using either the normal PGO info with ICP, or using the
2347 // information in the profiled MemProf contexts. We can do this prior to
2348 // this transformation for regular LTO, and for ThinLTO we can simulate that
2349 // effect in the summary and perform the actual speculative devirtualization
2350 // while cloning in the ThinLTO backend.
2351
2352 // Keep track of the new nodes synthesized for discovered tail calls missing
2353 // from the profiled contexts.
2354 MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;
2355
2356 std::vector<std::pair<CallInfo, ContextNode *>> NewCallToNode;
2357 for (auto &Entry : NonAllocationCallToContextNodeMap) {
2358 auto *Node = Entry.second;
2359 assert(Node->Clones.empty());
2360 // Check all node callees and see if in the same function.
2361 // We need to check all of the calls recorded in this Node, because in some
2362 // cases we may have had multiple calls with the same debug info calling
2363 // different callees. This can happen, for example, when an object is
2364 // constructed in the parameter list - the destructor call of the object has
2365 // the same debug info (line/col) as the call the object was passed to.
2366 // Here we will prune any that don't match all callee nodes.
2367 std::vector<CallInfo> AllCalls;
2368 AllCalls.reserve(Node->MatchingCalls.size() + 1);
2369 AllCalls.push_back(Node->Call);
2370 llvm::append_range(AllCalls, Node->MatchingCalls);
2371
2372 // First see if we can partition the calls by callee function, creating new
2373 // nodes to host each set of calls calling the same callees. This is
2374 // necessary to support indirect calls with ThinLTO, for which we
2375 // synthesized CallsiteInfo records for each target. They will all have the
2376 // same callsite stack ids and would be sharing a context node at this
2377 // point. We need to perform separate cloning for each, which will be
2378 // applied along with speculative devirtualization in the ThinLTO backends
2379 // as needed. Note this does not currently support looking through tail
2380 // calls, it is unclear if we need that for indirect call targets.
2381 // First partition calls by callee func. Map indexed by func, value is
2382 // struct with list of matching calls, assigned node.
2383 if (partitionCallsByCallee(Node, AllCalls, NewCallToNode))
2384 continue;
2385
2386 auto It = AllCalls.begin();
2387 // Iterate through the calls until we find the first that matches.
2388 for (; It != AllCalls.end(); ++It) {
2389 auto ThisCall = *It;
2390 bool Match = true;
2391 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();
2392 ++EI) {
2393 auto Edge = *EI;
2394 if (!Edge->Callee->hasCall())
2395 continue;
2396 assert(NodeToCallingFunc.count(Edge->Callee));
2397 // Check if the called function matches that of the callee node.
2398 if (!calleesMatch(ThisCall.call(), EI, TailCallToContextNodeMap)) {
2399 Match = false;
2400 break;
2401 }
2402 }
2403 // Found a call that matches the callee nodes, we can quit now.
2404 if (Match) {
2405 // If the first match is not the primary call on the Node, update it
2406 // now. We will update the list of matching calls further below.
2407 if (Node->Call != ThisCall) {
2408 Node->setCall(ThisCall);
2409 // We need to update the NonAllocationCallToContextNodeMap, but don't
2410 // want to do this during iteration over that map, so save the calls
2411 // that need updated entries.
2412 NewCallToNode.push_back({ThisCall, Node});
2413 }
2414 break;
2415 }
2416 }
2417 // We will update this list below (or leave it cleared if there was no
2418 // match found above).
2419 Node->MatchingCalls.clear();
2420 // If we hit the end of the AllCalls vector, no call matching the callee
2421 // nodes was found, clear the call information in the node.
2422 if (It == AllCalls.end()) {
2423 RemovedEdgesWithMismatchedCallees++;
2424 // Work around by setting Node to have a null call, so it gets
2425 // skipped during cloning. Otherwise assignFunctions will assert
2426 // because its data structures are not designed to handle this case.
2427 Node->setCall(CallInfo());
2428 continue;
2429 }
2430 // Now add back any matching calls that call the same function as the
2431 // matching primary call on Node.
2432 for (++It; It != AllCalls.end(); ++It) {
2433 auto ThisCall = *It;
2434 if (!sameCallee(Node->Call.call(), ThisCall.call()))
2435 continue;
2436 Node->MatchingCalls.push_back(ThisCall);
2437 }
2438 }
2439
2440 // Remove all mismatched nodes identified in the above loop from the node map
2441 // (checking whether they have a null call which is set above). For a
2442 // MapVector like NonAllocationCallToContextNodeMap it is much more efficient
2443 // to do the removal via remove_if than by individually erasing entries above.
2444 // Also remove any entries if we updated the node's primary call above.
2445 NonAllocationCallToContextNodeMap.remove_if([](const auto &it) {
2446 return !it.second->hasCall() || it.second->Call != it.first;
2447 });
2448
2449 // Add entries for any new primary calls recorded above.
2450 for (auto &[Call, Node] : NewCallToNode)
2451 NonAllocationCallToContextNodeMap[Call] = Node;
2452
2453 // Add the new nodes after the above loop so that the iteration is not
2454 // invalidated.
2455 for (auto &[Call, Node] : TailCallToContextNodeMap)
2456 NonAllocationCallToContextNodeMap[Call] = Node;
2457}
2458
2459template <typename DerivedCCG, typename FuncTy, typename CallTy>
2460bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::partitionCallsByCallee(
2461 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
2462 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode) {
2463 // Struct to keep track of all the calls having the same callee function,
2464 // and the node we eventually assign to them. Eventually we will record the
2465 // context node assigned to this group of calls.
2466 struct CallsWithSameCallee {
2467 std::vector<CallInfo> Calls;
2468 ContextNode *Node = nullptr;
2469 };
2470
2471 // First partition calls by callee function. Build map from each function
2472 // to the list of matching calls.
2473 DenseMap<const FuncTy *, CallsWithSameCallee> CalleeFuncToCallInfo;
2474 for (auto ThisCall : AllCalls) {
2475 auto *F = getCalleeFunc(ThisCall.call());
2476 if (F)
2477 CalleeFuncToCallInfo[F].Calls.push_back(ThisCall);
2478 }
2479
2480 // Next, walk through all callee edges. For each callee node, get its
2481 // containing function and see if it was recorded in the above map (meaning we
2482 // have at least one matching call). Build another map from each callee node
2483 // with a matching call to the structure instance created above containing all
2484 // the calls.
2485 DenseMap<ContextNode *, CallsWithSameCallee *> CalleeNodeToCallInfo;
2486 for (const auto &Edge : Node->CalleeEdges) {
2487 if (!Edge->Callee->hasCall())
2488 continue;
2489 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2490 if (CalleeFuncToCallInfo.contains(ProfiledCalleeFunc))
2491 CalleeNodeToCallInfo[Edge->Callee] =
2492 &CalleeFuncToCallInfo[ProfiledCalleeFunc];
2493 }
2494
2495 // If there are no entries in the second map, then there were no matching
2496 // calls/callees; nothing to do here. Return so we can go to the handling that
2497 // looks through tail calls.
2498 if (CalleeNodeToCallInfo.empty())
2499 return false;
2500
2501 // Walk through all callee edges again. Any and all callee edges that didn't
2502 // match any calls (callee not in the CalleeNodeToCallInfo map) are moved to a
2503 // new caller node (UnmatchedCalleesNode) which gets a null call so that it is
2504 // ignored during cloning. If it is in the map, then we use the node recorded
2505 // in that entry (creating it if needed), and move the callee edge to it.
2506 // The first callee will use the original node instead of creating a new one.
2507 // Note that any of the original calls on this node (in AllCalls) that didn't
2508 // have a callee function automatically get dropped from the node as part of
2509 // this process.
2510 ContextNode *UnmatchedCalleesNode = nullptr;
2511 // Track whether we already assigned original node to a callee.
2512 bool UsedOrigNode = false;
2513 assert(NodeToCallingFunc[Node]);
2514 // Iterate over a copy of Node's callee edges, since we may need to remove
2515 // edges in moveCalleeEdgeToNewCaller, and this simplifies the handling and
2516 // makes it less error-prone.
2517 auto CalleeEdges = Node->CalleeEdges;
2518 for (auto &Edge : CalleeEdges) {
2519 if (!Edge->Callee->hasCall())
2520 continue;
2521
2522 // Will be updated below to point to whatever (caller) node this callee edge
2523 // should be moved to.
2524 ContextNode *CallerNodeToUse = nullptr;
2525
2526 // Handle the case where there were no matching calls first. Move this
2527 // callee edge to the UnmatchedCalleesNode, creating it if needed.
2528 if (!CalleeNodeToCallInfo.contains(Edge->Callee)) {
2529 if (!UnmatchedCalleesNode)
2530 UnmatchedCalleesNode =
2531 createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2532 CallerNodeToUse = UnmatchedCalleesNode;
2533 } else {
2534 // Look up the information recorded for this callee node, and use the
2535 // recorded caller node (creating it if needed).
2536 auto *Info = CalleeNodeToCallInfo[Edge->Callee];
2537 if (!Info->Node) {
2538 // If we haven't assigned any callees to the original node use it.
2539 if (!UsedOrigNode) {
2540 Info->Node = Node;
2541 // Clear the set of matching calls which will be updated below.
2542 Node->MatchingCalls.clear();
2543 UsedOrigNode = true;
2544 } else
2545 Info->Node =
2546 createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2547 assert(!Info->Calls.empty());
2548 // The first call becomes the primary call for this caller node, and the
2549 // rest go in the matching calls list.
2550 Info->Node->setCall(Info->Calls.front());
2551 llvm::append_range(Info->Node->MatchingCalls,
2552 llvm::drop_begin(Info->Calls));
2553 // Save the primary call to node correspondence so that we can update
2554 // the NonAllocationCallToContextNodeMap, which is being iterated in the
2555 // caller of this function.
2556 NewCallToNode.push_back({Info->Node->Call, Info->Node});
2557 }
2558 CallerNodeToUse = Info->Node;
2559 }
2560
2561 // Don't need to move edge if we are using the original node;
2562 if (CallerNodeToUse == Node)
2563 continue;
2564
2565 moveCalleeEdgeToNewCaller(Edge, CallerNodeToUse);
2566 }
2567 // Now that we are done moving edges, clean up any caller edges that ended
2568 // up with no type or context ids. During moveCalleeEdgeToNewCaller all
2569 // caller edges from Node are replicated onto the new callers, and it
2570 // simplifies the handling to leave them until we have moved all
2571 // edges/context ids.
2572 for (auto &I : CalleeNodeToCallInfo)
2573 removeNoneTypeCallerEdges(I.second->Node);
2574 if (UnmatchedCalleesNode)
2575 removeNoneTypeCallerEdges(UnmatchedCalleesNode);
2576 removeNoneTypeCallerEdges(Node);
2577
2578 return true;
2579}
2580
2581uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2582 // In the Module (IR) case this is already the Id.
2583 return IdOrIndex;
2584}
2585
2586uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2587 // In the Index case this is an index into the stack id list in the summary
2588 // index, convert it to an Id.
2589 return Index.getStackIdAtIndex(IdOrIndex);
2590}
2591
2592template <typename DerivedCCG, typename FuncTy, typename CallTy>
2593bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
2594 CallTy Call, EdgeIter &EI,
2595 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {
2596 auto Edge = *EI;
2597 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2598 const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];
2599 // Will be populated in order of callee to caller if we find a chain of tail
2600 // calls between the profiled caller and callee.
2601 std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;
2602 if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc,
2603 FoundCalleeChain))
2604 return false;
2605
2606 // The usual case where the profiled callee matches that of the IR/summary.
2607 if (FoundCalleeChain.empty())
2608 return true;
2609
2610 auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {
2611 auto *CurEdge = Callee->findEdgeFromCaller(Caller);
2612 // If there is already an edge between these nodes, simply update it and
2613 // return.
2614 if (CurEdge) {
2615 CurEdge->ContextIds.insert_range(Edge->ContextIds);
2616 CurEdge->AllocTypes |= Edge->AllocTypes;
2617 return;
2618 }
2619 // Otherwise, create a new edge and insert it into the caller and callee
2620 // lists.
2621 auto NewEdge = std::make_shared<ContextEdge>(
2622 Callee, Caller, Edge->AllocTypes, Edge->ContextIds);
2623 Callee->CallerEdges.push_back(NewEdge);
2624 if (Caller == Edge->Caller) {
2625 // If we are inserting the new edge into the current edge's caller, insert
2626 // the new edge before the current iterator position, and then increment
2627 // back to the current edge.
2628 EI = Caller->CalleeEdges.insert(EI, NewEdge);
2629 ++EI;
2630 assert(*EI == Edge &&
2631 "Iterator position not restored after insert and increment");
2632 } else
2633 Caller->CalleeEdges.push_back(NewEdge);
2634 };
2635
2636 // Create new nodes for each found callee and connect in between the profiled
2637 // caller and callee.
2638 auto *CurCalleeNode = Edge->Callee;
2639 for (auto &[NewCall, Func] : FoundCalleeChain) {
2640 ContextNode *NewNode = nullptr;
2641 // First check if we have already synthesized a node for this tail call.
2642 if (TailCallToContextNodeMap.count(NewCall)) {
2643 NewNode = TailCallToContextNodeMap[NewCall];
2644 NewNode->AllocTypes |= Edge->AllocTypes;
2645 } else {
2646 FuncToCallsWithMetadata[Func].push_back({NewCall});
2647 // Create Node and record node info.
2648 NewNode = createNewNode(/*IsAllocation=*/false, Func, NewCall);
2649 TailCallToContextNodeMap[NewCall] = NewNode;
2650 NewNode->AllocTypes = Edge->AllocTypes;
2651 }
2652
2653 // Hook up node to its callee node
2654 AddEdge(NewNode, CurCalleeNode);
2655
2656 CurCalleeNode = NewNode;
2657 }
2658
2659 // Hook up edge's original caller to new callee node.
2660 AddEdge(Edge->Caller, CurCalleeNode);
2661
2662#ifndef NDEBUG
2663 // Save this because Edge's fields get cleared below when removed.
2664 auto *Caller = Edge->Caller;
2665#endif
2666
2667 // Remove old edge
2668 removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
2669
2670 // To simplify the increment of EI in the caller, subtract one from EI.
2671 // In the final AddEdge call we would have either added a new callee edge,
2672 // to Edge->Caller, or found an existing one. Either way we are guaranteed
2673 // that there is at least one callee edge.
2674 assert(!Caller->CalleeEdges.empty());
2675 --EI;
2676
2677 return true;
2678}
2679
2680bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2681 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
2682 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
2683 bool &FoundMultipleCalleeChains) {
2684 // Stop recursive search if we have already explored the maximum specified
2685 // depth.
2686 if (Depth > TailCallSearchDepth)
2687 return false;
2688
2689 auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {
2690 FoundCalleeChain.push_back({Callsite, F});
2691 };
2692
2693 auto *CalleeFunc = dyn_cast<Function>(CurCallee);
2694 if (!CalleeFunc) {
2695 auto *Alias = dyn_cast<GlobalAlias>(CurCallee);
2696 assert(Alias);
2697 CalleeFunc = dyn_cast<Function>(Alias->getAliasee());
2698 assert(CalleeFunc);
2699 }
2700
2701 // Look for tail calls in this function, and check if they either call the
2702 // profiled callee directly, or indirectly (via a recursive search).
2703 // Only succeed if there is a single unique tail call chain found between the
2704 // profiled caller and callee, otherwise we could perform incorrect cloning.
2705 bool FoundSingleCalleeChain = false;
2706 for (auto &BB : *CalleeFunc) {
2707 for (auto &I : BB) {
2708 auto *CB = dyn_cast<CallBase>(&I);
2709 if (!CB || !CB->isTailCall())
2710 continue;
2711 auto *CalledValue = CB->getCalledOperand();
2712 auto *CalledFunction = CB->getCalledFunction();
2713 if (CalledValue && !CalledFunction) {
2714 CalledValue = CalledValue->stripPointerCasts();
2715 // Stripping pointer casts can reveal a called function.
2716 CalledFunction = dyn_cast<Function>(CalledValue);
2717 }
2718 // Check if this is an alias to a function. If so, get the
2719 // called aliasee for the checks below.
2720 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
2721 assert(!CalledFunction &&
2722 "Expected null called function in callsite for alias");
2723 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
2724 }
2725 if (!CalledFunction)
2726 continue;
2727 if (CalledFunction == ProfiledCallee) {
2728 if (FoundSingleCalleeChain) {
2729 FoundMultipleCalleeChains = true;
2730 return false;
2731 }
2732 FoundSingleCalleeChain = true;
2733 FoundProfiledCalleeCount++;
2734 FoundProfiledCalleeDepth += Depth;
2735 if (Depth > FoundProfiledCalleeMaxDepth)
2736 FoundProfiledCalleeMaxDepth = Depth;
2737 SaveCallsiteInfo(&I, CalleeFunc);
2738 } else if (findProfiledCalleeThroughTailCalls(
2739 ProfiledCallee, CalledFunction, Depth + 1,
2740 FoundCalleeChain, FoundMultipleCalleeChains)) {
2741 // findProfiledCalleeThroughTailCalls should not have returned
2742 // true if FoundMultipleCalleeChains.
2743 assert(!FoundMultipleCalleeChains);
2744 if (FoundSingleCalleeChain) {
2745 FoundMultipleCalleeChains = true;
2746 return false;
2747 }
2748 FoundSingleCalleeChain = true;
2749 SaveCallsiteInfo(&I, CalleeFunc);
2750 } else if (FoundMultipleCalleeChains)
2751 return false;
2752 }
2753 }
2754
2755 return FoundSingleCalleeChain;
2756}
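// For example (hypothetical functions): if the starting callee A contains two
// tail calls, one reaching the profiled callee via B and another via C, the
// walk above sets FoundMultipleCalleeChains and fails, since it cannot tell
// which chain the profiled context actually took.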
2757
2758const Function *ModuleCallsiteContextGraph::getCalleeFunc(Instruction *Call) {
2759 auto *CB = dyn_cast<CallBase>(Call);
2760 if (!CB->getCalledOperand() || CB->isIndirectCall())
2761 return nullptr;
2762 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2763 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2764 if (Alias)
2765 return dyn_cast<Function>(Alias->getAliasee());
2766 return dyn_cast<Function>(CalleeVal);
2767}
2768
2769bool ModuleCallsiteContextGraph::calleeMatchesFunc(
2770 Instruction *Call, const Function *Func, const Function *CallerFunc,
2771 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
2772 auto *CB = dyn_cast<CallBase>(Call);
2773 if (!CB->getCalledOperand() || CB->isIndirectCall())
2774 return false;
2775 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2776 auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
2777 if (CalleeFunc == Func)
2778 return true;
2779 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2780 if (Alias && Alias->getAliasee() == Func)
2781 return true;
2782
2783 // Recursively search for the profiled callee through tail calls starting with
2784 // the actual Callee. The discovered tail call chain is saved in
2785 // FoundCalleeChain, and we will fixup the graph to include these callsites
2786 // after returning.
2787 // FIXME: We will currently redo the same recursive walk if we find the same
2788 // mismatched callee from another callsite. We can improve this with more
2789 // bookkeeping of the created chain of new nodes for each mismatch.
2790 unsigned Depth = 1;
2791 bool FoundMultipleCalleeChains = false;
2792 if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,
2793 FoundCalleeChain,
2794 FoundMultipleCalleeChains)) {
2795 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "
2796 << Func->getName() << " from " << CallerFunc->getName()
2797 << " that actually called " << CalleeVal->getName()
2798 << (FoundMultipleCalleeChains
2799 ? " (found multiple possible chains)"
2800 : "")
2801 << "\n");
2802 if (FoundMultipleCalleeChains)
2803 FoundProfiledCalleeNonUniquelyCount++;
2804 return false;
2805 }
2806
2807 return true;
2808}
2809
2810bool ModuleCallsiteContextGraph::sameCallee(Instruction *Call1,
2811 Instruction *Call2) {
2812 auto *CB1 = cast<CallBase>(Call1);
2813 if (!CB1->getCalledOperand() || CB1->isIndirectCall())
2814 return false;
2815 auto *CalleeVal1 = CB1->getCalledOperand()->stripPointerCasts();
2816 auto *CalleeFunc1 = dyn_cast<Function>(CalleeVal1);
2817 auto *CB2 = cast<CallBase>(Call2);
2818 if (!CB2->getCalledOperand() || CB2->isIndirectCall())
2819 return false;
2820 auto *CalleeVal2 = CB2->getCalledOperand()->stripPointerCasts();
2821 auto *CalleeFunc2 = dyn_cast<Function>(CalleeVal2);
2822 return CalleeFunc1 == CalleeFunc2;
2823}
2824
2825bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2826 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
2827 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
2828 bool &FoundMultipleCalleeChains) {
2829 // Stop recursive search if we have already explored the maximum specified
2830 // depth.
2831 if (Depth > TailCallSearchDepth)
2832 return false;
2833
2834 auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {
2835 // Make a CallsiteInfo for each discovered callee, if one hasn't already
2836 // been synthesized.
2837 if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||
2838 !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))
2839 // StackIds is empty (we don't have debug info available in the index for
2840 // these callsites)
2841 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
2842 std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());
2843 CallsiteInfo *NewCallsiteInfo =
2844 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
2845 FoundCalleeChain.push_back({NewCallsiteInfo, FS});
2846 };
2847
2848 // Look for tail calls in this function, and check if they either call the
2849 // profiled callee directly, or indirectly (via a recursive search).
2850 // Only succeed if there is a single unique tail call chain found between the
2851 // profiled caller and callee, otherwise we could perform incorrect cloning.
2852 bool FoundSingleCalleeChain = false;
2853 for (auto &S : CurCallee.getSummaryList()) {
2854 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2855 !isPrevailing(CurCallee.getGUID(), S.get()))
2856 continue;
2857 auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
2858 if (!FS)
2859 continue;
2860 auto FSVI = CurCallee;
2861 auto *AS = dyn_cast<AliasSummary>(S.get());
2862 if (AS)
2863 FSVI = AS->getAliaseeVI();
2864 for (auto &CallEdge : FS->calls()) {
2865 if (!CallEdge.second.hasTailCall())
2866 continue;
2867 if (CallEdge.first == ProfiledCallee) {
2868 if (FoundSingleCalleeChain) {
2869 FoundMultipleCalleeChains = true;
2870 return false;
2871 }
2872 FoundSingleCalleeChain = true;
2873 FoundProfiledCalleeCount++;
2874 FoundProfiledCalleeDepth += Depth;
2875 if (Depth > FoundProfiledCalleeMaxDepth)
2876 FoundProfiledCalleeMaxDepth = Depth;
2877 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2878 // Add FS to FSToVIMap in case it isn't already there.
2879 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2880 FSToVIMap[FS] = FSVI;
2881 } else if (findProfiledCalleeThroughTailCalls(
2882 ProfiledCallee, CallEdge.first, Depth + 1,
2883 FoundCalleeChain, FoundMultipleCalleeChains)) {
2884 // findProfiledCalleeThroughTailCalls should not have returned
2885 // true if FoundMultipleCalleeChains.
2886 assert(!FoundMultipleCalleeChains);
2887 if (FoundSingleCalleeChain) {
2888 FoundMultipleCalleeChains = true;
2889 return false;
2890 }
2891 FoundSingleCalleeChain = true;
2892 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2893 // Add FS to FSToVIMap in case it isn't already there.
2894 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2895 FSToVIMap[FS] = FSVI;
2896 } else if (FoundMultipleCalleeChains)
2897 return false;
2898 }
2899 }
2900
2901 return FoundSingleCalleeChain;
2902}
2903
2904const FunctionSummary *
2905IndexCallsiteContextGraph::getCalleeFunc(IndexCall &Call) {
2906 ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
2907 if (Callee.getSummaryList().empty())
2908 return nullptr;
2909 return dyn_cast<FunctionSummary>(Callee.getSummaryList()[0]->getBaseObject());
2910}
2911
2912bool IndexCallsiteContextGraph::calleeMatchesFunc(
2913 IndexCall &Call, const FunctionSummary *Func,
2914 const FunctionSummary *CallerFunc,
2915 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {
2916 ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
2917 // If there is no summary list then this is a call to an externally defined
2918 // symbol.
2919 AliasSummary *Alias =
2920 Callee.getSummaryList().empty()
2921 ? nullptr
2922 : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
2923 assert(FSToVIMap.count(Func));
2924 auto FuncVI = FSToVIMap[Func];
2925 if (Callee == FuncVI ||
2926 // If callee is an alias, check the aliasee, since only function
2927 // summary base objects will contain the stack node summaries and thus
2928 // get a context node.
2929 (Alias && Alias->getAliaseeVI() == FuncVI))
2930 return true;
2931
2932 // Recursively search for the profiled callee through tail calls starting with
2933 // the actual Callee. The discovered tail call chain is saved in
2934 // FoundCalleeChain, and we will fixup the graph to include these callsites
2935 // after returning.
2936 // FIXME: We will currently redo the same recursive walk if we find the same
2937 // mismatched callee from another callsite. We can improve this with more
2938 // bookkeeping of the created chain of new nodes for each mismatch.
2939 unsigned Depth = 1;
2940 bool FoundMultipleCalleeChains = false;
2941 if (!findProfiledCalleeThroughTailCalls(
2942 FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
2943 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI
2944 << " from " << FSToVIMap[CallerFunc]
2945 << " that actually called " << Callee
2946 << (FoundMultipleCalleeChains
2947 ? " (found multiple possible chains)"
2948 : "")
2949 << "\n");
2950 if (FoundMultipleCalleeChains)
2951 FoundProfiledCalleeNonUniquelyCount++;
2952 return false;
2953 }
2954
2955 return true;
2956}
2957
2958bool IndexCallsiteContextGraph::sameCallee(IndexCall &Call1, IndexCall &Call2) {
2959 ValueInfo Callee1 = dyn_cast_if_present<CallsiteInfo *>(Call1)->Callee;
2960 ValueInfo Callee2 = dyn_cast_if_present<CallsiteInfo *>(Call2)->Callee;
2961 return Callee1 == Callee2;
2962}
2963
2964template <typename DerivedCCG, typename FuncTy, typename CallTy>
2965void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
2966 const {
2967 print(dbgs());
2968 dbgs() << "\n";
2969}
2970
2971template <typename DerivedCCG, typename FuncTy, typename CallTy>
2972void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
2973 raw_ostream &OS) const {
2974 OS << "Node " << this << "\n";
2975 OS << "\t";
2976 printCall(OS);
2977 if (Recursive)
2978 OS << " (recursive)";
2979 OS << "\n";
2980 if (!MatchingCalls.empty()) {
2981 OS << "\tMatchingCalls:\n";
2982 for (auto &MatchingCall : MatchingCalls) {
2983 OS << "\t";
2984 MatchingCall.print(OS);
2985 OS << "\n";
2986 }
2987 }
2988 OS << "\tNodeId: " << NodeId << "\n";
2989 OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
2990 OS << "\tContextIds:";
2991 // Make a copy of the computed context ids that we can sort for stability.
2992 auto ContextIds = getContextIds();
2993 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2994 std::sort(SortedIds.begin(), SortedIds.end());
2995 for (auto Id : SortedIds)
2996 OS << " " << Id;
2997 OS << "\n";
2998 OS << "\tCalleeEdges:\n";
2999 for (auto &Edge : CalleeEdges)
3000 OS << "\t\t" << *Edge << " (Callee NodeId: " << Edge->Callee->NodeId
3001 << ")\n";
3002 OS << "\tCallerEdges:\n";
3003 for (auto &Edge : CallerEdges)
3004 OS << "\t\t" << *Edge << " (Caller NodeId: " << Edge->Caller->NodeId
3005 << ")\n";
3006 if (!Clones.empty()) {
3007 OS << "\tClones: ";
3008 bool First = true;
3009 for (auto *C : Clones) {
3010 if (!First)
3011 OS << ", ";
3012 First = false;
3013 OS << C << " NodeId: " << C->NodeId;
3014 }
3015 OS << "\n";
3016 } else if (CloneOf) {
3017 OS << "\tClone of " << CloneOf << " NodeId: " << CloneOf->NodeId << "\n";
3018 }
3019}
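// A node printed by the method above looks roughly like the following
// (illustrative only; pointer values and call text elided):
//   Node 0x...
//           <call instruction or summary>
//           NodeId: 4
//           AllocTypes: Cold
//           ContextIds: 1 3
//           CalleeEdges:
//                   Edge from Callee 0x... to Caller: 0x... AllocTypes: Cold ContextIds: 1 3 (Callee NodeId: 2)
//           CallerEdges: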
3020
3021template <typename DerivedCCG, typename FuncTy, typename CallTy>
3022void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
3023 const {
3024 print(dbgs());
3025 dbgs() << "\n";
3026}
3027
3028template <typename DerivedCCG, typename FuncTy, typename CallTy>
3029void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
3030 raw_ostream &OS) const {
3031 OS << "Edge from Callee " << Callee << " to Caller: " << Caller
3032 << (IsBackedge ? " (BE)" : "")
3033 << " AllocTypes: " << getAllocTypeString(AllocTypes);
3034 OS << " ContextIds:";
3035 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3036 std::sort(SortedIds.begin(), SortedIds.end());
3037 for (auto Id : SortedIds)
3038 OS << " " << Id;
3039}
3040
3041template <typename DerivedCCG, typename FuncTy, typename CallTy>
3042void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
3043 print(dbgs());
3044}
3045
3046template <typename DerivedCCG, typename FuncTy, typename CallTy>
3047void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
3048 raw_ostream &OS) const {
3049 OS << "Callsite Context Graph:\n";
3050 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3051 for (const auto Node : nodes<GraphType>(this)) {
3052 if (Node->isRemoved())
3053 continue;
3054 Node->print(OS);
3055 OS << "\n";
3056 }
3057}
3058
3059template <typename DerivedCCG, typename FuncTy, typename CallTy>
3060void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
3061 raw_ostream &OS) const {
3062 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3063 for (const auto Node : nodes<GraphType>(this)) {
3064 if (Node->isRemoved())
3065 continue;
3066 if (!Node->IsAllocation)
3067 continue;
3068 DenseSet<uint32_t> ContextIds = Node->getContextIds();
3069 auto AllocTypeFromCall = getAllocationCallType(Node->Call);
3070 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3071 std::sort(SortedIds.begin(), SortedIds.end());
3072 for (auto Id : SortedIds) {
3073 auto TypeI = ContextIdToAllocationType.find(Id);
3074 assert(TypeI != ContextIdToAllocationType.end());
3075 auto CSI = ContextIdToContextSizeInfos.find(Id);
3076 if (CSI != ContextIdToContextSizeInfos.end()) {
3077 for (auto &Info : CSI->second) {
3078 OS << "MemProf hinting: "
3079 << getAllocTypeString((uint8_t)TypeI->second)
3080 << " full allocation context " << Info.FullStackId
3081 << " with total size " << Info.TotalSize << " is "
3082 << getAllocTypeString(Node->AllocTypes) << " after cloning";
3083 if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall)
3084 OS << " marked " << getAllocTypeString((uint8_t)AllocTypeFromCall)
3085 << " due to cold byte percent";
3086 // Print the internal context id to aid debugging and visualization.
3087 OS << " (context id " << Id << ")";
3088 OS << "\n";
3089 }
3090 }
3091 }
3092 }
3093}
3094
3095template <typename DerivedCCG, typename FuncTy, typename CallTy>
3096void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
3097 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3098 for (const auto Node : nodes<GraphType>(this)) {
3099 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3100 for (auto &Edge : Node->CallerEdges)
3101 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
3102 }
3103}
3104
3105template <typename DerivedCCG, typename FuncTy, typename CallTy>
3106struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
3107 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3108 using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;
3109
3110 using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
3111 static NodeRef getNode(const NodePtrTy &P) { return P.get(); }
3112
3112
3113 using nodes_iterator =
3114 mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,
3115 decltype(&getNode)>;
3116
3117 static nodes_iterator nodes_begin(GraphType G) {
3118 return nodes_iterator(G->NodeOwner.begin(), &getNode);
3119 }
3120
3121 static nodes_iterator nodes_end(GraphType G) {
3122 return nodes_iterator(G->NodeOwner.end(), &getNode);
3123 }
3124
3125 static NodeRef getEntryNode(GraphType G) {
3126 return G->NodeOwner.begin()->get();
3127 }
3128
3129 using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
3130 static const ContextNode<DerivedCCG, FuncTy, CallTy> *
3131 GetCallee(const EdgePtrTy &P) {
3132 return P->Callee;
3133 }
3134
3135 using ChildIteratorType =
3136 mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
3137 DerivedCCG, FuncTy, CallTy>>>::const_iterator,
3138 decltype(&GetCallee)>;
3139
3140 static ChildIteratorType child_begin(NodeRef N) {
3141 return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
3142 }
3143
3144 static ChildIteratorType child_end(NodeRef N) {
3145 return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
3146 }
3147};
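// This GraphTraits specialization is what lets the generic LLVM graph
// utilities traverse the context graph: it provides the nodes<GraphType>()
// range used by print(), printTotalSizes() and check() above, and the
// node/child iteration needed by WriteGraph in exportToDot() below.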
3148
3149template <typename DerivedCCG, typename FuncTy, typename CallTy>
3150struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
3151 : public DefaultDOTGraphTraits {
3152 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {
3153 // If the user requested the full graph to be exported, but provided an
3154 // allocation id, or if the user gave a context id and requested more than
3155 // just a specific context to be exported, note that highlighting is
3156 // enabled.
3157 DoHighlight =
3158 (AllocIdForDot.getNumOccurrences() && DotGraphScope == DotScope::All) ||
3159 (ContextIdForDot.getNumOccurrences() &&
3160 DotGraphScope != DotScope::Context);
3161 }
3162
3163 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3164 using GTraits = GraphTraits<GraphType>;
3165 using NodeRef = typename GTraits::NodeRef;
3166 using ChildIteratorType = typename GTraits::ChildIteratorType;
3167
3168 static std::string getNodeLabel(NodeRef Node, GraphType G) {
3169 std::string LabelString =
3170 (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
3171 Twine(Node->OrigStackOrAllocId) + " NodeId: " + Twine(Node->NodeId))
3172 .str();
3173 LabelString += "\n";
3174 if (Node->hasCall()) {
3175 auto Func = G->NodeToCallingFunc.find(Node);
3176 assert(Func != G->NodeToCallingFunc.end());
3177 LabelString +=
3178 G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
3179 } else {
3180 LabelString += "null call";
3181 if (Node->Recursive)
3182 LabelString += " (recursive)";
3183 else
3184 LabelString += " (external)";
3185 }
3186 return LabelString;
3187 }
3188
3189 static std::string getNodeAttributes(NodeRef Node, GraphType G) {
3190 auto ContextIds = Node->getContextIds();
3191 // If highlighting is enabled, see if this node contains any of the context ids
3192 // of interest. If so, it will use a different color and a larger fontsize
3193 // (which makes the node larger as well).
3194 bool Highlight = false;
3195 if (DoHighlight) {
3196 assert(ContextIdForDot.getNumOccurrences() ||
3197 AllocIdForDot.getNumOccurrences());
3198 if (ContextIdForDot.getNumOccurrences())
3199 Highlight = ContextIds.contains(ContextIdForDot);
3200 else
3201 Highlight = set_intersects(ContextIds, G->DotAllocContextIds);
3202 }
3203 std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
3204 getContextIds(ContextIds) + "\"")
3205 .str();
3206 // Default fontsize is 14
3207 if (Highlight)
3208 AttributeString += ",fontsize=\"30\"";
3209 AttributeString +=
3210 (Twine(",fillcolor=\"") + getColor(Node->AllocTypes, Highlight) + "\"")
3211 .str();
3212 if (Node->CloneOf) {
3213 AttributeString += ",color=\"blue\"";
3214 AttributeString += ",style=\"filled,bold,dashed\"";
3215 } else
3216 AttributeString += ",style=\"filled\"";
3217 return AttributeString;
3218 }
3219
3220 static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
3221 GraphType G) {
3222 auto &Edge = *(ChildIter.getCurrent());
3223 // If highlighting is enabled, see if this edge contains any of the context ids
3224 // of interest. If so, it will use a different color and a heavier arrow
3225 // size and weight (the larger weight makes the highlighted path
3226 // straighter).
3227 bool Highlight = false;
3228 if (DoHighlight) {
3229 assert(ContextIdForDot.getNumOccurrences() ||
3230 AllocIdForDot.getNumOccurrences());
3231 if (ContextIdForDot.getNumOccurrences())
3232 Highlight = Edge->ContextIds.contains(ContextIdForDot);
3233 else
3234 Highlight = set_intersects(Edge->ContextIds, G->DotAllocContextIds);
3235 }
3236 auto Color = getColor(Edge->AllocTypes, Highlight);
3237 std::string AttributeString =
3238 (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" +
3239 // fillcolor is the arrow head and color is the line
3240 Twine(",fillcolor=\"") + Color + "\"" + Twine(",color=\"") + Color +
3241 "\"")
3242 .str();
3243 if (Edge->IsBackedge)
3244 AttributeString += ",style=\"dotted\"";
3245 // Default penwidth and weight are both 1.
3246 if (Highlight)
3247 AttributeString += ",penwidth=\"2.0\",weight=\"2\"";
3248 return AttributeString;
3249 }
3250
3251 // Since the NodeOwners list includes nodes that are no longer connected to
3252 // the graph, skip them here.
3253 static bool isNodeHidden(NodeRef Node, GraphType G) {
3254 if (Node->isRemoved())
3255 return true;
3256 // If a scope smaller than the full graph was requested, see if this node
3257 // contains any of the context ids of interest.
3258 if (DotGraphScope == DotScope::Alloc)
3259 return !set_intersects(Node->getContextIds(), G->DotAllocContextIds);
3260 if (DotGraphScope == DotScope::Context)
3261 return !Node->getContextIds().contains(ContextIdForDot);
3262 return false;
3263 }
3264
3265private:
3266 static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
3267 std::string IdString = "ContextIds:";
3268 if (ContextIds.size() < 100) {
3269 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3270 std::sort(SortedIds.begin(), SortedIds.end());
3271 for (auto Id : SortedIds)
3272 IdString += (" " + Twine(Id)).str();
3273 } else {
3274 IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
3275 }
3276 return IdString;
3277 }
3278
3279 static std::string getColor(uint8_t AllocTypes, bool Highlight) {
3280 // If DoHighlight is not enabled, we want to use the highlight colors for
3281 // NotCold and Cold, and the non-highlight color for NotCold+Cold. This is
3282 // both compatible with the color scheme before highlighting was supported,
3283 // and for the NotCold+Cold color the non-highlight color is a bit more
3284 // readable.
3285 if (AllocTypes == (uint8_t)AllocationType::NotCold)
3286 // Color "brown1" actually looks like a lighter red.
3287 return !DoHighlight || Highlight ? "brown1" : "lightpink";
3288 if (AllocTypes == (uint8_t)AllocationType::Cold)
3289 return !DoHighlight || Highlight ? "cyan" : "lightskyblue";
3290 if (AllocTypes ==
3291 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
3292 return Highlight ? "magenta" : "mediumorchid1";
3293 return "gray";
3294 }
3295
3296 static std::string getNodeId(NodeRef Node) {
3297 std::stringstream SStream;
3298 SStream << std::hex << "N0x" << (unsigned long long)Node;
3299 std::string Result = SStream.str();
3300 return Result;
3301 }
3302
3303 // True if we should highlight a specific context or allocation's contexts in
3304 // the emitted graph.
3305 static bool DoHighlight;
3306};
3307
3308template <typename DerivedCCG, typename FuncTy, typename CallTy>
3309bool DOTGraphTraits<
3310 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>::DoHighlight =
3311 false;
3312
3313template <typename DerivedCCG, typename FuncTy, typename CallTy>
3314void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
3315 std::string Label) const {
3316 WriteGraph(this, "", false, Label,
3317 DotFilePathPrefix + "ccg." + Label + ".dot");
3318}
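// Usage note (an assumption about the typical workflow, not from the source):
// the file written above, <DotFilePathPrefix>ccg.<Label>.dot, can be rendered
// with Graphviz, e.g. `dot -Tsvg ccg.<Label>.dot -o ccg.<Label>.svg`, to
// inspect the graph at each stage that calls exportToDot.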
3319
3320template <typename DerivedCCG, typename FuncTy, typename CallTy>
3321typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
3322CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
3323 const std::shared_ptr<ContextEdge> &Edge,
3324 DenseSet<uint32_t> ContextIdsToMove) {
3325 ContextNode *Node = Edge->Callee;
3326 assert(NodeToCallingFunc.count(Node));
3327 ContextNode *Clone =
3328 createNewNode(Node->IsAllocation, NodeToCallingFunc[Node], Node->Call);
3329 Node->addClone(Clone);
3330 Clone->MatchingCalls = Node->MatchingCalls;
3331 moveEdgeToExistingCalleeClone(Edge, Clone, /*NewClone=*/true,
3332 ContextIdsToMove);
3333 return Clone;
3334}
3335
3336template <typename DerivedCCG, typename FuncTy, typename CallTy>
3337void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3338 moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
3339 ContextNode *NewCallee, bool NewClone,
3340 DenseSet<uint32_t> ContextIdsToMove) {
3341 // NewCallee and Edge's current callee must be clones of the same original
3342 // node (Edge's current callee may be the original node too).
3343 assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
3344
3345 bool EdgeIsRecursive = Edge->Callee == Edge->Caller;
3346
3347 ContextNode *OldCallee = Edge->Callee;
3348
3349 // We might already have an edge to the new callee from earlier cloning for a
3350 // different allocation. If one exists we will reuse it.
3351 auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller);
3352
3353 // Callers will pass an empty ContextIdsToMove set when they want to move the
3354 // edge. Copy in Edge's ids for simplicity.
3355 if (ContextIdsToMove.empty())
3356 ContextIdsToMove = Edge->getContextIds();
3357
3358 // If we are moving all of Edge's ids, then just move the whole Edge.
3359 // Otherwise only move the specified subset, to a new edge if needed.
3360 if (Edge->getContextIds().size() == ContextIdsToMove.size()) {
3361 // First, update the alloc types on New Callee from Edge.
3362 // Do this before we potentially clear Edge's fields below!
3363 NewCallee->AllocTypes |= Edge->AllocTypes;
3364 // Moving the whole Edge.
3365 if (ExistingEdgeToNewCallee) {
3366 // Since we already have an edge to NewCallee, simply move the ids
3367 // onto it, and remove the existing Edge.
3368 ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
3369 ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes;
3370 assert(Edge->ContextIds == ContextIdsToMove);
3371 removeEdgeFromGraph(Edge.get());
3372 } else {
3373 // Otherwise just reconnect Edge to NewCallee.
3374 Edge->Callee = NewCallee;
3375 NewCallee->CallerEdges.push_back(Edge);
3376 // Remove it from callee where it was previously connected.
3377 OldCallee->eraseCallerEdge(Edge.get());
3378 // Don't need to update Edge's context ids since we are simply
3379 // reconnecting it.
3380 }
3381 } else {
3382 // Only moving a subset of Edge's ids.
3383 // Compute the alloc type of the subset of ids being moved.
3384 auto CallerEdgeAllocType = computeAllocType(ContextIdsToMove);
3385 if (ExistingEdgeToNewCallee) {
3386 // Since we already have an edge to NewCallee, simply move the ids
3387 // onto it.
3388 ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
3389 ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType;
3390 } else {
3391 // Otherwise, create a new edge to NewCallee for the ids being moved.
3392 auto NewEdge = std::make_shared<ContextEdge>(
3393 NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove);
3394 Edge->Caller->CalleeEdges.push_back(NewEdge);
3395 NewCallee->CallerEdges.push_back(NewEdge);
3396 }
3397 // In either case, need to update the alloc types on NewCallee, and remove
3398 // those ids and update the alloc type on the original Edge.
3399 NewCallee->AllocTypes |= CallerEdgeAllocType;
3400 set_subtract(Edge->ContextIds, ContextIdsToMove);
3401 Edge->AllocTypes = computeAllocType(Edge->ContextIds);
3402 }
3403 // Now walk the old callee node's callee edges and move Edge's context ids
3404 // over to the corresponding edge into the clone (which is created here if
3405 // this is a newly created clone).
3406 for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
3407 ContextNode *CalleeToUse = OldCalleeEdge->Callee;
3408 // If this is a direct recursion edge, use NewCallee (the clone) as the
3409 // callee as well, so that any edge updated/created here is also direct
3410 // recursive.
3411 if (CalleeToUse == OldCallee) {
3412 // If this is a recursive edge, see if we already moved a recursive edge
3413 // (which would have to have been this one) - if we were only moving a
3414 // subset of context ids it would still be on OldCallee.
3415 if (EdgeIsRecursive) {
3416 assert(OldCalleeEdge == Edge);
3417 continue;
3418 }
3419 CalleeToUse = NewCallee;
3420 }
3421 // The context ids moving to the new callee are the subset of this edge's
3422 // context ids and the context ids on the caller edge being moved.
3423 DenseSet<uint32_t> EdgeContextIdsToMove =
3424 set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove);
3425 set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
3426 OldCalleeEdge->AllocTypes =
3427 computeAllocType(OldCalleeEdge->getContextIds());
3428 if (!NewClone) {
3429 // Update context ids / alloc type on corresponding edge to NewCallee.
3430 // There is a chance this may not exist if we are reusing an existing
3431 // clone, specifically during function assignment, where we would have
3432 // removed none type edges after creating the clone. If we can't find
3433 // a corresponding edge there, fall through to the cloning below.
3434 if (auto *NewCalleeEdge = NewCallee->findEdgeFromCallee(CalleeToUse)) {
3435 NewCalleeEdge->getContextIds().insert_range(EdgeContextIdsToMove);
3436 NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
3437 continue;
3438 }
3439 }
3440 auto NewEdge = std::make_shared<ContextEdge>(
3441 CalleeToUse, NewCallee, computeAllocType(EdgeContextIdsToMove),
3442 EdgeContextIdsToMove);
3443 NewCallee->CalleeEdges.push_back(NewEdge);
3444 NewEdge->Callee->CallerEdges.push_back(NewEdge);
3445 }
3446 // Recompute the node alloc type now that its callee edges have been
3447 // updated (since we will compute from those edges).
3448 OldCallee->AllocTypes = OldCallee->computeAllocType();
3449 // OldCallee alloc type should be None iff its context id set is now empty.
3450 assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
3451 OldCallee->emptyContextIds());
3452 if (VerifyCCG) {
3453 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
3454 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
3455 for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
3456 checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
3457 /*CheckEdges=*/false);
3458 for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
3459 checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
3460 /*CheckEdges=*/false);
3461 }
3462}
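// Illustrative sketch of the move performed above (hypothetical ids). Given
//   Caller --Edge{1,2}--> OldCallee --{1,2,3}--> X
// moving only context id 1 to NewCallee (a clone of OldCallee) yields
//   Caller --{2}--> OldCallee --{2,3}--> X
//   Caller --{1}--> NewCallee --{1}--> X
// i.e. the caller edge is split, and the moved ids are also peeled off
// OldCallee's callee edges onto corresponding edges from NewCallee.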
3463
3464template <typename DerivedCCG, typename FuncTy, typename CallTy>
3465void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3466 moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
3467 ContextNode *NewCaller) {
3468 auto *OldCallee = Edge->Callee;
3469 auto *NewCallee = OldCallee;
3470 // If this edge was direct recursive, make any new/updated edge also direct
3471 // recursive to NewCaller.
3472 bool Recursive = Edge->Caller == Edge->Callee;
3473 if (Recursive)
3474 NewCallee = NewCaller;
3475
3476 ContextNode *OldCaller = Edge->Caller;
3477 OldCaller->eraseCalleeEdge(Edge.get());
3478
3479 // We might already have an edge to the new caller. If one exists we will
3480 // reuse it.
3481 auto ExistingEdgeToNewCaller = NewCaller->findEdgeFromCallee(NewCallee);
3482
3483 if (ExistingEdgeToNewCaller) {
3484 // Since we already have an edge to NewCaller, simply move the ids
3485 // onto it, and remove the existing Edge.
3486 ExistingEdgeToNewCaller->getContextIds().insert_range(
3487 Edge->getContextIds());
3488 ExistingEdgeToNewCaller->AllocTypes |= Edge->AllocTypes;
3489 Edge->ContextIds.clear();
3490 Edge->AllocTypes = (uint8_t)AllocationType::None;
3491 OldCallee->eraseCallerEdge(Edge.get());
3492 } else {
3493 // Otherwise just reconnect Edge to NewCaller.
3494 Edge->Caller = NewCaller;
3495 NewCaller->CalleeEdges.push_back(Edge);
3496 if (Recursive) {
3497 assert(NewCallee == NewCaller);
3498 // In the case of (direct) recursive edges, we update the callee as well
3499 // so that it becomes recursive on the new caller.
3500 Edge->Callee = NewCallee;
3501 NewCallee->CallerEdges.push_back(Edge);
3502 OldCallee->eraseCallerEdge(Edge.get());
3503 }
3504 // Don't need to update Edge's context ids since we are simply
3505 // reconnecting it.
3506 }
3507 // In either case, need to update the alloc types on New Caller.
3508 NewCaller->AllocTypes |= Edge->AllocTypes;
3509
3510 // Now walk the old caller node's caller edges and move Edge's context ids
3511 // over to the corresponding edge into the node (which is created here if
3512 // this is a newly created node). We can tell whether this is a newly created
3513 // node by seeing if it has any caller edges yet.
3514#ifndef NDEBUG
3515 bool IsNewNode = NewCaller->CallerEdges.empty();
3516#endif
3517 // If we just moved a direct recursive edge, presumably its context ids should
3518 // also flow out of OldCaller via some other non-recursive callee edge. We
3519 // don't want to remove the recursive context ids from other caller edges yet,
3520 // otherwise the context ids get into an inconsistent state on OldCaller.
3521 // We will update these context ids on the non-recursive caller edge when and
3522 // if they are updated on the non-recursive callee.
3523 if (!Recursive) {
3524 for (auto &OldCallerEdge : OldCaller->CallerEdges) {
3525 auto OldCallerCaller = OldCallerEdge->Caller;
3526 // The context ids moving to the new caller are the subset of this edge's
3527 // context ids and the context ids on the callee edge being moved.
3528 DenseSet<uint32_t> EdgeContextIdsToMove = set_intersection(
3529 OldCallerEdge->getContextIds(), Edge->getContextIds());
3530 if (OldCaller == OldCallerCaller) {
3531 OldCallerCaller = NewCaller;
3532 // Don't actually move this one. The caller will move it directly via a
3533 // call to this function with this as the Edge if it is appropriate to
3534 // move to a different node that has a matching callee (itself).
3535 continue;
3536 }
3537 set_subtract(OldCallerEdge->getContextIds(), EdgeContextIdsToMove);
3538 OldCallerEdge->AllocTypes =
3539 computeAllocType(OldCallerEdge->getContextIds());
3540 // In this function we expect that any pre-existing node already has edges
3541 // from the same callers as the old node. That should be true in the
3542 // current use case, where we will remove None-type edges after copying
3543 // over all caller edges from the callee.
3544 auto *ExistingCallerEdge = NewCaller->findEdgeFromCaller(OldCallerCaller);
3545 // Since we would have skipped caller edges when moving a direct recursive
3546 // edge, this may not hold true when recursive handling is enabled.
3547 assert(IsNewNode || ExistingCallerEdge || AllowRecursiveCallsites);
3548 if (ExistingCallerEdge) {
3549 ExistingCallerEdge->getContextIds().insert_range(EdgeContextIdsToMove);
3550 ExistingCallerEdge->AllocTypes |=
3551 computeAllocType(EdgeContextIdsToMove);
3552 continue;
3553 }
3554 auto NewEdge = std::make_shared<ContextEdge>(
3555 NewCaller, OldCallerCaller, computeAllocType(EdgeContextIdsToMove),
3556 EdgeContextIdsToMove);
3557 NewCaller->CallerEdges.push_back(NewEdge);
3558 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
3559 }
3560 }
3561 // Recompute the node alloc type now that its caller edges have been
3562 // updated (since we will compute from those edges).
3563 OldCaller->AllocTypes = OldCaller->computeAllocType();
3564 // OldCaller alloc type should be None iff its context id set is now empty.
3565 assert((OldCaller->AllocTypes == (uint8_t)AllocationType::None) ==
3566 OldCaller->emptyContextIds());
3567 if (VerifyCCG) {
3568 checkNode<DerivedCCG, FuncTy, CallTy>(OldCaller, /*CheckEdges=*/false);
3569 checkNode<DerivedCCG, FuncTy, CallTy>(NewCaller, /*CheckEdges=*/false);
3570 for (const auto &OldCallerEdge : OldCaller->CallerEdges)
3571 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallerEdge->Caller,
3572 /*CheckEdges=*/false);
3573 for (const auto &NewCallerEdge : NewCaller->CallerEdges)
3574 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallerEdge->Caller,
3575 /*CheckEdges=*/false);
3576 }
3577}
3578
3579template <typename DerivedCCG, typename FuncTy, typename CallTy>
3580void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3581 recursivelyRemoveNoneTypeCalleeEdges(
3582 ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
3583 auto Inserted = Visited.insert(Node);
3584 if (!Inserted.second)
3585 return;
3586
3587 removeNoneTypeCalleeEdges(Node);
3588
3589 for (auto *Clone : Node->Clones)
3590 recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited);
3591
3592 // The recursive call may remove some of this Node's caller edges.
3593 // Iterate over a copy and skip any that were removed.
3594 auto CallerEdges = Node->CallerEdges;
3595 for (auto &Edge : CallerEdges) {
3596 // Skip any that have been removed by an earlier recursive call.
3597 if (Edge->isRemoved()) {
3598 assert(!is_contained(Node->CallerEdges, Edge));
3599 continue;
3600 }
3601 recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited);
3602 }
3603}
3604
3605// This is the standard DFS based backedge discovery algorithm.
3606template <typename DerivedCCG, typename FuncTy, typename CallTy>
3607void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges() {
3608 // If we are cloning recursive contexts, find and mark backedges from all root
3609 // callers, using the typical DFS based backedge analysis.
3610 if (!CloneRecursiveContexts)
3611 return;
3612 DenseSet<const ContextNode *> Visited;
3613 DenseSet<const ContextNode *> CurrentStack;
3614 for (auto &Entry : NonAllocationCallToContextNodeMap) {
3615 auto *Node = Entry.second;
3616 if (Node->isRemoved())
3617 continue;
3618 // It is a root if it doesn't have callers.
3619 if (!Node->CallerEdges.empty())
3620 continue;
3621 markBackedges(Node, Visited, CurrentStack);
3622 assert(CurrentStack.empty());
3623 }
3624}
3625
3626// Recursive helper for above markBackedges method.
3627template <typename DerivedCCG, typename FuncTy, typename CallTy>
3628void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges(
3629 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3630 DenseSet<const ContextNode *> &CurrentStack) {
3631 auto I = Visited.insert(Node);
3632 // We should only call this for unvisited nodes.
3633 assert(I.second);
3634 (void)I;
3635 for (auto &CalleeEdge : Node->CalleeEdges) {
3636 auto *Callee = CalleeEdge->Callee;
3637 if (Visited.count(Callee)) {
3638 // Since this was already visited we need to check if it is currently on
3639 // the recursive stack in which case it is a backedge.
3640 if (CurrentStack.count(Callee))
3641 CalleeEdge->IsBackedge = true;
3642 continue;
3643 }
3644 CurrentStack.insert(Callee);
3645 markBackedges(Callee, Visited, CurrentStack);
3646 CurrentStack.erase(Callee);
3647 }
3648}
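// For example (hypothetical nodes): with caller-to-callee paths
// main -> A -> B -> A (mutual recursion between A and B), the DFS above
// visits A then B, and when it examines B's callee edge back to A while A is
// still on CurrentStack, it marks that edge IsBackedge. identifyClones later
// defers cloning along edges marked this way.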
3649
3650template <typename DerivedCCG, typename FuncTy, typename CallTy>
3651void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
3652 DenseSet<const ContextNode *> Visited;
3653 for (auto &Entry : AllocationCallToContextNodeMap) {
3654 Visited.clear();
3655 identifyClones(Entry.second, Visited, Entry.second->getContextIds());
3656 }
3657 Visited.clear();
3658 for (auto &Entry : AllocationCallToContextNodeMap)
3659 recursivelyRemoveNoneTypeCalleeEdges(Entry.second, Visited);
3660 if (VerifyCCG)
3661 check();
3662}
3663
3664 // Helper function to check an AllocType is cold or notcold or both.
3665 bool checkColdOrNotCold(uint8_t AllocType) {
3666 return (AllocType == (uint8_t)AllocationType::Cold) ||
3667 (AllocType == (uint8_t)AllocationType::NotCold) ||
3668 (AllocType ==
3669 ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
3670 }
3671
3672template <typename DerivedCCG, typename FuncTy, typename CallTy>
3673void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
3674 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3675 const DenseSet<uint32_t> &AllocContextIds) {
3676 if (VerifyNodes)
3677 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3678 assert(!Node->CloneOf);
3679
3680 // If Node has a null call, then either it wasn't found in the module (regular
3681 // LTO) or summary index (ThinLTO), or there were other conditions blocking
3682 // cloning (e.g. recursion, calls multiple targets, etc).
3683 // Do this here so that we don't try to recursively clone callers below, which
3684 // isn't useful at least for this node.
3685 if (!Node->hasCall())
3686 return;
3687
3688 // No need to look at any callers if allocation type already unambiguous.
3689 if (hasSingleAllocType(Node->AllocTypes))
3690 return;
3691
3692#ifndef NDEBUG
3693 auto Insert =
3694#endif
3695 Visited.insert(Node);
3696 // We should not have visited this node yet.
3697 assert(Insert.second);
3698 // The recursive call to identifyClones may delete the current edge from the
3699 // CallerEdges vector. Make a copy and iterate on that, simpler than passing
3700 // in an iterator and having recursive call erase from it. Other edges may
3701 // also get removed during the recursion, which will have null Callee and
3702 // Caller pointers (and are deleted later), so we skip those below.
3703 {
3704 auto CallerEdges = Node->CallerEdges;
3705 for (auto &Edge : CallerEdges) {
3706 // Skip any that have been removed by an earlier recursive call.
3707 if (Edge->isRemoved()) {
3708 assert(!is_contained(Node->CallerEdges, Edge));
3709 continue;
3710 }
3711 // Defer backedges. See comments further below where these edges are
3712 // handled during the cloning of this Node.
3713 if (Edge->IsBackedge) {
3714 // We should only mark these if cloning recursive contexts, where we
3715 // need to do this deferral.
3716 assert(CloneRecursiveContexts);
3717 continue;
3718 }
3719 // Ignore any caller we previously visited via another edge.
3720 if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
3721 identifyClones(Edge->Caller, Visited, AllocContextIds);
3722 }
3723 }
3724 }
3725
3726 // Check if we reached an unambiguous call or have only a single caller.
3727 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
3728 return;
3729
3730 // We need to clone.
3731
3732 // Try to keep the original version as alloc type NotCold. This will make
3733 // cases with indirect calls or any other situation with an unknown call to
3734 // the original function get the default behavior. We do this by sorting the
3735 // CallerEdges of the Node we will clone by alloc type.
3736 //
3737 // Give NotCold edge the lowest sort priority so those edges are at the end of
3738 // the caller edges vector, and stay on the original version (since the below
3739 // code clones greedily until it finds all remaining edges have the same type
3740 // and leaves the remaining ones on the original Node).
3741 //
3742 // We shouldn't actually have any None type edges, so the sorting priority for
3743 // that is arbitrary, and we assert in that case below.
3744 const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
3745 /*Cold*/ 1,
3746 /*NotColdCold*/ 2};
3747 llvm::stable_sort(Node->CallerEdges,
3748 [&](const std::shared_ptr<ContextEdge> &A,
3749 const std::shared_ptr<ContextEdge> &B) {
3750 // Nodes with non-empty context ids should be sorted
3751 // before those with empty context ids.
3752 if (A->ContextIds.empty())
3753 // Either B ContextIds are non-empty (in which case we
3754 // should return false because B < A), or B ContextIds
3755 // are empty, in which case they are equal, and we
3756 // should maintain the original relative ordering.
3757 return false;
3758 if (B->ContextIds.empty())
3759 return true;
3760
3761 if (A->AllocTypes == B->AllocTypes)
3762 // Use the first context id for each edge as a
3763 // tie-breaker.
3764 return *A->ContextIds.begin() < *B->ContextIds.begin();
3765 return AllocTypeCloningPriority[A->AllocTypes] <
3766 AllocTypeCloningPriority[B->AllocTypes];
3767 });
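// Net effect of the priority table and sort above: Cold-only caller edges are
// processed (and therefore cloned for) first, mixed NotCold+Cold edges next,
// and NotCold-only edges last, so NotCold contexts tend to remain on the
// original, uncloned version of the function.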
3768
3769 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3770
3771 DenseSet<uint32_t> RecursiveContextIds;
3773 // If we are allowing recursive callsites, but have also disabled recursive
3774 // contexts, look for context ids that show up in multiple caller edges.
3775 if (AllowRecursiveCallsites && !AllowRecursiveContexts) {
3776 DenseSet<uint32_t> AllCallerContextIds;
3777 for (auto &CE : Node->CallerEdges) {
3778 // Reserve space for the largest set of caller context ids, since we know
3779 // the final set will be at least that large.
3780 AllCallerContextIds.reserve(CE->getContextIds().size());
3781 for (auto Id : CE->getContextIds())
3782 if (!AllCallerContextIds.insert(Id).second)
3783 RecursiveContextIds.insert(Id);
3784 }
3785 }
3786
3787 // Iterate until we find no more opportunities for disambiguating the alloc
3788 // types via cloning. In most cases this loop will terminate once the Node
3789 // has a single allocation type, in which case no more cloning is needed.
3790 // Iterate over a copy of Node's caller edges, since we may need to remove
3791 // edges in the moveEdgeTo* methods, and this simplifies the handling and
3792 // makes it less error-prone.
3793 auto CallerEdges = Node->CallerEdges;
3794 for (auto &CallerEdge : CallerEdges) {
3795 // Skip any that have been removed by an earlier recursive call.
3796 if (CallerEdge->isRemoved()) {
3797 assert(!is_contained(Node->CallerEdges, CallerEdge));
3798 continue;
3799 }
3800 assert(CallerEdge->Callee == Node);
3801
3802 // See if cloning the prior caller edge left this node with a single alloc
3803 // type or a single caller. In that case no more cloning of Node is needed.
3804 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
3805 break;
3806
3807 // If the caller was not successfully matched to a call in the IR/summary,
3808 // there is no point in trying to clone for it as we can't update that call.
3809 if (!CallerEdge->Caller->hasCall())
3810 continue;
3811
3812 // Only need to process the ids along this edge pertaining to the given
3813 // allocation.
3814 auto CallerEdgeContextsForAlloc =
3815 set_intersection(CallerEdge->getContextIds(), AllocContextIds);
3816 if (!RecursiveContextIds.empty())
3817 CallerEdgeContextsForAlloc =
3818 set_difference(CallerEdgeContextsForAlloc, RecursiveContextIds);
3819 if (CallerEdgeContextsForAlloc.empty())
3820 continue;
3821
3822 auto CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
3823
3824 // Compute the node callee edge alloc types corresponding to the context ids
3825 // for this caller edge.
3826 std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
3827 CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
3828 for (auto &CalleeEdge : Node->CalleeEdges)
3829 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
3830 CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
3831
3832 // Don't clone if doing so will not disambiguate any alloc types amongst
3833 // caller edges (including the callee edges that would be cloned).
3834 // Otherwise we will simply move all edges to the clone.
3835 //
3836 // First check if by cloning we will disambiguate the caller allocation
3837 // type from node's allocation type. Query allocTypeToUse so that we don't
3838 // bother cloning to distinguish NotCold+Cold from NotCold. Note that
3839 // neither of these should be None type.
3840 //
3841 // Then check if by cloning node at least one of the callee edges will be
3842 // disambiguated by splitting out different context ids.
3843 //
3844 // However, always do the cloning if this is a backedge, in which case we
3845 // have not yet cloned along this caller edge.
3846 assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
3847 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3848 if (!CallerEdge->IsBackedge &&
3849 allocTypeToUse(CallerAllocTypeForAlloc) ==
3850 allocTypeToUse(Node->AllocTypes) &&
3851 allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
3852 CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
3853 continue;
3854 }
3855
3856 if (CallerEdge->IsBackedge) {
3857 // We should only mark these if cloning recursive contexts, where we
3858 // need to do this deferral.
3859 assert(CloneRecursiveContexts);
3860 DeferredBackedges++;
3861 }
3862
3863 // If this is a backedge, we now do recursive cloning starting from its
3864 // caller since we may have moved unambiguous caller contexts to a clone
3865 // of this Node in a previous iteration of the current loop, giving more
3866 // opportunity for cloning through the backedge. Because we sorted the
3867 // caller edges earlier so that cold caller edges are first, we would have
3868 // visited and cloned this node for any unambiguously cold non-recursive
3869 // callers before any ambiguous backedge callers. Note that we don't do this
3870 // if the caller is already cloned or visited during cloning (e.g. via a
3871 // different context path from the allocation).
3872 // TODO: Can we do better in the case where the caller was already visited?
3873 if (CallerEdge->IsBackedge && !CallerEdge->Caller->CloneOf &&
3874 !Visited.count(CallerEdge->Caller)) {
3875 const auto OrigIdCount = CallerEdge->getContextIds().size();
3876 // Now do the recursive cloning of this backedge's caller, which was
3877 // deferred earlier.
3878 identifyClones(CallerEdge->Caller, Visited, CallerEdgeContextsForAlloc);
3879 removeNoneTypeCalleeEdges(CallerEdge->Caller);
3880 // See if the recursive call to identifyClones moved the context ids to a
3881 // new edge from this node to a clone of caller, and switch to looking at
3882 // that new edge so that we clone Node for the new caller clone.
3883 bool UpdatedEdge = false;
3884 if (OrigIdCount > CallerEdge->getContextIds().size()) {
3885 for (auto E : Node->CallerEdges) {
3886 // Only interested in clones of the current edge's caller.
3887 if (E->Caller->CloneOf != CallerEdge->Caller)
3888 continue;
3889 // See if this edge contains any of the context ids originally on the
3890 // current caller edge.
3891 auto CallerEdgeContextsForAllocNew =
3892 set_intersection(CallerEdgeContextsForAlloc, E->getContextIds());
3893 if (CallerEdgeContextsForAllocNew.empty())
3894 continue;
3895 // Make sure we don't pick a previously existing caller edge of this
3896 // Node, which would be processed on a different iteration of the
3897 // outer loop over the saved CallerEdges.
3898 if (llvm::is_contained(CallerEdges, E))
3899 continue;
3900 // The CallerAllocTypeForAlloc and CalleeEdgeAllocTypesForCallerEdge
3901 // are updated further below for all cases where we just invoked
3902 // identifyClones recursively.
3903 CallerEdgeContextsForAlloc.swap(CallerEdgeContextsForAllocNew);
3904 CallerEdge = E;
3905 UpdatedEdge = true;
3906 break;
3907 }
3908 }
3909 // If cloning removed this edge (and we didn't update it to a new edge
3910 // above), we're done with this edge. It's possible we moved all of the
3911 // context ids to an existing clone, in which case there's no need to do
3912 // further processing for them.
3913 if (CallerEdge->isRemoved())
3914 continue;
3915
3916 // Now we need to update the information used for the cloning decisions
3917 // further below, as we may have modified edges and their context ids.
3918
3919 // Note if we changed the CallerEdge above we would have already updated
3920 // the context ids.
3921 if (!UpdatedEdge) {
3922 CallerEdgeContextsForAlloc = set_intersection(
3923 CallerEdgeContextsForAlloc, CallerEdge->getContextIds());
3924 if (CallerEdgeContextsForAlloc.empty())
3925 continue;
3926 }
3927 // Update the other information that depends on the edges and on the now
3928 // updated CallerEdgeContextsForAlloc.
3929 CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
3930 CalleeEdgeAllocTypesForCallerEdge.clear();
3931 for (auto &CalleeEdge : Node->CalleeEdges) {
3932 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
3933 CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
3934 }
3935 }
3936
3937 // First see if we can use an existing clone. Check each clone and its
3938 // callee edges for matching alloc types.
3939 ContextNode *Clone = nullptr;
3940 for (auto *CurClone : Node->Clones) {
3941 if (allocTypeToUse(CurClone->AllocTypes) !=
3942 allocTypeToUse(CallerAllocTypeForAlloc))
3943 continue;
3944
3945 bool BothSingleAlloc = hasSingleAllocType(CurClone->AllocTypes) &&
3946 hasSingleAllocType(CallerAllocTypeForAlloc);
3947 // The above check should mean that if both have single alloc types that
3948 // they should be equal.
3949 assert(!BothSingleAlloc ||
3950 CurClone->AllocTypes == CallerAllocTypeForAlloc);
3951
3952 // If either both have a single alloc type (which are the same), or if the
3953 // clone's callee edges have the same alloc types as those for the current
3954 // allocation on Node's callee edges (CalleeEdgeAllocTypesForCallerEdge),
3955 // then we can reuse this clone.
3956 if (BothSingleAlloc || allocTypesMatchClone<DerivedCCG, FuncTy, CallTy>(
3957 CalleeEdgeAllocTypesForCallerEdge, CurClone)) {
3958 Clone = CurClone;
3959 break;
3960 }
3961 }
3962
3963 // The edge iterator is adjusted when we move the CallerEdge to the clone.
3964 if (Clone)
3965 moveEdgeToExistingCalleeClone(CallerEdge, Clone, /*NewClone=*/false,
3966 CallerEdgeContextsForAlloc);
3967 else
3968 Clone = moveEdgeToNewCalleeClone(CallerEdge, CallerEdgeContextsForAlloc);
3969
3970 // Sanity check that no alloc types on clone or its edges are None.
3971 assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
3972 }
3973
3974 // We should still have some context ids on the original Node.
3975 assert(!Node->emptyContextIds());
3976
3977 // Sanity check that no alloc types on node or edges are None.
3978 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3979
3980 if (VerifyNodes)
3981 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3982}
3983
3984void ModuleCallsiteContextGraph::updateAllocationCall(
3985 CallInfo &Call, AllocationType AllocType) {
3986 std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
3988 auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
3989 "memprof", AllocTypeString);
3990 cast<CallBase>(Call.call())->addFnAttr(A);
3991 OREGetter(Call.call()->getFunction())
3992 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
3993 << ore::NV("AllocationCall", Call.call()) << " in clone "
3994 << ore::NV("Caller", Call.call()->getFunction())
3995 << " marked with memprof allocation attribute "
3996 << ore::NV("Attribute", AllocTypeString));
3997}
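// For reference, the attribute added above shows up in the IR roughly as
// (illustrative):
//   %call = call ptr @malloc(i64 8) #1
//   ...
//   attributes #1 = { "memprof"="cold" }
// i.e. a string function attribute named "memprof" whose value is the
// allocation type string.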
3998
3999 void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
4000 AllocationType AllocType) {
4001 auto *AI = cast<AllocInfo *>(Call.call());
4002 assert(AI);
4003 assert(AI->Versions.size() > Call.cloneNo());
4004 AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
4005}
4006
4007 AllocationType
4008 ModuleCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
4009 const auto *CB = cast<CallBase>(Call.call());
4010 if (!CB->getAttributes().hasFnAttr("memprof"))
4011 return AllocationType::None;
4012 return CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
4013 ? AllocationType::Cold
4014 : AllocationType::NotCold;
4015}
4016
4017 AllocationType
4018 IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
4019 const auto *AI = cast<AllocInfo *>(Call.call());
4020 assert(AI->Versions.size() > Call.cloneNo());
4021 return (AllocationType)AI->Versions[Call.cloneNo()];
4022}
4023
4024void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
4025 FuncInfo CalleeFunc) {
4026 auto *CurF = getCalleeFunc(CallerCall.call());
4027 auto NewCalleeCloneNo = CalleeFunc.cloneNo();
4028 if (isMemProfClone(*CurF)) {
4029 // If we already assigned this callsite to call a specific non-default
4030 // clone (i.e. not the original function which is clone 0), ensure that we
4031 // aren't trying to now update it to call a different clone, which is
4032 // indicative of a bug in the graph or function assignment.
4033 auto CurCalleeCloneNo = getMemProfCloneNum(*CurF);
4034 if (CurCalleeCloneNo != NewCalleeCloneNo) {
4035 LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
4036 << CurCalleeCloneNo << " now " << NewCalleeCloneNo
4037 << "\n");
4038 MismatchedCloneAssignments++;
4039 }
4040 }
4041 if (NewCalleeCloneNo > 0)
4042 cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
4043 OREGetter(CallerCall.call()->getFunction())
4044 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
4045 << ore::NV("Call", CallerCall.call()) << " in clone "
4046 << ore::NV("Caller", CallerCall.call()->getFunction())
4047 << " assigned to call function clone "
4048 << ore::NV("Callee", CalleeFunc.func()));
4049}
4050
4051void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
4052 FuncInfo CalleeFunc) {
4053 auto *CI = cast<CallsiteInfo *>(CallerCall.call());
4054 assert(CI &&
4055 "Caller cannot be an allocation which should not have profiled calls");
4056 assert(CI->Clones.size() > CallerCall.cloneNo());
4057 auto NewCalleeCloneNo = CalleeFunc.cloneNo();
4058 auto &CurCalleeCloneNo = CI->Clones[CallerCall.cloneNo()];
4059 // If we already assigned this callsite to call a specific non-default
4060 // clone (i.e. not the original function which is clone 0), ensure that we
4061 // aren't trying to now update it to call a different clone, which is
4062 // indicative of a bug in the graph or function assignment.
4063 if (CurCalleeCloneNo != 0 && CurCalleeCloneNo != NewCalleeCloneNo) {
4064 LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
4065 << CurCalleeCloneNo << " now " << NewCalleeCloneNo
4066 << "\n");
4067 MismatchedCloneAssignments++;
4068 }
4069 CurCalleeCloneNo = NewCalleeCloneNo;
4070}
4071
4072// Update the debug information attached to NewFunc to use the clone Name. Note
4073// this needs to be done for both any existing DISubprogram for the definition,
4074// as well as any separate declaration DISubprogram.
4075static void updateSubprogramLinkageName(Function *NewFunc, StringRef Name) {
4076  assert(Name == NewFunc->getName());
4077 auto *SP = NewFunc->getSubprogram();
4078 if (!SP)
4079 return;
4080 auto *MDName = MDString::get(NewFunc->getParent()->getContext(), Name);
4081 SP->replaceLinkageName(MDName);
4082 DISubprogram *Decl = SP->getDeclaration();
4083 if (!Decl)
4084 return;
4085 TempDISubprogram NewDecl = Decl->clone();
4086 NewDecl->replaceLinkageName(MDName);
4087 SP->replaceDeclaration(MDNode::replaceWithUniqued(std::move(NewDecl)));
4088}
4089
4090CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
4091 Instruction *>::FuncInfo
4092ModuleCallsiteContextGraph::cloneFunctionForCallsite(
4093 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
4094 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
4095 // Use existing LLVM facilities for cloning and obtaining Call in clone
4096 ValueToValueMapTy VMap;
4097 auto *NewFunc = CloneFunction(Func.func(), VMap);
4098 std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
4099 assert(!Func.func()->getParent()->getFunction(Name));
4100 NewFunc->setName(Name);
4101 updateSubprogramLinkageName(NewFunc, Name);
4102 for (auto &Inst : CallsWithMetadataInFunc) {
4103 // This map always has the initial version in it.
4104 assert(Inst.cloneNo() == 0);
4105 CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
4106 }
4107 OREGetter(Func.func())
4108 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
4109 << "created clone " << ore::NV("NewFunction", NewFunc));
4110 return {NewFunc, CloneNo};
4111}
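The clone-and-remap step above relies on CloneFunction filling in a ValueToValueMapTy from original values to their clones, which is then used to find the cloned copy of each call of interest. A minimal sketch of that pattern, with an illustrative helper name and assuming OldInst belongs to the function being cloned:

#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"

// Illustrative only: clone F into its module and return the clone of OldInst.
static llvm::Instruction *cloneAndLookup(llvm::Function *F,
                                         llvm::Instruction *OldInst) {
  llvm::ValueToValueMapTy VMap;
  llvm::Function *NewF = llvm::CloneFunction(F, VMap);
  (void)NewF; // CloneFunction inserts the clone next to F in the module.
  // VMap now maps every original value in F to its counterpart in the clone.
  return llvm::cast<llvm::Instruction>(VMap[OldInst]);
}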
4112
4113CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
4114 IndexCall>::FuncInfo
4115IndexCallsiteContextGraph::cloneFunctionForCallsite(
4116 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
4117 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
4118 // Check how many clones we have of Call (and therefore function).
4119 // The next clone number is the current size of versions array.
4120 // Confirm this matches the CloneNo provided by the caller, which is based on
4121 // the number of function clones we have.
4122 assert(CloneNo == (isa<AllocInfo *>(Call.call())
4123 ? cast<AllocInfo *>(Call.call())->Versions.size()
4124 : cast<CallsiteInfo *>(Call.call())->Clones.size()));
4125 // Walk all the instructions in this function. Create a new version for
4126 // each (by adding an entry to the Versions/Clones summary array), and copy
4127 // over the version being called for the function clone being cloned here.
4128 // Additionally, add an entry to the CallMap for the new function clone,
4129 // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
4130 // to the new call clone.
4131 for (auto &Inst : CallsWithMetadataInFunc) {
4132 // This map always has the initial version in it.
4133 assert(Inst.cloneNo() == 0);
4134 if (auto *AI = dyn_cast<AllocInfo *>(Inst.call())) {
4135 assert(AI->Versions.size() == CloneNo);
4136 // We assign the allocation type later (in updateAllocationCall), just add
4137 // an entry for it here.
4138 AI->Versions.push_back(0);
4139 } else {
4140 auto *CI = cast<CallsiteInfo *>(Inst.call());
4141 assert(CI && CI->Clones.size() == CloneNo);
4142 // We assign the clone number later (in updateCall), just add an entry for
4143 // it here.
4144 CI->Clones.push_back(0);
4145 }
4146 CallMap[Inst] = {Inst.call(), CloneNo};
4147 }
4148 return {Func.func(), CloneNo};
4149}
4150
4151// We perform cloning for each allocation node separately. However, this
4152// sometimes results in a situation where the same node calls multiple
4153// clones of the same callee, created for different allocations. This
4154// causes issues when assigning functions to these clones, as each node can
4155// in reality only call a single callee clone.
4156//
4157// To address this, before assigning functions, merge callee clone nodes as
4158// needed using a post order traversal from the allocations. We attempt to
4159// use existing clones as the merge node when legal, and to share them
4160// among callers with the same properties (callers calling the same set of
4161// callee clone nodes for the same allocations).
4162//
4163// Without this fix, in some cases incorrect function assignment will lead
4164// to calling the wrong allocation clone.
4165template <typename DerivedCCG, typename FuncTy, typename CallTy>
4166void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones() {
4167 if (!MergeClones)
4168 return;
4169
4170 // Generate a map from context id to the associated allocation node for use
4171 // when merging clones.
4172 DenseMap<uint32_t, ContextNode *> ContextIdToAllocationNode;
4173 for (auto &Entry : AllocationCallToContextNodeMap) {
4174 auto *Node = Entry.second;
4175 for (auto Id : Node->getContextIds())
4176 ContextIdToAllocationNode[Id] = Node->getOrigNode();
4177 for (auto *Clone : Node->Clones) {
4178 for (auto Id : Clone->getContextIds())
4179 ContextIdToAllocationNode[Id] = Clone->getOrigNode();
4180 }
4181 }
4182
4183 // Post order traversal starting from allocations to ensure each callsite
4184 // calls a single clone of its callee. Callee nodes that are clones of each
4185 // other are merged (via new merge nodes if needed) to achieve this.
4186 DenseSet<const ContextNode *> Visited;
4187 for (auto &Entry : AllocationCallToContextNodeMap) {
4188 auto *Node = Entry.second;
4189
4190 mergeClones(Node, Visited, ContextIdToAllocationNode);
4191
4192 // Make a copy so the recursive post order traversal that may create new
4193 // clones doesn't mess up iteration. Note that the recursive traversal
4194 // itself does not call mergeClones on any of these nodes, which are all
4195 // (clones of) allocations.
4196 auto Clones = Node->Clones;
4197 for (auto *Clone : Clones)
4198 mergeClones(Clone, Visited, ContextIdToAllocationNode);
4199 }
4200
4201 if (DumpCCG) {
4202 dbgs() << "CCG after merging:\n";
4203 dbgs() << *this;
4204 }
4205 if (ExportToDot)
4206 exportToDot("aftermerge");
4207
4208 if (VerifyCCG) {
4209 check();
4210 }
4211}
4212
4213// Recursive helper for above mergeClones method.
4214template <typename DerivedCCG, typename FuncTy, typename CallTy>
4215void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones(
4216 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
4217 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
4218 auto Inserted = Visited.insert(Node);
4219 if (!Inserted.second)
4220 return;
4221
4222 // Iteratively perform merging on this node to handle new caller nodes created
4223 // during the recursive traversal. We could do something more elegant such as
4224 // maintain a worklist, but this is a simple approach that doesn't cause a
4225  // maintaining a worklist, but this is a simple approach that doesn't cause a
4226  // measurable compile time effect, as most nodes don't have many caller
4227 bool FoundUnvisited = true;
4228 unsigned Iters = 0;
4229 while (FoundUnvisited) {
4230 Iters++;
4231 FoundUnvisited = false;
4232 // Make a copy since the recursive call may move a caller edge to a new
4233 // callee, messing up the iterator.
4234 auto CallerEdges = Node->CallerEdges;
4235 for (auto CallerEdge : CallerEdges) {
4236 // Skip any caller edge moved onto a different callee during recursion.
4237 if (CallerEdge->Callee != Node)
4238 continue;
4239 // If we found an unvisited caller, note that we should check the caller
4240 // edges again as mergeClones may add or change caller nodes.
4241 if (DoMergeIteration && !Visited.contains(CallerEdge->Caller))
4242 FoundUnvisited = true;
4243 mergeClones(CallerEdge->Caller, Visited, ContextIdToAllocationNode);
4244 }
4245 }
4246
4247 TotalMergeInvokes++;
4248 TotalMergeIters += Iters;
4249 if (Iters > MaxMergeIters)
4250 MaxMergeIters = Iters;
4251
4252 // Merge for this node after we handle its callers.
4253 mergeNodeCalleeClones(Node, Visited, ContextIdToAllocationNode);
4254}
4255
4256template <typename DerivedCCG, typename FuncTy, typename CallTy>
4257void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeNodeCalleeClones(
4258 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
4259 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
4260 // Ignore Node if we moved all of its contexts to clones.
4261 if (Node->emptyContextIds())
4262 return;
4263
4264 // First identify groups of clones among Node's callee edges, by building
4265 // a map from each callee base node to the associated callee edges from Node.
4266 MapVector<ContextNode *, std::vector<std::shared_ptr<ContextEdge>>>
4267 OrigNodeToCloneEdges;
4268 for (const auto &E : Node->CalleeEdges) {
4269 auto *Callee = E->Callee;
4270 if (!Callee->CloneOf && Callee->Clones.empty())
4271 continue;
4272 ContextNode *Base = Callee->getOrigNode();
4273 OrigNodeToCloneEdges[Base].push_back(E);
4274 }
4275
4276  // Helper for callee edge sorting below. Return true if A's callee has fewer
4277  // caller edges than B's callee; on a tie, if A's callee is a clone and B's is
4278  // not; and failing that, if A's first context id is smaller than B's.
4279 auto CalleeCallerEdgeLessThan = [](const std::shared_ptr<ContextEdge> &A,
4280 const std::shared_ptr<ContextEdge> &B) {
4281 if (A->Callee->CallerEdges.size() != B->Callee->CallerEdges.size())
4282 return A->Callee->CallerEdges.size() < B->Callee->CallerEdges.size();
4283 if (A->Callee->CloneOf && !B->Callee->CloneOf)
4284 return true;
4285 else if (!A->Callee->CloneOf && B->Callee->CloneOf)
4286 return false;
4287 // Use the first context id for each edge as a
4288 // tie-breaker.
4289 return *A->ContextIds.begin() < *B->ContextIds.begin();
4290 };
4291
4292 // Process each set of callee clones called by Node, performing the needed
4293 // merging.
4294 for (auto Entry : OrigNodeToCloneEdges) {
4295 // CalleeEdges is the set of edges from Node reaching callees that are
4296 // mutual clones of each other.
4297 auto &CalleeEdges = Entry.second;
4298 auto NumCalleeClones = CalleeEdges.size();
4299 // A single edge means there is no merging needed.
4300 if (NumCalleeClones == 1)
4301 continue;
4302 // Sort the CalleeEdges calling this group of clones in ascending order of
4303    // their caller edge counts, putting clone nodes ahead of the original
4304    // non-clone node in case of a tie. This simplifies finding an existing
4305    // node to use as the merge node.
4306 llvm::stable_sort(CalleeEdges, CalleeCallerEdgeLessThan);
4307
4308 /// Find other callers of the given set of callee edges that can
4309 /// share the same callee merge node. See the comments at this method
4310 /// definition for details.
4311 DenseSet<ContextNode *> OtherCallersToShareMerge;
4312 findOtherCallersToShareMerge(Node, CalleeEdges, ContextIdToAllocationNode,
4313 OtherCallersToShareMerge);
4314
4315 // Now do the actual merging. Identify existing or create a new MergeNode
4316 // during the first iteration. Move each callee over, along with edges from
4317 // other callers we've determined above can share the same merge node.
4318 ContextNode *MergeNode = nullptr;
4319 DenseMap<ContextNode *, unsigned> CallerToMoveCount;
4320 for (auto CalleeEdge : CalleeEdges) {
4321 auto *OrigCallee = CalleeEdge->Callee;
4322 // If we don't have a MergeNode yet (only happens on the first iteration,
4323 // as a new one will be created when we go to move the first callee edge
4324 // over as needed), see if we can use this callee.
4325 if (!MergeNode) {
4326 // If there are no other callers, simply use this callee.
4327 if (CalleeEdge->Callee->CallerEdges.size() == 1) {
4328 MergeNode = OrigCallee;
4329 NonNewMergedNodes++;
4330 continue;
4331 }
4332 // Otherwise, if we have identified other caller nodes that can share
4333 // the merge node with Node, see if all of OrigCallee's callers are
4334 // going to share the same merge node. In that case we can use callee
4335 // (since all of its callers would move to the new merge node).
4336 if (!OtherCallersToShareMerge.empty()) {
4337 bool MoveAllCallerEdges = true;
4338 for (auto CalleeCallerE : OrigCallee->CallerEdges) {
4339 if (CalleeCallerE == CalleeEdge)
4340 continue;
4341 if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller)) {
4342 MoveAllCallerEdges = false;
4343 break;
4344 }
4345 }
4346 // If we are going to move all callers over, we can use this callee as
4347 // the MergeNode.
4348 if (MoveAllCallerEdges) {
4349 MergeNode = OrigCallee;
4350 NonNewMergedNodes++;
4351 continue;
4352 }
4353 }
4354 }
4355 // Move this callee edge, creating a new merge node if necessary.
4356 if (MergeNode) {
4357 assert(MergeNode != OrigCallee);
4358 moveEdgeToExistingCalleeClone(CalleeEdge, MergeNode,
4359 /*NewClone*/ false);
4360 } else {
4361 MergeNode = moveEdgeToNewCalleeClone(CalleeEdge);
4362 NewMergedNodes++;
4363 }
4364 // Now move all identified edges from other callers over to the merge node
4365 // as well.
4366 if (!OtherCallersToShareMerge.empty()) {
4367 // Make and iterate over a copy of OrigCallee's caller edges because
4368 // some of these will be moved off of the OrigCallee and that would mess
4369 // up the iteration from OrigCallee.
4370 auto OrigCalleeCallerEdges = OrigCallee->CallerEdges;
4371 for (auto &CalleeCallerE : OrigCalleeCallerEdges) {
4372 if (CalleeCallerE == CalleeEdge)
4373 continue;
4374 if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller))
4375 continue;
4376 CallerToMoveCount[CalleeCallerE->Caller]++;
4377 moveEdgeToExistingCalleeClone(CalleeCallerE, MergeNode,
4378 /*NewClone*/ false);
4379 }
4380 }
4381 removeNoneTypeCalleeEdges(OrigCallee);
4382 removeNoneTypeCalleeEdges(MergeNode);
4383 }
4384 }
4385}
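To make the grouping and ordering above concrete, here is a small standalone sketch using simplified stand-in types (not the ContextNode/ContextEdge classes in this file): callee edges are bucketed by the callee's original node, and each bucket is ordered with fewest callers first, preferring a clone over the original on ties, with the first context id as a final tie-breaker.

#include <algorithm>
#include <map>
#include <memory>
#include <set>
#include <vector>

namespace merge_sketch {
struct Node;

struct Edge {
  Node *Caller = nullptr;
  Node *Callee = nullptr;
  std::set<unsigned> ContextIds; // Assumed non-empty, as in the code above.
};

struct Node {
  Node *CloneOf = nullptr;                        // Non-null for clone nodes.
  std::vector<Node *> Clones;                     // Clones of an original node.
  std::vector<std::shared_ptr<Edge>> CallerEdges;
  Node *getOrigNode() { return CloneOf ? CloneOf : this; }
};

// Bucket the given callee edges by the callee's original node, then order
// each bucket the way the merge step above considers merge-node candidates.
inline std::map<Node *, std::vector<std::shared_ptr<Edge>>>
groupAndSortCalleeEdges(const std::vector<std::shared_ptr<Edge>> &CalleeEdges) {
  std::map<Node *, std::vector<std::shared_ptr<Edge>>> Groups;
  for (const auto &E : CalleeEdges) {
    Node *Callee = E->Callee;
    // Callees not involved in any cloning never need merging.
    if (!Callee->CloneOf && Callee->Clones.empty())
      continue;
    Groups[Callee->getOrigNode()].push_back(E);
  }
  for (auto &KV : Groups)
    std::stable_sort(
        KV.second.begin(), KV.second.end(),
        [](const std::shared_ptr<Edge> &A, const std::shared_ptr<Edge> &B) {
          if (A->Callee->CallerEdges.size() != B->Callee->CallerEdges.size())
            return A->Callee->CallerEdges.size() <
                   B->Callee->CallerEdges.size();
          if (A->Callee->CloneOf && !B->Callee->CloneOf)
            return true;
          if (!A->Callee->CloneOf && B->Callee->CloneOf)
            return false;
          return *A->ContextIds.begin() < *B->ContextIds.begin();
        });
  return Groups;
}
} // namespace merge_sketch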
4386
4387// Look for other nodes that have edges to the same set of callee
4388// clones as the current Node. Those can share the eventual merge node
4389// (reducing cloning and binary size overhead) iff:
4390// - they have edges to the same set of callee clones
4391// - each callee edge reaches a subset of the same allocations as Node's
4392// corresponding edge to the same callee clone.
4393// The second requirement is to ensure that we don't undo any of the
4394// necessary cloning to distinguish contexts with different allocation
4395// behavior.
4396// FIXME: This is somewhat conservative, as we really just need to ensure
4397// that they don't reach the same allocations as contexts on edges from Node
4398// going to any of the *other* callee clones being merged. However, that
4399// requires more tracking and checking to get right.
4400template <typename DerivedCCG, typename FuncTy, typename CallTy>
4401void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
4402 findOtherCallersToShareMerge(
4403 ContextNode *Node,
4404 std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges,
4405 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode,
4406 DenseSet<ContextNode *> &OtherCallersToShareMerge) {
4407 auto NumCalleeClones = CalleeEdges.size();
4408  // For each other caller node, this map counts how many of the callee clones
4409  // in this group it also has edges to.
4410 DenseMap<ContextNode *, unsigned> OtherCallersToSharedCalleeEdgeCount;
4411 // Counts the number of other caller nodes that have edges to all callee
4412 // clones that don't violate the allocation context checking.
4413 unsigned PossibleOtherCallerNodes = 0;
4414
4415 // We only need to look at other Caller nodes if the first callee edge has
4416 // multiple callers (recall they are sorted in ascending order above).
4417 if (CalleeEdges[0]->Callee->CallerEdges.size() < 2)
4418 return;
4419
4420 // For each callee edge:
4421 // - Collect the count of other caller nodes calling the same callees.
4422 // - Collect the alloc nodes reached by contexts on each callee edge.
4423 DenseMap<ContextEdge *, DenseSet<ContextNode *>> CalleeEdgeToAllocNodes;
4424 for (auto CalleeEdge : CalleeEdges) {
4425 assert(CalleeEdge->Callee->CallerEdges.size() > 1);
4426 // For each other caller of the same callee, increment the count of
4427 // edges reaching the same callee clone.
4428 for (auto CalleeCallerEdges : CalleeEdge->Callee->CallerEdges) {
4429 if (CalleeCallerEdges->Caller == Node) {
4430 assert(CalleeCallerEdges == CalleeEdge);
4431 continue;
4432 }
4433 OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller]++;
4434 // If this caller edge now reaches all of the same callee clones,
4435 // increment the count of candidate other caller nodes.
4436 if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller] ==
4437 NumCalleeClones)
4438 PossibleOtherCallerNodes++;
4439 }
4440 // Collect the alloc nodes reached by contexts on each callee edge, for
4441 // later analysis.
4442 for (auto Id : CalleeEdge->getContextIds()) {
4443 auto *Alloc = ContextIdToAllocationNode.lookup(Id);
4444 if (!Alloc) {
4445 // FIXME: unclear why this happens occasionally, presumably
4446 // imperfect graph updates possibly with recursion.
4447 MissingAllocForContextId++;
4448 continue;
4449 }
4450 CalleeEdgeToAllocNodes[CalleeEdge.get()].insert(Alloc);
4451 }
4452 }
4453
4454 // Now walk the callee edges again, and make sure that for each candidate
4455 // caller node all of its edges to the callees reach the same allocs (or
4456 // a subset) as those along the corresponding callee edge from Node.
4457 for (auto CalleeEdge : CalleeEdges) {
4458 assert(CalleeEdge->Callee->CallerEdges.size() > 1);
4459 // Stop if we do not have any (more) candidate other caller nodes.
4460 if (!PossibleOtherCallerNodes)
4461 break;
4462 auto &CurCalleeAllocNodes = CalleeEdgeToAllocNodes[CalleeEdge.get()];
4463 // Check each other caller of this callee clone.
4464 for (auto &CalleeCallerE : CalleeEdge->Callee->CallerEdges) {
4465 // Not interested in the callee edge from Node itself.
4466 if (CalleeCallerE == CalleeEdge)
4467 continue;
4468 // Skip any callers that didn't have callee edges to all the same
4469 // callee clones.
4470 if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] !=
4471 NumCalleeClones)
4472 continue;
4473 // Make sure that each context along edge from candidate caller node
4474 // reaches an allocation also reached by this callee edge from Node.
4475 for (auto Id : CalleeCallerE->getContextIds()) {
4476 auto *Alloc = ContextIdToAllocationNode.lookup(Id);
4477 if (!Alloc)
4478 continue;
4479 // If not, simply reset the map entry to 0 so caller is ignored, and
4480 // reduce the count of candidate other caller nodes.
4481 if (!CurCalleeAllocNodes.contains(Alloc)) {
4482 OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] = 0;
4483 PossibleOtherCallerNodes--;
4484 break;
4485 }
4486 }
4487 }
4488 }
4489
4490 if (!PossibleOtherCallerNodes)
4491 return;
4492
4493 // Build the set of other caller nodes that can use the same callee merge
4494 // node.
4495 for (auto &[OtherCaller, Count] : OtherCallersToSharedCalleeEdgeCount) {
4496 if (Count != NumCalleeClones)
4497 continue;
4498 OtherCallersToShareMerge.insert(OtherCaller);
4499 }
4500}
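The candidate screening above is at heart a counting scheme: an "other" caller qualifies only if it has an edge to every one of the callee clones being merged. A simplified standalone sketch (stand-in types, and deliberately omitting the per-context allocation-subset check performed above):

#include <cstddef>
#include <map>
#include <vector>

namespace share_sketch {
struct Node;

struct Edge {
  Node *Caller = nullptr;
  Node *Callee = nullptr;
};

struct Node {
  std::vector<Edge *> CallerEdges;
};

// Return the callers, other than Self, that have an edge to every callee in
// CalleeEdges: count each caller's edges into the group and require the count
// to reach the number of callee clones.
inline std::vector<Node *>
callersReachingAllCallees(Node *Self, const std::vector<Edge *> &CalleeEdges) {
  std::map<Node *, std::size_t> SharedEdgeCount;
  for (Edge *CE : CalleeEdges)
    for (Edge *CallerE : CE->Callee->CallerEdges)
      if (CallerE->Caller != Self)
        ++SharedEdgeCount[CallerE->Caller];
  std::vector<Node *> Result;
  for (const auto &KV : SharedEdgeCount)
    if (KV.second == CalleeEdges.size())
      Result.push_back(KV.first);
  return Result;
}
} // namespace share_sketch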
4501
4502// This method assigns cloned callsites to functions, cloning the functions as
4503// needed. The assignment is greedy and proceeds roughly as follows:
4504//
4505// For each function Func:
4506// For each call with graph Node having clones:
4507// Initialize ClonesWorklist to Node and its clones
4508// Initialize NodeCloneCount to 0
4509// While ClonesWorklist is not empty:
4510// Clone = pop front ClonesWorklist
4511// NodeCloneCount++
4512// If Func has been cloned less than NodeCloneCount times:
4513// If NodeCloneCount is 1:
4514// Assign Clone to original Func
4515// Continue
4516// Create a new function clone
4517// If other callers not assigned to call a function clone yet:
4518// Assign them to call new function clone
4519// Continue
4520// Assign any other caller calling the cloned version to new clone
4521//
4522// For each caller of Clone:
4523// If caller is assigned to call a specific function clone:
4524// If we cannot assign Clone to that function clone:
4525// Create new callsite Clone NewClone
4526// Add NewClone to ClonesWorklist
4527// Continue
4528// Assign Clone to existing caller's called function clone
4529// Else:
4530// If Clone not already assigned to a function clone:
4531// Assign to first function clone without assignment
4532// Assign caller to selected function clone
4533// For each call with graph Node having clones:
4534// If number func clones > number call's callsite Node clones:
4535// Record func CallInfo clones without Node clone in UnassignedCallClones
4536// For callsite Nodes in DFS order from allocations:
4537// If IsAllocation:
4538// Update allocation with alloc type
4539// Else:
4540// For Call, all MatchingCalls, and associated UnassignedCallClones:
4541// Update call to call recorded callee clone
4542//
4543template <typename DerivedCCG, typename FuncTy, typename CallTy>
4544bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
4545 bool Changed = false;
4546
4547 mergeClones();
4548
4549 // Keep track of the assignment of nodes (callsites) to function clones they
4550 // call.
4551 DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
4552
4553 // Update caller node to call function version CalleeFunc, by recording the
4554 // assignment in CallsiteToCalleeFuncCloneMap.
4555 auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
4556 const FuncInfo &CalleeFunc) {
4557 assert(Caller->hasCall());
4558 CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
4559 };
4560
4561 // Information for a single clone of this Func.
4562 struct FuncCloneInfo {
4563 // The function clone.
4564 FuncInfo FuncClone;
4565 // Remappings of each call of interest (from original uncloned call to the
4566 // corresponding cloned call in this function clone).
4567 DenseMap<CallInfo, CallInfo> CallMap;
4568 };
4569
4570 // Map to keep track of information needed to update calls in function clones
4571 // when their corresponding callsite node was not itself cloned for that
4572 // function clone. Because of call context pruning (i.e. we only keep as much
4573 // caller information as needed to distinguish hot vs cold), we may not have
4574 // caller edges coming to each callsite node from all possible function
4575 // callers. A function clone may get created for other callsites in the
4576 // function for which there are caller edges that were not pruned. Any other
4577// callsites in that function clone, which were not themselves cloned for
4578 // that function clone, should get updated the same way as the corresponding
4579 // callsite in the original function (which may call a clone of its callee).
4580 //
4581 // We build this map after completing function cloning for each function, so
4582 // that we can record the information from its call maps before they are
4583 // destructed. The map will be used as we update calls to update any still
4584 // unassigned call clones. Note that we may create new node clones as we clone
4585 // other functions, so later on we check which node clones were still not
4586 // created. To this end, the inner map is a map from function clone number to
4587 // the list of calls cloned for that function (can be more than one due to the
4588 // Node's MatchingCalls array).
4589 //
4590 // The alternative is creating new callsite clone nodes below as we clone the
4591// function, but that is trickier to get right and likely adds more overhead.
4592 //
4593 // Inner map is a std::map so sorted by key (clone number), in order to get
4594 // ordered remarks in the full LTO case.
4595 DenseMap<const ContextNode *, std::map<unsigned, SmallVector<CallInfo, 0>>>
4596 UnassignedCallClones;
4597
4598 // Walk all functions for which we saw calls with memprof metadata, and handle
4599 // cloning for each of its calls.
4600 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
4601 FuncInfo OrigFunc(Func);
4602 // Map from each clone number of OrigFunc to information about that function
4603 // clone (the function clone FuncInfo and call remappings). The index into
4604 // the vector is the clone number, as function clones are created and
4605 // numbered sequentially.
4606 std::vector<FuncCloneInfo> FuncCloneInfos;
4607 for (auto &Call : CallsWithMetadata) {
4608 ContextNode *Node = getNodeForInst(Call);
4609 // Skip call if we do not have a node for it (all uses of its stack ids
4610 // were either on inlined chains or pruned from the MIBs), or if we did
4611 // not create any clones for it.
4612 if (!Node || Node->Clones.empty())
4613 continue;
4614 assert(Node->hasCall() &&
4615 "Not having a call should have prevented cloning");
4616
4617 // Track the assignment of function clones to clones of the current
4618 // callsite Node being handled.
4619 std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
4620
4621 // Assign callsite version CallsiteClone to function version FuncClone,
4622 // and also assign (possibly cloned) Call to CallsiteClone.
4623 auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
4624 CallInfo &Call,
4625 ContextNode *CallsiteClone,
4626 bool IsAlloc) {
4627 // Record the clone of callsite node assigned to this function clone.
4628 FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
4629
4630 assert(FuncCloneInfos.size() > FuncClone.cloneNo());
4631 DenseMap<CallInfo, CallInfo> &CallMap =
4632 FuncCloneInfos[FuncClone.cloneNo()].CallMap;
4633 CallInfo CallClone(Call);
4634 if (auto It = CallMap.find(Call); It != CallMap.end())
4635 CallClone = It->second;
4636 CallsiteClone->setCall(CallClone);
4637 // Need to do the same for all matching calls.
4638 for (auto &MatchingCall : Node->MatchingCalls) {
4639 CallInfo CallClone(MatchingCall);
4640 if (auto It = CallMap.find(MatchingCall); It != CallMap.end())
4641 CallClone = It->second;
4642 // Updates the call in the list.
4643 MatchingCall = CallClone;
4644 }
4645 };
4646
4647 // Invokes moveEdgeToNewCalleeClone which creates a new clone, and then
4648 // performs the necessary fixups (removing none type edges, and
4649 // importantly, propagating any function call assignment of the original
4650 // node to the new clone).
4651 auto MoveEdgeToNewCalleeCloneAndSetUp =
4652 [&](const std::shared_ptr<ContextEdge> &Edge) {
4653 ContextNode *OrigCallee = Edge->Callee;
4654 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge);
4655 removeNoneTypeCalleeEdges(NewClone);
4656 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
4657 // If the original Callee was already assigned to call a specific
4658 // function version, make sure its new clone is assigned to call
4659 // that same function clone.
4660 if (CallsiteToCalleeFuncCloneMap.count(OrigCallee))
4661 RecordCalleeFuncOfCallsite(
4662 NewClone, CallsiteToCalleeFuncCloneMap[OrigCallee]);
4663 return NewClone;
4664 };
4665
4666 // Keep track of the clones of callsite Node that need to be assigned to
4667 // function clones. This list may be expanded in the loop body below if we
4668 // find additional cloning is required.
4669 std::deque<ContextNode *> ClonesWorklist;
4670 // Ignore original Node if we moved all of its contexts to clones.
4671 if (!Node->emptyContextIds())
4672 ClonesWorklist.push_back(Node);
4673 llvm::append_range(ClonesWorklist, Node->Clones);
4674
4675 // Now walk through all of the clones of this callsite Node that we need,
4676 // and determine the assignment to a corresponding clone of the current
4677 // function (creating new function clones as needed).
4678 unsigned NodeCloneCount = 0;
4679 while (!ClonesWorklist.empty()) {
4680 ContextNode *Clone = ClonesWorklist.front();
4681 ClonesWorklist.pop_front();
4682 NodeCloneCount++;
4683 if (VerifyNodes)
4684          checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
4685
4686 // Need to create a new function clone if we have more callsite clones
4687 // than existing function clones, which would have been assigned to an
4688 // earlier clone in the list (we assign callsite clones to function
4689 // clones greedily).
4690 if (FuncCloneInfos.size() < NodeCloneCount) {
4691 // If this is the first callsite copy, assign to original function.
4692 if (NodeCloneCount == 1) {
4693 // Since FuncCloneInfos is empty in this case, no clones have
4694 // been created for this function yet, and no callers should have
4695 // been assigned a function clone for this callee node yet.
4696            assert(llvm::none_of(
4697                Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
4698 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
4699 }));
4700 // Initialize with empty call map, assign Clone to original function
4701 // and its callers, and skip to the next clone.
4702 FuncCloneInfos.push_back(
4703 {OrigFunc, DenseMap<CallInfo, CallInfo>()});
4704 AssignCallsiteCloneToFuncClone(
4705 OrigFunc, Call, Clone,
4706 AllocationCallToContextNodeMap.count(Call));
4707 for (auto &CE : Clone->CallerEdges) {
4708 // Ignore any caller that does not have a recorded callsite Call.
4709 if (!CE->Caller->hasCall())
4710 continue;
4711 RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
4712 }
4713 continue;
4714 }
4715
4716 // First locate which copy of OrigFunc to clone again. If a caller
4717 // of this callsite clone was already assigned to call a particular
4718 // function clone, we need to redirect all of those callers to the
4719 // new function clone, and update their other callees within this
4720 // function.
4721 FuncInfo PreviousAssignedFuncClone;
4722 auto EI = llvm::find_if(
4723 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
4724 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
4725 });
4726 bool CallerAssignedToCloneOfFunc = false;
4727 if (EI != Clone->CallerEdges.end()) {
4728 const std::shared_ptr<ContextEdge> &Edge = *EI;
4729 PreviousAssignedFuncClone =
4730 CallsiteToCalleeFuncCloneMap[Edge->Caller];
4731 CallerAssignedToCloneOfFunc = true;
4732 }
4733
4734 // Clone function and save it along with the CallInfo map created
4735 // during cloning in the FuncCloneInfos.
4736 DenseMap<CallInfo, CallInfo> NewCallMap;
4737 unsigned CloneNo = FuncCloneInfos.size();
4738 assert(CloneNo > 0 && "Clone 0 is the original function, which "
4739 "should already exist in the map");
4740 FuncInfo NewFuncClone = cloneFunctionForCallsite(
4741 OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
4742 FuncCloneInfos.push_back({NewFuncClone, std::move(NewCallMap)});
4743 FunctionClonesAnalysis++;
4744 Changed = true;
4745
4746 // If no caller callsites were already assigned to a clone of this
4747 // function, we can simply assign this clone to the new func clone
4748 // and update all callers to it, then skip to the next clone.
4749 if (!CallerAssignedToCloneOfFunc) {
4750 AssignCallsiteCloneToFuncClone(
4751 NewFuncClone, Call, Clone,
4752 AllocationCallToContextNodeMap.count(Call));
4753 for (auto &CE : Clone->CallerEdges) {
4754 // Ignore any caller that does not have a recorded callsite Call.
4755 if (!CE->Caller->hasCall())
4756 continue;
4757 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
4758 }
4759 continue;
4760 }
4761
4762 // We may need to do additional node cloning in this case.
4763 // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
4764 // that were previously assigned to call PreviousAssignedFuncClone,
4765 // to record that they now call NewFuncClone.
4766 // The none type edge removal may remove some of this Clone's caller
4767 // edges, if it is reached via another of its caller's callees.
4768 // Iterate over a copy and skip any that were removed.
4769 auto CallerEdges = Clone->CallerEdges;
4770 for (auto CE : CallerEdges) {
4771 // Skip any that have been removed on an earlier iteration.
4772 if (CE->isRemoved()) {
4773 assert(!is_contained(Clone->CallerEdges, CE));
4774 continue;
4775 }
4776 assert(CE);
4777 // Ignore any caller that does not have a recorded callsite Call.
4778 if (!CE->Caller->hasCall())
4779 continue;
4780
4781 if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
4782 // We subsequently fall through to later handling that
4783 // will perform any additional cloning required for
4784 // callers that were calling other function clones.
4785 CallsiteToCalleeFuncCloneMap[CE->Caller] !=
4786 PreviousAssignedFuncClone)
4787 continue;
4788
4789 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
4790
4791 // If we are cloning a function that was already assigned to some
4792 // callers, then essentially we are creating new callsite clones
4793 // of the other callsites in that function that are reached by those
4794 // callers. Clone the other callees of the current callsite's caller
4795 // that were already assigned to PreviousAssignedFuncClone
4796 // accordingly. This is important since we subsequently update the
4797 // calls from the nodes in the graph and their assignments to callee
4798 // functions recorded in CallsiteToCalleeFuncCloneMap.
4799 // The none type edge removal may remove some of this caller's
4800 // callee edges, if it is reached via another of its callees.
4801 // Iterate over a copy and skip any that were removed.
4802 auto CalleeEdges = CE->Caller->CalleeEdges;
4803 for (auto CalleeEdge : CalleeEdges) {
4804 // Skip any that have been removed on an earlier iteration when
4805 // cleaning up newly None type callee edges.
4806 if (CalleeEdge->isRemoved()) {
4807 assert(!is_contained(CE->Caller->CalleeEdges, CalleeEdge));
4808 continue;
4809 }
4810 assert(CalleeEdge);
4811 ContextNode *Callee = CalleeEdge->Callee;
4812            // Skip the current callsite (we are looking for other
4813            // callsites Caller calls), as well as any callee that does not
4814            // have a recorded callsite Call.
4815 if (Callee == Clone || !Callee->hasCall())
4816 continue;
4817 // Skip direct recursive calls. We don't need/want to clone the
4818 // caller node again, and this loop will not behave as expected if
4819 // we tried.
4820 if (Callee == CalleeEdge->Caller)
4821 continue;
4822 ContextNode *NewClone =
4823 MoveEdgeToNewCalleeCloneAndSetUp(CalleeEdge);
4824 // Moving the edge may have resulted in some none type
4825 // callee edges on the original Callee.
4826 removeNoneTypeCalleeEdges(Callee);
4827 // Update NewClone with the new Call clone of this callsite's Call
4828 // created for the new function clone created earlier.
4829 // Recall that we have already ensured when building the graph
4830 // that each caller can only call callsites within the same
4831 // function, so we are guaranteed that Callee Call is in the
4832 // current OrigFunc.
4833 // CallMap is set up as indexed by original Call at clone 0.
4834 CallInfo OrigCall(Callee->getOrigNode()->Call);
4835 OrigCall.setCloneNo(0);
4836 DenseMap<CallInfo, CallInfo> &CallMap =
4837 FuncCloneInfos[NewFuncClone.cloneNo()].CallMap;
4838 assert(CallMap.count(OrigCall));
4839 CallInfo NewCall(CallMap[OrigCall]);
4840 assert(NewCall);
4841 NewClone->setCall(NewCall);
4842 // Need to do the same for all matching calls.
4843 for (auto &MatchingCall : NewClone->MatchingCalls) {
4844 CallInfo OrigMatchingCall(MatchingCall);
4845 OrigMatchingCall.setCloneNo(0);
4846 assert(CallMap.count(OrigMatchingCall));
4847 CallInfo NewCall(CallMap[OrigMatchingCall]);
4848 assert(NewCall);
4849 // Updates the call in the list.
4850 MatchingCall = NewCall;
4851 }
4852 }
4853 }
4854 // Fall through to handling below to perform the recording of the
4855 // function for this callsite clone. This enables handling of cases
4856 // where the callers were assigned to different clones of a function.
4857 }
4858
4859 auto FindFirstAvailFuncClone = [&]() {
4860 // Find first function in FuncCloneInfos without an assigned
4861 // clone of this callsite Node. We should always have one
4862 // available at this point due to the earlier cloning when the
4863 // FuncCloneInfos size was smaller than the clone number.
4864 for (auto &CF : FuncCloneInfos) {
4865 if (!FuncCloneToCurNodeCloneMap.count(CF.FuncClone))
4866 return CF.FuncClone;
4867 }
4868        llvm_unreachable(
4869            "Expected an available func clone for this callsite clone");
4870 };
4871
4872 // See if we can use existing function clone. Walk through
4873 // all caller edges to see if any have already been assigned to
4874 // a clone of this callsite's function. If we can use it, do so. If not,
4875 // because that function clone is already assigned to a different clone
4876 // of this callsite, then we need to clone again.
4877 // Basically, this checking is needed to handle the case where different
4878 // caller functions/callsites may need versions of this function
4879 // containing different mixes of callsite clones across the different
4880 // callsites within the function. If that happens, we need to create
4881 // additional function clones to handle the various combinations.
4882 //
4883 // Keep track of any new clones of this callsite created by the
4884 // following loop, as well as any existing clone that we decided to
4885 // assign this clone to.
4886 std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
4887 FuncInfo FuncCloneAssignedToCurCallsiteClone;
4888 // Iterate over a copy of Clone's caller edges, since we may need to
4889 // remove edges in the moveEdgeTo* methods, and this simplifies the
4890 // handling and makes it less error-prone.
4891 auto CloneCallerEdges = Clone->CallerEdges;
4892 for (auto &Edge : CloneCallerEdges) {
4893 // Skip removed edges (due to direct recursive edges updated when
4894 // updating callee edges when moving an edge and subsequently
4895 // removed by call to removeNoneTypeCalleeEdges on the Clone).
4896 if (Edge->isRemoved())
4897 continue;
4898 // Ignore any caller that does not have a recorded callsite Call.
4899 if (!Edge->Caller->hasCall())
4900 continue;
4901 // If this caller already assigned to call a version of OrigFunc, need
4902 // to ensure we can assign this callsite clone to that function clone.
4903 if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
4904 FuncInfo FuncCloneCalledByCaller =
4905 CallsiteToCalleeFuncCloneMap[Edge->Caller];
4906 // First we need to confirm that this function clone is available
4907 // for use by this callsite node clone.
4908 //
4909 // While FuncCloneToCurNodeCloneMap is built only for this Node and
4910 // its callsite clones, one of those callsite clones X could have
4911 // been assigned to the same function clone called by Edge's caller
4912 // - if Edge's caller calls another callsite within Node's original
4913 // function, and that callsite has another caller reaching clone X.
4914 // We need to clone Node again in this case.
4915 if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
4916 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
4917 Clone) ||
4918 // Detect when we have multiple callers of this callsite that
4919 // have already been assigned to specific, and different, clones
4920 // of OrigFunc (due to other unrelated callsites in Func they
4921 // reach via call contexts). Is this Clone of callsite Node
4922 // assigned to a different clone of OrigFunc? If so, clone Node
4923 // again.
4924 (FuncCloneAssignedToCurCallsiteClone &&
4925 FuncCloneAssignedToCurCallsiteClone !=
4926 FuncCloneCalledByCaller)) {
4927 // We need to use a different newly created callsite clone, in
4928 // order to assign it to another new function clone on a
4929 // subsequent iteration over the Clones array (adjusted below).
4930 // Note we specifically do not reset the
4931 // CallsiteToCalleeFuncCloneMap entry for this caller, so that
4932 // when this new clone is processed later we know which version of
4933 // the function to copy (so that other callsite clones we have
4934 // assigned to that function clone are properly cloned over). See
4935 // comments in the function cloning handling earlier.
4936
4937 // Check if we already have cloned this callsite again while
4938 // walking through caller edges, for a caller calling the same
4939 // function clone. If so, we can move this edge to that new clone
4940 // rather than creating yet another new clone.
4941 if (FuncCloneToNewCallsiteCloneMap.count(
4942 FuncCloneCalledByCaller)) {
4943 ContextNode *NewClone =
4944 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
4945 moveEdgeToExistingCalleeClone(Edge, NewClone);
4946 // Cleanup any none type edges cloned over.
4947 removeNoneTypeCalleeEdges(NewClone);
4948 } else {
4949 // Create a new callsite clone.
4950 ContextNode *NewClone = MoveEdgeToNewCalleeCloneAndSetUp(Edge);
4951 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
4952 NewClone;
4953 // Add to list of clones and process later.
4954 ClonesWorklist.push_back(NewClone);
4955 }
4956 // Moving the caller edge may have resulted in some none type
4957 // callee edges.
4958 removeNoneTypeCalleeEdges(Clone);
4959 // We will handle the newly created callsite clone in a subsequent
4960 // iteration over this Node's Clones.
4961 continue;
4962 }
4963
4964 // Otherwise, we can use the function clone already assigned to this
4965 // caller.
4966 if (!FuncCloneAssignedToCurCallsiteClone) {
4967 FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
4968 // Assign Clone to FuncCloneCalledByCaller
4969 AssignCallsiteCloneToFuncClone(
4970 FuncCloneCalledByCaller, Call, Clone,
4971 AllocationCallToContextNodeMap.count(Call));
4972 } else
4973 // Don't need to do anything - callsite is already calling this
4974 // function clone.
4975 assert(FuncCloneAssignedToCurCallsiteClone ==
4976 FuncCloneCalledByCaller);
4977
4978 } else {
4979 // We have not already assigned this caller to a version of
4980 // OrigFunc. Do the assignment now.
4981
4982 // First check if we have already assigned this callsite clone to a
4983 // clone of OrigFunc for another caller during this iteration over
4984 // its caller edges.
4985 if (!FuncCloneAssignedToCurCallsiteClone) {
4986 FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
4987 assert(FuncCloneAssignedToCurCallsiteClone);
4988 // Assign Clone to FuncCloneAssignedToCurCallsiteClone
4989 AssignCallsiteCloneToFuncClone(
4990 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
4991 AllocationCallToContextNodeMap.count(Call));
4992 } else
4993 assert(FuncCloneToCurNodeCloneMap
4994 [FuncCloneAssignedToCurCallsiteClone] == Clone);
4995 // Update callers to record function version called.
4996 RecordCalleeFuncOfCallsite(Edge->Caller,
4997 FuncCloneAssignedToCurCallsiteClone);
4998 }
4999 }
5000 // If we didn't assign a function clone to this callsite clone yet, e.g.
5001 // none of its callers has a non-null call, do the assignment here.
5002 // We want to ensure that every callsite clone is assigned to some
5003 // function clone, so that the call updates below work as expected.
5004 // In particular if this is the original callsite, we want to ensure it
5005 // is assigned to the original function, otherwise the original function
5006 // will appear available for assignment to other callsite clones,
5007 // leading to unintended effects. For one, the unknown and not updated
5008 // callers will call into cloned paths leading to the wrong hints,
5009 // because they still call the original function (clone 0). Also,
5010 // because all callsites start out as being clone 0 by default, we can't
5011 // easily distinguish between callsites explicitly assigned to clone 0
5012 // vs those never assigned, which can lead to multiple updates of the
5013 // calls when invoking updateCall below, with mismatched clone values.
5014 // TODO: Add a flag to the callsite nodes or some other mechanism to
5015 // better distinguish and identify callsite clones that are not getting
5016 // assigned to function clones as expected.
5017 if (!FuncCloneAssignedToCurCallsiteClone) {
5018 FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
5019 assert(FuncCloneAssignedToCurCallsiteClone &&
5020 "No available func clone for this callsite clone");
5021 AssignCallsiteCloneToFuncClone(
5022 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
5023 /*IsAlloc=*/AllocationCallToContextNodeMap.contains(Call));
5024 }
5025 }
5026      if (VerifyCCG) {
5027        checkNode<DerivedCCG, FuncTy, CallTy>(Node);
5028        for (const auto &PE : Node->CalleeEdges)
5029          checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
5030        for (const auto &CE : Node->CallerEdges)
5031          checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
5032        for (auto *Clone : Node->Clones) {
5033          checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
5034          for (const auto &PE : Clone->CalleeEdges)
5035            checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
5036          for (const auto &CE : Clone->CallerEdges)
5037            checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
5038        }
5039      }
5040 }
5041
5042 if (FuncCloneInfos.size() < 2)
5043 continue;
5044
5045 // In this case there is more than just the original function copy.
5046 // Record call clones of any callsite nodes in the function that did not
5047 // themselves get cloned for all of the function clones.
5048 for (auto &Call : CallsWithMetadata) {
5049 ContextNode *Node = getNodeForInst(Call);
5050 if (!Node || !Node->hasCall() || Node->emptyContextIds())
5051 continue;
5052 // If Node has enough clones already to cover all function clones, we can
5053 // skip it. Need to add one for the original copy.
5054 // Use >= in case there were clones that were skipped due to having empty
5055 // context ids
5056 if (Node->Clones.size() + 1 >= FuncCloneInfos.size())
5057 continue;
5058 // First collect all function clones we cloned this callsite node for.
5059      // They may not be sequential, e.g. due to clones with empty context ids.
5060 DenseSet<unsigned> NodeCallClones;
5061 for (auto *C : Node->Clones)
5062 NodeCallClones.insert(C->Call.cloneNo());
5063 unsigned I = 0;
5064 // Now check all the function clones.
5065 for (auto &FC : FuncCloneInfos) {
5066 // Function clones should be sequential.
5067 assert(FC.FuncClone.cloneNo() == I);
5068 // Skip the first clone which got the original call.
5069 // Also skip any other clones created for this Node.
5070 if (++I == 1 || NodeCallClones.contains(I)) {
5071 continue;
5072 }
5073 // Record the call clones created for this callsite in this function
5074 // clone.
5075 auto &CallVector = UnassignedCallClones[Node][I];
5076 DenseMap<CallInfo, CallInfo> &CallMap = FC.CallMap;
5077 if (auto It = CallMap.find(Call); It != CallMap.end()) {
5078 CallInfo CallClone = It->second;
5079 CallVector.push_back(CallClone);
5080 } else {
5081 // All but the original clone (skipped earlier) should have an entry
5082 // for all calls.
5083 assert(false && "Expected to find call in CallMap");
5084 }
5085 // Need to do the same for all matching calls.
5086 for (auto &MatchingCall : Node->MatchingCalls) {
5087 if (auto It = CallMap.find(MatchingCall); It != CallMap.end()) {
5088 CallInfo CallClone = It->second;
5089 CallVector.push_back(CallClone);
5090 } else {
5091 // All but the original clone (skipped earlier) should have an entry
5092 // for all calls.
5093 assert(false && "Expected to find call in CallMap");
5094 }
5095 }
5096 }
5097 }
5098 }
5099
5100 uint8_t BothTypes =
5101 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
5102
5103 auto UpdateCalls = [&](ContextNode *Node,
5104 DenseSet<const ContextNode *> &Visited,
5105 auto &&UpdateCalls) {
5106 auto Inserted = Visited.insert(Node);
5107 if (!Inserted.second)
5108 return;
5109
5110 for (auto *Clone : Node->Clones)
5111 UpdateCalls(Clone, Visited, UpdateCalls);
5112
5113 for (auto &Edge : Node->CallerEdges)
5114 UpdateCalls(Edge->Caller, Visited, UpdateCalls);
5115
5116 // Skip if either no call to update, or if we ended up with no context ids
5117 // (we moved all edges onto other clones).
5118 if (!Node->hasCall() || Node->emptyContextIds())
5119 return;
5120
5121 if (Node->IsAllocation) {
5122 auto AT = allocTypeToUse(Node->AllocTypes);
5123 // If the allocation type is ambiguous, and more aggressive hinting
5124 // has been enabled via the MinClonedColdBytePercent flag, see if this
5125 // allocation should be hinted cold anyway because its fraction cold bytes
5126 // allocated is at least the given threshold.
5127 if (Node->AllocTypes == BothTypes && MinClonedColdBytePercent < 100 &&
5128 !ContextIdToContextSizeInfos.empty()) {
5129 uint64_t TotalCold = 0;
5130 uint64_t Total = 0;
5131 for (auto Id : Node->getContextIds()) {
5132 auto TypeI = ContextIdToAllocationType.find(Id);
5133 assert(TypeI != ContextIdToAllocationType.end());
5134 auto CSI = ContextIdToContextSizeInfos.find(Id);
5135 if (CSI != ContextIdToContextSizeInfos.end()) {
5136 for (auto &Info : CSI->second) {
5137 Total += Info.TotalSize;
5138 if (TypeI->second == AllocationType::Cold)
5139 TotalCold += Info.TotalSize;
5140 }
5141 }
5142 }
5143 if (TotalCold * 100 >= Total * MinClonedColdBytePercent)
5144 AT = AllocationType::Cold;
5145 }
5146 updateAllocationCall(Node->Call, AT);
5147 assert(Node->MatchingCalls.empty());
5148 return;
5149 }
5150
5151 if (!CallsiteToCalleeFuncCloneMap.count(Node))
5152 return;
5153
5154 auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
5155 updateCall(Node->Call, CalleeFunc);
5156 // Update all the matching calls as well.
5157 for (auto &Call : Node->MatchingCalls)
5158 updateCall(Call, CalleeFunc);
5159
5160 // Now update all calls recorded earlier that are still in function clones
5161 // which don't have a clone of this callsite node.
5162 if (!UnassignedCallClones.contains(Node))
5163 return;
5164 DenseSet<unsigned> NodeCallClones;
5165 for (auto *C : Node->Clones)
5166 NodeCallClones.insert(C->Call.cloneNo());
5167 // Note that we already confirmed Node is in this map a few lines above.
5168 auto &ClonedCalls = UnassignedCallClones[Node];
5169 for (auto &[CloneNo, CallVector] : ClonedCalls) {
5170 // Should start at 1 as we never create an entry for original node.
5171 assert(CloneNo > 0);
5172 // If we subsequently created a clone, skip this one.
5173 if (NodeCallClones.contains(CloneNo))
5174 continue;
5175 // Use the original Node's CalleeFunc.
5176 for (auto &Call : CallVector)
5177 updateCall(Call, CalleeFunc);
5178 }
5179 };
5180
5181 // Performs DFS traversal starting from allocation nodes to update calls to
5182 // reflect cloning decisions recorded earlier. For regular LTO this will
5183 // update the actual calls in the IR to call the appropriate function clone
5184 // (and add attributes to allocation calls), whereas for ThinLTO the decisions
5185 // are recorded in the summary entries.
5186 DenseSet<const ContextNode *> Visited;
5187 for (auto &Entry : AllocationCallToContextNodeMap)
5188 UpdateCalls(Entry.second, Visited, UpdateCalls);
5189
5190 return Changed;
5191}
5192
5193// Compute a SHA1 hash of the callsite and alloc version information of clone I
5194// in the summary, to use in detection of duplicate clones.
5195static uint64_t ComputeHash(const FunctionSummary *FS, unsigned I) {
5196  SHA1 Hasher;
5197 // Update hash with any callsites that call non-default (non-zero) callee
5198 // versions.
5199 for (auto &SN : FS->callsites()) {
5200 // In theory all callsites and allocs in this function should have the same
5201 // number of clone entries, but handle any discrepancies gracefully below
5202 // for NDEBUG builds.
5203 assert(
5204 SN.Clones.size() > I &&
5205 "Callsite summary has fewer entries than other summaries in function");
5206 if (SN.Clones.size() <= I || !SN.Clones[I])
5207 continue;
5208 uint8_t Data[sizeof(SN.Clones[I])];
5209 support::endian::write32le(Data, SN.Clones[I]);
5210 Hasher.update(Data);
5211 }
5212 // Update hash with any allocs that have non-default (non-None) hints.
5213 for (auto &AN : FS->allocs()) {
5214 // In theory all callsites and allocs in this function should have the same
5215 // number of clone entries, but handle any discrepancies gracefully below
5216 // for NDEBUG builds.
5217 assert(AN.Versions.size() > I &&
5218 "Alloc summary has fewer entries than other summaries in function");
5219 if (AN.Versions.size() <= I ||
5220 (AllocationType)AN.Versions[I] == AllocationType::None)
5221 continue;
5222 Hasher.update(ArrayRef<uint8_t>(&AN.Versions[I], 1));
5223 }
5224 return support::endian::read64le(Hasher.result().data());
5225}
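The duplicate-detection key above is the leading 64 bits of a SHA1 digest over the non-default per-clone decisions. A standalone sketch of the same idea, assuming the decisions have already been flattened into plain arrays (the helper name and parameters are illustrative, not the FunctionSummary accessors used above):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/SHA1.h"
#include <cstdint>

// Illustrative only: hash the non-default per-clone decisions for one clone.
static uint64_t hashCloneDecisions(llvm::ArrayRef<uint32_t> CalleeCloneNos,
                                   llvm::ArrayRef<uint8_t> AllocHints) {
  llvm::SHA1 Hasher;
  for (uint32_t C : CalleeCloneNos) {
    if (!C)
      continue; // Calling the original callee does not distinguish clones.
    uint8_t Data[sizeof(C)];
    llvm::support::endian::write32le(Data, C);
    Hasher.update(Data);
  }
  for (uint8_t H : AllocHints) {
    if (!H)
      continue; // AllocationType::None does not distinguish clones.
    Hasher.update(llvm::ArrayRef<uint8_t>(&H, 1));
  }
  return llvm::support::endian::read64le(Hasher.result().data());
}

Two clones whose keys compare equal can then share one instantiated body, which is how the duplicate detection below avoids emitting redundant clone definitions.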
5226
5227static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
5228    Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
5229    std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
5230        &FuncToAliasMap,
5231 FunctionSummary *FS) {
5232 auto TakeDeclNameAndReplace = [](GlobalValue *DeclGV, GlobalValue *NewGV) {
5233    // We might have created this when adjusting a callsite in another
5234 // function. It should be a declaration.
5235 assert(DeclGV->isDeclaration());
5236 NewGV->takeName(DeclGV);
5237 DeclGV->replaceAllUsesWith(NewGV);
5238 DeclGV->eraseFromParent();
5239 };
5240
5241 // Handle aliases to this function, and create analogous alias clones to the
5242 // provided clone of this function.
5243 auto CloneFuncAliases = [&](Function *NewF, unsigned I) {
5244 if (!FuncToAliasMap.count(&F))
5245 return;
5246 for (auto *A : FuncToAliasMap[&F]) {
5247 std::string AliasName = getMemProfFuncName(A->getName(), I);
5248 auto *PrevA = M.getNamedAlias(AliasName);
5249 auto *NewA = GlobalAlias::create(A->getValueType(),
5250 A->getType()->getPointerAddressSpace(),
5251 A->getLinkage(), AliasName, NewF);
5252 NewA->copyAttributesFrom(A);
5253 if (PrevA)
5254 TakeDeclNameAndReplace(PrevA, NewA);
5255 }
5256 };
5257
5258 // The first "clone" is the original copy, we should only call this if we
5259 // needed to create new clones.
5260 assert(NumClones > 1);
5261  SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
5262  VMaps.reserve(NumClones - 1);
5263 FunctionsClonedThinBackend++;
5264
5265 // Map of hash of callsite/alloc versions to the instantiated function clone
5266 // (possibly the original) implementing those calls. Used to avoid
5267 // instantiating duplicate function clones.
5268 // FIXME: Ideally the thin link would not generate such duplicate clones to
5269 // start with, but right now it happens due to phase ordering in the function
5270  // assignment and the new clones it may produce. We simply make each
5271 // duplicate an alias to the matching instantiated clone recorded in the map
5272 // (except for available_externally which are made declarations as they would
5273 // be aliases in the prevailing module, and available_externally aliases are
5274 // not well supported right now).
5275  DenseMap<uint64_t, Function *> HashToFunc;
5276
5277 // Save the hash of the original function version.
5278 HashToFunc[ComputeHash(FS, 0)] = &F;
5279
5280 for (unsigned I = 1; I < NumClones; I++) {
5281 VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
5282 std::string Name = getMemProfFuncName(F.getName(), I);
5283 auto Hash = ComputeHash(FS, I);
5284 // If this clone would duplicate a previously seen clone, don't generate the
5285 // duplicate clone body, just make an alias to satisfy any (potentially
5286 // cross-module) references.
5287 if (HashToFunc.contains(Hash)) {
5288 FunctionCloneDuplicatesThinBackend++;
5289 auto *Func = HashToFunc[Hash];
5290 if (Func->hasAvailableExternallyLinkage()) {
5291 // Skip these as EliminateAvailableExternallyPass does not handle
5292 // available_externally aliases correctly and we end up with an
5293 // available_externally alias to a declaration. Just create a
5294 // declaration for now as we know we will have a definition in another
5295 // module.
5296 auto Decl = M.getOrInsertFunction(Name, Func->getFunctionType());
5297 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
5298 << "created clone decl " << ore::NV("Decl", Decl.getCallee()));
5299 continue;
5300 }
5301 auto *PrevF = M.getFunction(Name);
5302 auto *Alias = GlobalAlias::create(Name, Func);
5303 if (PrevF)
5304 TakeDeclNameAndReplace(PrevF, Alias);
5305 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
5306 << "created clone alias " << ore::NV("Alias", Alias));
5307
5308 // Now handle aliases to this function, and clone those as well.
5309 CloneFuncAliases(Func, I);
5310 continue;
5311 }
5312 auto *NewF = CloneFunction(&F, *VMaps.back());
5313 HashToFunc[Hash] = NewF;
5314 FunctionClonesThinBackend++;
5315 // Strip memprof and callsite metadata from clone as they are no longer
5316 // needed.
5317 for (auto &BB : *NewF) {
5318 for (auto &Inst : BB) {
5319 Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
5320 Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
5321 }
5322 }
5323 auto *PrevF = M.getFunction(Name);
5324 if (PrevF)
5325 TakeDeclNameAndReplace(PrevF, NewF);
5326 else
5327 NewF->setName(Name);
5328 updateSubprogramLinkageName(NewF, Name);
5329 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
5330 << "created clone " << ore::NV("NewFunction", NewF));
5331
5332 // Now handle aliases to this function, and clone those as well.
5333 CloneFuncAliases(NewF, I);
5334 }
5335 return VMaps;
5336}
5337
5338// Locate the summary for F. This is complicated by the fact that it might
5339// have been internalized or promoted.
5340static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
5341                                      const ModuleSummaryIndex *ImportSummary,
5342 const Function *CallingFunc = nullptr) {
5343 // FIXME: Ideally we would retain the original GUID in some fashion on the
5344 // function (e.g. as metadata), but for now do our best to locate the
5345 // summary without that information.
5346 ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
5347 if (!TheFnVI)
5348    // See if the function was internalized, by checking the index directly with
5349 // original name (this avoids the name adjustment done by getGUID() for
5350 // internal symbols).
5351 TheFnVI = ImportSummary->getValueInfo(
5352        GlobalValue::getGUIDAssumingExternalLinkage(F.getName()));
5353  if (TheFnVI)
5354 return TheFnVI;
5355 // Now query with the original name before any promotion was performed.
5356  StringRef OrigName =
5357      ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
5358 // When this pass is enabled, we always add thinlto_src_file provenance
5359 // metadata to imported function definitions, which allows us to recreate the
5360 // original internal symbol's GUID.
5361 auto SrcFileMD = F.getMetadata("thinlto_src_file");
5362 // If this is a call to an imported/promoted local for which we didn't import
5363 // the definition, the metadata will not exist on the declaration. However,
5364 // since we are doing this early, before any inlining in the LTO backend, we
5365 // can simply look at the metadata on the calling function which must have
5366 // been from the same module if F was an internal symbol originally.
5367 if (!SrcFileMD && F.isDeclaration()) {
5368 // We would only call this for a declaration for a direct callsite, in which
5369 // case the caller would have provided the calling function pointer.
5370 assert(CallingFunc);
5371 SrcFileMD = CallingFunc->getMetadata("thinlto_src_file");
5372 // If this is a promoted local (OrigName != F.getName()), since this is a
5373 // declaration, it must be imported from a different module and therefore we
5374 // should always find the metadata on its calling function. Any call to a
5375 // promoted local that came from this module should still be a definition.
5376 assert(SrcFileMD || OrigName == F.getName());
5377 }
5378 StringRef SrcFile = M.getSourceFileName();
5379 if (SrcFileMD)
5380 SrcFile = dyn_cast<MDString>(SrcFileMD->getOperand(0))->getString();
5381 std::string OrigId = GlobalValue::getGlobalIdentifier(
5382 OrigName, GlobalValue::InternalLinkage, SrcFile);
5383  TheFnVI = ImportSummary->getValueInfo(
5384      GlobalValue::getGUIDAssumingExternalLinkage(OrigId));
5385 // Internal func in original module may have gotten a numbered suffix if we
5386 // imported an external function with the same name. This happens
5387 // automatically during IR linking for naming conflicts. It would have to
5388 // still be internal in that case (otherwise it would have been renamed on
5389 // promotion in which case we wouldn't have a naming conflict).
5390 if (!TheFnVI && OrigName == F.getName() && F.hasLocalLinkage() &&
5391 F.getName().contains('.')) {
5392 OrigName = F.getName().rsplit('.').first;
5393    OrigId = GlobalValue::getGlobalIdentifier(
5394        OrigName, GlobalValue::InternalLinkage, SrcFile);
5395    TheFnVI = ImportSummary->getValueInfo(
5396        GlobalValue::getGUIDAssumingExternalLinkage(OrigId));
5397 }
5398 // The only way we may not have a VI is if this is a declaration created for
5399 // an imported reference. For distributed ThinLTO we may not have a VI for
5400 // such declarations in the distributed summary.
5401 assert(TheFnVI || F.isDeclaration());
5402 return TheFnVI;
5403}
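// In short, the lookup above tries, in order: (1) the GUID of the current
// (possibly promoted) name, (2) the GUID computed assuming external linkage of
// the current name (covers internalized symbols), and (3) the GUID of the
// original pre-promotion local name, reconstructed using the thinlto_src_file
// provenance metadata, with one retry after stripping a trailing ".N" suffix
// added by IR linking. A null ValueInfo is only expected for declarations
// created for imported references.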
5404
5405bool MemProfContextDisambiguation::initializeIndirectCallPromotionInfo(
5406 Module &M) {
5407 ICallAnalysis = std::make_unique<ICallPromotionAnalysis>();
5408 Symtab = std::make_unique<InstrProfSymtab>();
5409  // Don't add canonical names, to avoid mapping multiple functions to the same
5410  // symtab entry when they share the same root name with "." suffixes stripped.
5411 // If we pick the wrong one then this could lead to incorrect ICP and calling
5412 // a memprof clone that we don't actually create (resulting in linker unsats).
5413 // What this means is that the GUID of the function (or its PGOFuncName
5414 // metadata) *must* match that in the VP metadata to allow promotion.
5415 // In practice this should not be a limitation, since local functions should
5416 // have PGOFuncName metadata and global function names shouldn't need any
5417 // special handling (they should not get the ".llvm.*" suffix that the
5418 // canonicalization handling is attempting to strip).
5419 if (Error E = Symtab->create(M, /*InLTO=*/true, /*AddCanonical=*/false)) {
5420 std::string SymtabFailure = toString(std::move(E));
5421 M.getContext().emitError("Failed to create symtab: " + SymtabFailure);
5422 return false;
5423 }
5424 return true;
5425}
5426
5427#ifndef NDEBUG
5428// Sanity check that the MIB stack ids match between the summary and
5429// instruction metadata.
5430static void checkAllocContextIds(
5431    const AllocInfo &AllocNode, const MDNode *MemProfMD,
5432 const CallStack<MDNode, MDNode::op_iterator> &CallsiteContext,
5433 const ModuleSummaryIndex *ImportSummary) {
5434 auto MIBIter = AllocNode.MIBs.begin();
5435 for (auto &MDOp : MemProfMD->operands()) {
5436 assert(MIBIter != AllocNode.MIBs.end());
5437 auto StackIdIndexIter = MIBIter->StackIdIndices.begin();
5438 auto *MIBMD = cast<const MDNode>(MDOp);
5439 MDNode *StackMDNode = getMIBStackNode(MIBMD);
5440 assert(StackMDNode);
5441 CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
5442 auto ContextIterBegin =
5443 StackContext.beginAfterSharedPrefix(CallsiteContext);
5444 // Skip the checking on the first iteration.
5445 uint64_t LastStackContextId =
5446 (ContextIterBegin != StackContext.end() && *ContextIterBegin == 0) ? 1
5447 : 0;
5448 for (auto ContextIter = ContextIterBegin; ContextIter != StackContext.end();
5449 ++ContextIter) {
5450 // If this is a direct recursion, simply skip the duplicate
5451 // entries, to be consistent with how the summary ids were
5452 // generated during ModuleSummaryAnalysis.
5453 if (LastStackContextId == *ContextIter)
5454 continue;
5455 LastStackContextId = *ContextIter;
5456 assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
5457 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
5458 *ContextIter);
5459 StackIdIndexIter++;
5460 }
5461 MIBIter++;
5462 }
5463}
5464#endif
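// For reference, the metadata being cross-checked above has roughly the
// following shape in the IR (the stack-id values below are illustrative only,
// not taken from any real profile):
//
//   %call = call ptr @_Znam(i64 16), !memprof !0, !callsite !5
//   !0 = !{!1, !3}             ; list of MIB nodes, one per context
//   !1 = !{!2, !"notcold"}     ; MIB: stack node followed by allocation type
//   !2 = !{i64 123, i64 456}   ; stack ids for this context
//   !3 = !{!4, !"cold"}
//   !4 = !{i64 123, i64 789}
//   !5 = !{i64 123}            ; callsite stack ids at the allocation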
5465
5466bool MemProfContextDisambiguation::applyImport(Module &M) {
5467 assert(ImportSummary);
5468 bool Changed = false;
5469
5470 // We also need to clone any aliases that reference cloned functions, because
5471 // the modified callsites may invoke via the alias. Keep track of the aliases
5472 // for each function.
5473 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
5474 FuncToAliasMap;
5475 for (auto &A : M.aliases()) {
5476 auto *Aliasee = A.getAliaseeObject();
5477 if (auto *F = dyn_cast<Function>(Aliasee))
5478 FuncToAliasMap[F].insert(&A);
5479 }
5480
5481 if (!initializeIndirectCallPromotionInfo(M))
5482 return false;
5483
5484 for (auto &F : M) {
5485 if (F.isDeclaration() || isMemProfClone(F))
5486 continue;
5487
5488 OptimizationRemarkEmitter ORE(&F);
5489
5490    SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
5491    bool ClonesCreated = false;
5492 unsigned NumClonesCreated = 0;
5493 auto CloneFuncIfNeeded = [&](unsigned NumClones, FunctionSummary *FS) {
5494 // We should at least have version 0 which is the original copy.
5495 assert(NumClones > 0);
5496 // If only one copy needed use original.
5497 if (NumClones == 1)
5498 return;
5499 // If we already performed cloning of this function, confirm that the
5500 // requested number of clones matches (the thin link should ensure the
5501 // number of clones for each constituent callsite is consistent within
5502 // each function), before returning.
5503 if (ClonesCreated) {
5504 assert(NumClonesCreated == NumClones);
5505 return;
5506 }
5507 VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap, FS);
5508 // The first "clone" is the original copy, which doesn't have a VMap.
5509 assert(VMaps.size() == NumClones - 1);
5510 Changed = true;
5511 ClonesCreated = true;
5512 NumClonesCreated = NumClones;
5513 };
5514
5515 auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
5516 Function *CalledFunction, FunctionSummary *FS) {
5517 // Perform cloning if not yet done.
5518 CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size(), FS);
5519
5520 assert(!isMemProfClone(*CalledFunction));
5521
5522 // Because we update the cloned calls by calling setCalledOperand (see
5523 // comment below), out of an abundance of caution make sure the called
5524 // function was actually the called operand (or its aliasee). We also
5525 // strip pointer casts when looking for calls (to match behavior during
5526 // summary generation), however, with opaque pointers in theory this
5527 // should not be an issue. Note we still clone the current function
5528 // (containing this call) above, as that could be needed for its callers.
5529 auto *GA = dyn_cast_or_null<GlobalAlias>(CB->getCalledOperand());
5530 if (CalledFunction != CB->getCalledOperand() &&
5531 (!GA || CalledFunction != GA->getAliaseeObject())) {
5532 SkippedCallsCloning++;
5533 return;
5534 }
5535 // Update the calls per the summary info.
5536 // Save orig name since it gets updated in the first iteration
5537 // below.
5538 auto CalleeOrigName = CalledFunction->getName();
5539 for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
5540 // If the VMap is empty, this clone was a duplicate of another and was
5541 // created as an alias or a declaration.
5542 if (J > 0 && VMaps[J - 1]->empty())
5543 continue;
5544 // Do nothing if this version calls the original version of its
5545 // callee.
5546 if (!StackNode.Clones[J])
5547 continue;
5548 auto NewF = M.getOrInsertFunction(
5549 getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
5550 CalledFunction->getFunctionType());
5551 CallBase *CBClone;
5552 // Copy 0 is the original function.
5553 if (!J)
5554 CBClone = CB;
5555 else
5556 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
5557 // Set the called operand directly instead of calling setCalledFunction,
5558 // as the latter mutates the function type on the call. In rare cases
5559 // we may have a slightly different type on a callee function
5560 // declaration due to it being imported from a different module with
5561 // incomplete types. We really just want to change the name of the
5562 // function to the clone, and not make any type changes.
5563 CBClone->setCalledOperand(NewF.getCallee());
5564 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
5565 << ore::NV("Call", CBClone) << " in clone "
5566 << ore::NV("Caller", CBClone->getFunction())
5567 << " assigned to call function clone "
5568 << ore::NV("Callee", NewF.getCallee()));
5569 }
5570 };
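// For example (illustrative), if StackNode.Clones == {0, 2} for a call to
// "bar", then clone 0 of the caller keeps calling the original "bar", while the
// caller's clone 1 has its copy of the call retargeted to the callee clone
// named by getMemProfFuncName("bar", 2) via setCalledOperand, deliberately
// leaving the call's function type untouched.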
5571
5572 // Locate the summary for F.
5573 ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
5574 // If not found, this could be an imported local (see comment in
5575 // findValueInfoForFunc). Skip for now as it will be cloned in its original
5576 // module (where it would have been promoted to global scope so should
5577 // satisfy any reference in this module).
5578 if (!TheFnVI)
5579 continue;
5580
5581 auto *GVSummary =
5582 ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
5583 if (!GVSummary) {
5584      // Must have been imported, use the summary which matches the definition
5585 // (might be multiple if this was a linkonce_odr).
5586 auto SrcModuleMD = F.getMetadata("thinlto_src_module");
5587 assert(SrcModuleMD &&
5588 "enable-import-metadata is needed to emit thinlto_src_module");
5589 StringRef SrcModule =
5590 dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString();
5591 for (auto &GVS : TheFnVI.getSummaryList()) {
5592 if (GVS->modulePath() == SrcModule) {
5593 GVSummary = GVS.get();
5594 break;
5595 }
5596 }
5597 assert(GVSummary && GVSummary->modulePath() == SrcModule);
5598 }
5599
5600 // If this was an imported alias skip it as we won't have the function
5601 // summary, and it should be cloned in the original module.
5602 if (isa<AliasSummary>(GVSummary))
5603 continue;
5604
5605 auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
5606
5607 if (FS->allocs().empty() && FS->callsites().empty())
5608 continue;
5609
5610 auto SI = FS->callsites().begin();
5611 auto AI = FS->allocs().begin();
5612
5613 // To handle callsite infos synthesized for tail calls which have missing
5614 // frames in the profiled context, map callee VI to the synthesized callsite
5615 // info.
5616 DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite;
5617 // Iterate the callsites for this function in reverse, since we place all
5618 // those synthesized for tail calls at the end.
5619 for (auto CallsiteIt = FS->callsites().rbegin();
5620 CallsiteIt != FS->callsites().rend(); CallsiteIt++) {
5621 auto &Callsite = *CallsiteIt;
5622 // Stop as soon as we see a non-synthesized callsite info (see comment
5623 // above loop). All the entries added for discovered tail calls have empty
5624 // stack ids.
5625 if (!Callsite.StackIdIndices.empty())
5626 break;
5627 MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
5628 }
5629
5630 // Keeps track of needed ICP for the function.
5631 SmallVector<ICallAnalysisData> ICallAnalysisInfo;
5632
5633 // Assume for now that the instructions are in the exact same order
5634 // as when the summary was created, but confirm this is correct by
5635 // matching the stack ids.
5636 for (auto &BB : F) {
5637 for (auto &I : BB) {
5638 auto *CB = dyn_cast<CallBase>(&I);
5639 // Same handling as when creating module summary.
5640 if (!mayHaveMemprofSummary(CB))
5641 continue;
5642
5643 auto *CalledValue = CB->getCalledOperand();
5644 auto *CalledFunction = CB->getCalledFunction();
5645 if (CalledValue && !CalledFunction) {
5646 CalledValue = CalledValue->stripPointerCasts();
5647 // Stripping pointer casts can reveal a called function.
5648 CalledFunction = dyn_cast<Function>(CalledValue);
5649 }
5650 // Check if this is an alias to a function. If so, get the
5651 // called aliasee for the checks below.
5652 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
5653 assert(!CalledFunction &&
5654 "Expected null called function in callsite for alias");
5655 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
5656 }
5657
5658 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
5659 I.getMetadata(LLVMContext::MD_callsite));
5660 auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
5661
5662 // Include allocs that were already assigned a memprof function
5663 // attribute in the statistics. Only do this for those that do not have
5664 // memprof metadata, since we add an "ambiguous" memprof attribute by
5665 // default.
5666 if (CB->getAttributes().hasFnAttr("memprof") && !MemProfMD) {
5667 CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
5668 ? AllocTypeColdThinBackend++
5669 : AllocTypeNotColdThinBackend++;
5670 OrigAllocsThinBackend++;
5671 AllocVersionsThinBackend++;
5672 if (!MaxAllocVersionsThinBackend)
5673 MaxAllocVersionsThinBackend = 1;
5674 continue;
5675 }
5676
5677 if (MemProfMD) {
5678 // Consult the next alloc node.
5679 assert(AI != FS->allocs().end());
5680 auto &AllocNode = *(AI++);
5681
5682#ifndef NDEBUG
5683 checkAllocContextIds(AllocNode, MemProfMD, CallsiteContext,
5684 ImportSummary);
5685#endif
5686
5687 // Perform cloning if not yet done.
5688 CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size(), FS);
5689
5690 OrigAllocsThinBackend++;
5691 AllocVersionsThinBackend += AllocNode.Versions.size();
5692 if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
5693 MaxAllocVersionsThinBackend = AllocNode.Versions.size();
5694
5695          // If there is only one version, that means we didn't end up
5696          // considering this function for cloning, in which case the alloc
5697          // will still have type None or will have gotten the default NotCold.
5698          // Skip it, but only after calling the clone helper above, since that
5699          // helper performs sanity checks confirming we haven't yet decided
5700          // that cloning is needed. Note that we might still have a single
5701          // version that is cold due to the MinClonedColdBytePercent heuristic,
5702          // so make sure we don't skip in that case.
5703 if (AllocNode.Versions.size() == 1 &&
5704 (AllocationType)AllocNode.Versions[0] != AllocationType::Cold) {
5705 assert((AllocationType)AllocNode.Versions[0] ==
5706 AllocationType::NotCold ||
5707 (AllocationType)AllocNode.Versions[0] ==
5708 AllocationType::None);
5709 UnclonableAllocsThinBackend++;
5710 continue;
5711 }
5712
5713 // All versions should have a singular allocation type.
5714 assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
5715 return Type == ((uint8_t)AllocationType::NotCold |
5716 (uint8_t)AllocationType::Cold);
5717 }));
5718
5719 // Update the allocation types per the summary info.
5720 for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
5721 // If the VMap is empty, this clone was a duplicate of another and
5722 // was created as an alias or a declaration.
5723 if (J > 0 && VMaps[J - 1]->empty())
5724 continue;
5725 // Ignore any that didn't get an assigned allocation type.
5726 if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
5727 continue;
5728 AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
5729 AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
5730 : AllocTypeNotColdThinBackend++;
5731 std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
5732 auto A = llvm::Attribute::get(F.getContext(), "memprof",
5733 AllocTypeString);
5734 CallBase *CBClone;
5735 // Copy 0 is the original function.
5736 if (!J)
5737 CBClone = CB;
5738 else
5739 // Since VMaps are only created for new clones, we index with
5740 // clone J-1 (J==0 is the original clone and does not have a VMaps
5741 // entry).
5742 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
5744 CBClone->addFnAttr(A);
5745 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
5746 << ore::NV("AllocationCall", CBClone) << " in clone "
5747 << ore::NV("Caller", CBClone->getFunction())
5748 << " marked with memprof allocation attribute "
5749 << ore::NV("Attribute", AllocTypeString));
5750 }
5751 } else if (!CallsiteContext.empty()) {
5752 if (!CalledFunction) {
5753#ifndef NDEBUG
5754 // We should have skipped inline assembly calls.
5755 auto *CI = dyn_cast<CallInst>(CB);
5756 assert(!CI || !CI->isInlineAsm());
5757#endif
5758 // We should have skipped direct calls via a Constant.
5759 assert(CalledValue && !isa<Constant>(CalledValue));
5760
5761 // This is an indirect call, see if we have profile information and
5762 // whether any clones were recorded for the profiled targets (that
5763 // we synthesized CallsiteInfo summary records for when building the
5764 // index).
5765 auto NumClones =
5766 recordICPInfo(CB, FS->callsites(), SI, ICallAnalysisInfo);
5767
5768 // Perform cloning if not yet done. This is done here in case
5769 // we don't need to do ICP, but might need to clone this
5770 // function as it is the target of other cloned calls.
5771 if (NumClones)
5772 CloneFuncIfNeeded(NumClones, FS);
5773 }
5774
5775 else {
5776 // Consult the next callsite node.
5777 assert(SI != FS->callsites().end());
5778 auto &StackNode = *(SI++);
5779
5780#ifndef NDEBUG
5781 // Sanity check that the stack ids match between the summary and
5782 // instruction metadata.
5783 auto StackIdIndexIter = StackNode.StackIdIndices.begin();
5784 for (auto StackId : CallsiteContext) {
5785 assert(StackIdIndexIter != StackNode.StackIdIndices.end());
5786 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
5787 StackId);
5788 StackIdIndexIter++;
5789 }
5790#endif
5791
5792 CloneCallsite(StackNode, CB, CalledFunction, FS);
5793 }
5794 } else if (CB->isTailCall() && CalledFunction) {
5795 // Locate the synthesized callsite info for the callee VI, if any was
5796 // created, and use that for cloning.
5797 ValueInfo CalleeVI =
5798 findValueInfoForFunc(*CalledFunction, M, ImportSummary, &F);
5799 if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
5800 auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
5801 assert(Callsite != MapTailCallCalleeVIToCallsite.end());
5802 CloneCallsite(Callsite->second, CB, CalledFunction, FS);
5803 }
5804 }
5805 }
5806 }
5807
5808 // Now do any promotion required for cloning.
5809 performICP(M, FS->callsites(), VMaps, ICallAnalysisInfo, ORE);
5810 }
5811
5812 // We skip some of the functions and instructions above, so remove all the
5813 // metadata in a single sweep here.
5814 for (auto &F : M) {
5815 // We can skip memprof clones because createFunctionClones already strips
5816 // the metadata from the newly created clones.
5817 if (F.isDeclaration() || isMemProfClone(F))
5818 continue;
5819 for (auto &BB : F) {
5820 for (auto &I : BB) {
5821 if (!isa<CallBase>(I))
5822 continue;
5823 I.setMetadata(LLVMContext::MD_memprof, nullptr);
5824 I.setMetadata(LLVMContext::MD_callsite, nullptr);
5825 }
5826 }
5827 }
5828
5829 return Changed;
5830}
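// Note on the matching strategy used in applyImport: the allocation and
// callsite summary records of a function are consumed in the same order in
// which the instructions were visited when the summary was built, so the AI/SI
// iterators simply advance in lockstep with the instruction walk (the
// NDEBUG-only stack-id comparisons validate that ordering assumption). The one
// exception is the synthesized tail-call callsites, which are matched by callee
// ValueInfo via MapTailCallCalleeVIToCallsite instead.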
5831
5832unsigned MemProfContextDisambiguation::recordICPInfo(
5833 CallBase *CB, ArrayRef<CallsiteInfo> AllCallsites,
5834    ArrayRef<CallsiteInfo>::iterator &SI,
5835    SmallVector<ICallAnalysisData> &ICallAnalysisInfo) {
5836 // First see if we have profile information for this indirect call.
5837 uint32_t NumCandidates;
5838 uint64_t TotalCount;
5839 auto CandidateProfileData =
5840 ICallAnalysis->getPromotionCandidatesForInstruction(CB, TotalCount,
5841 NumCandidates);
5842 if (CandidateProfileData.empty())
5843 return 0;
5844
5845 // Iterate through all of the candidate profiled targets along with the
5846 // CallsiteInfo summary records synthesized for them when building the index,
5847 // and see if any are cloned and/or refer to clones.
5848 bool ICPNeeded = false;
5849 unsigned NumClones = 0;
5850 size_t CallsiteInfoStartIndex = std::distance(AllCallsites.begin(), SI);
5851 for (const auto &Candidate : CandidateProfileData) {
5852#ifndef NDEBUG
5853 auto CalleeValueInfo =
5854#endif
5855 ImportSummary->getValueInfo(Candidate.Value);
5856 // We might not have a ValueInfo if this is a distributed
5857 // ThinLTO backend and decided not to import that function.
5858 assert(!CalleeValueInfo || SI->Callee == CalleeValueInfo);
5859 assert(SI != AllCallsites.end());
5860 auto &StackNode = *(SI++);
5861 // See if any of the clones of the indirect callsite for this
5862 // profiled target should call a cloned version of the profiled
5863 // target. We only need to do the ICP here if so.
5864 ICPNeeded |= llvm::any_of(StackNode.Clones,
5865 [](unsigned CloneNo) { return CloneNo != 0; });
5866 // Every callsite in the same function should have been cloned the same
5867 // number of times.
5868 assert(!NumClones || NumClones == StackNode.Clones.size());
5869 NumClones = StackNode.Clones.size();
5870 }
5871 if (!ICPNeeded)
5872 return NumClones;
5873 // Save information for ICP, which is performed later to avoid messing up the
5874 // current function traversal.
5875 ICallAnalysisInfo.push_back({CB, CandidateProfileData.vec(), NumCandidates,
5876 TotalCount, CallsiteInfoStartIndex});
5877 return NumClones;
5878}
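// Note that recordICPInfo returns the per-function clone count even when no
// promotion is required, so the caller can still trigger function cloning; an
// ICallAnalysisData entry is queued only when at least one profiled target
// needs to call a cloned callee, and the actual promotion is deferred to
// performICP below.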
5879
5880void MemProfContextDisambiguation::performICP(
5881 Module &M, ArrayRef<CallsiteInfo> AllCallsites,
5882 ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
5883 ArrayRef<ICallAnalysisData> ICallAnalysisInfo,
5884 OptimizationRemarkEmitter &ORE) {
5885 // Now do any promotion required for cloning. Specifically, for each
5886 // recorded ICP candidate (which was only recorded because one clone of that
5887 // candidate should call a cloned target), we perform ICP (speculative
5888 // devirtualization) for each clone of the callsite, and update its callee
5889 // to the appropriate clone. Note that the ICP compares against the original
5890 // version of the target, which is what is in the vtable.
5891 for (auto &Info : ICallAnalysisInfo) {
5892 auto *CB = Info.CB;
5893 auto CallsiteIndex = Info.CallsiteInfoStartIndex;
5894 auto TotalCount = Info.TotalCount;
5895 unsigned NumPromoted = 0;
5896 unsigned NumClones = 0;
5897
5898 for (auto &Candidate : Info.CandidateProfileData) {
5899 auto &StackNode = AllCallsites[CallsiteIndex++];
5900
5901 // All calls in the same function must have the same number of clones.
5902 assert(!NumClones || NumClones == StackNode.Clones.size());
5903 NumClones = StackNode.Clones.size();
5904
5905 // See if the target is in the module. If it wasn't imported, it is
5906 // possible that this profile could have been collected on a different
5907 // target (or version of the code), and we need to be conservative
5908 // (similar to what is done in the ICP pass).
5909 Function *TargetFunction = Symtab->getFunction(Candidate.Value);
5910 if (TargetFunction == nullptr ||
5911 // Any ThinLTO global dead symbol removal should have already
5912 // occurred, so it should be safe to promote when the target is a
5913 // declaration.
5914 // TODO: Remove internal option once more fully tested.
5915         (MemProfRequireDefinitionForPromotion &&
5916          TargetFunction->isDeclaration())) {
5917 ORE.emit([&]() {
5918 return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", CB)
5919 << "Memprof cannot promote indirect call: target with md5sum "
5920 << ore::NV("target md5sum", Candidate.Value) << " not found";
5921 });
5922 // FIXME: See if we can use the new declaration importing support to
5923 // at least get the declarations imported for this case. Hot indirect
5924 // targets should have been imported normally, however.
5925 continue;
5926 }
5927
5928 // Check if legal to promote
5929 const char *Reason = nullptr;
5930 if (!isLegalToPromote(*CB, TargetFunction, &Reason)) {
5931 ORE.emit([&]() {
5932 return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", CB)
5933 << "Memprof cannot promote indirect call to "
5934 << ore::NV("TargetFunction", TargetFunction)
5935 << " with count of " << ore::NV("TotalCount", TotalCount)
5936 << ": " << Reason;
5937 });
5938 continue;
5939 }
5940
5941 assert(!isMemProfClone(*TargetFunction));
5942
5943 // Handle each call clone, applying ICP so that each clone directly
5944 // calls the specified callee clone, guarded by the appropriate ICP
5945 // check.
5946 CallBase *CBClone = CB;
5947 for (unsigned J = 0; J < NumClones; J++) {
5948 // If the VMap is empty, this clone was a duplicate of another and was
5949 // created as an alias or a declaration.
5950 if (J > 0 && VMaps[J - 1]->empty())
5951 continue;
5952 // Copy 0 is the original function.
5953 if (J > 0)
5954 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
5955 // We do the promotion using the original name, so that the comparison
5956 // is against the name in the vtable. Then just below, change the new
5957 // direct call to call the cloned function.
5958 auto &DirectCall =
5959 pgo::promoteIndirectCall(*CBClone, TargetFunction, Candidate.Count,
5960 TotalCount, isSamplePGO, &ORE);
5961 auto *TargetToUse = TargetFunction;
5962 // Call original if this version calls the original version of its
5963 // callee.
5964 if (StackNode.Clones[J]) {
5965 TargetToUse =
5966 cast<Function>(M.getOrInsertFunction(
5967 getMemProfFuncName(TargetFunction->getName(),
5968 StackNode.Clones[J]),
5969 TargetFunction->getFunctionType())
5970 .getCallee());
5971 }
5972 DirectCall.setCalledFunction(TargetToUse);
5973 // During matching we generate synthetic VP metadata for indirect calls
5974 // not already having any, from the memprof profile's callee GUIDs. If
5975 // we subsequently promote and inline those callees, we currently lose
5976 // the ability to generate this synthetic VP metadata. Optionally apply
5977 // a noinline attribute to promoted direct calls, where the threshold is
5978 // set to capture synthetic VP metadata targets which get a count of 1.
5979        if (MemProfICPNoInlineThreshold &&
5980            Candidate.Count < MemProfICPNoInlineThreshold)
5981 DirectCall.setIsNoInline();
5982 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
5983 << ore::NV("Call", CBClone) << " in clone "
5984 << ore::NV("Caller", CBClone->getFunction())
5985 << " promoted and assigned to call function clone "
5986 << ore::NV("Callee", TargetToUse));
5987 }
5988
5989 // Update TotalCount (all clones should get same count above)
5990 TotalCount -= Candidate.Count;
5991 NumPromoted++;
5992 }
5993 // Adjust the MD.prof metadata for all clones, now that we have the new
5994 // TotalCount and the number promoted.
5995 CallBase *CBClone = CB;
5996 for (unsigned J = 0; J < NumClones; J++) {
5997 // If the VMap is empty, this clone was a duplicate of another and was
5998 // created as an alias or a declaration.
5999 if (J > 0 && VMaps[J - 1]->empty())
6000 continue;
6001 // Copy 0 is the original function.
6002 if (J > 0)
6003 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
6004 // First delete the old one.
6005 CBClone->setMetadata(LLVMContext::MD_prof, nullptr);
6006      // If all targets were promoted, we don't need the MD.prof metadata.
6007      // Otherwise we need to update it with the remaining un-promoted records.
6008 if (TotalCount != 0)
6009        annotateValueSite(
6010            M, *CBClone, ArrayRef(Info.CandidateProfileData).slice(NumPromoted),
6011 TotalCount, IPVK_IndirectCallTarget, Info.NumCandidates);
6012 }
6013 }
6014}
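// After performICP, each promoted call clone has (roughly) the following
// guarded structure; this is an illustrative sketch of what promoteIndirectCall
// produces before the direct call is redirected to the chosen memprof clone:
//
//   if (%callee == @target) {
//     call @target.memprof.N(...)   // direct call, retargeted to the clone
//   } else {
//     call %callee(...)             // remaining indirect call
//   }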
6015
6016template <typename DerivedCCG, typename FuncTy, typename CallTy>
6017bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
6018 if (DumpCCG) {
6019 dbgs() << "CCG before cloning:\n";
6020 dbgs() << *this;
6021 }
6022 if (ExportToDot)
6023 exportToDot("postbuild");
6024
6025 if (VerifyCCG) {
6026 check();
6027 }
6028
6029 identifyClones();
6030
6031 if (VerifyCCG) {
6032 check();
6033 }
6034
6035 if (DumpCCG) {
6036 dbgs() << "CCG after cloning:\n";
6037 dbgs() << *this;
6038 }
6039 if (ExportToDot)
6040 exportToDot("cloned");
6041
6042 bool Changed = assignFunctions();
6043
6044 if (DumpCCG) {
6045 dbgs() << "CCG after assigning function clones:\n";
6046 dbgs() << *this;
6047 }
6048 if (ExportToDot)
6049 exportToDot("clonefuncassign");
6050
6052 printTotalSizes(errs());
6053
6054 return Changed;
6055}
6056
6057bool MemProfContextDisambiguation::processModule(
6058 Module &M,
6059 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
6060
6061 // If we have an import summary, then the cloning decisions were made during
6062 // the thin link on the index. Apply them and return.
6063 if (ImportSummary)
6064 return applyImport(M);
6065
6066 // TODO: If/when other types of memprof cloning are enabled beyond just for
6067 // hot and cold, we will need to change this to individually control the
6068 // AllocationType passed to addStackNodesForMIB during CCG construction.
6069 // Note that we specifically check this after applying imports above, so that
6070 // the option isn't needed to be passed to distributed ThinLTO backend
6071 // clang processes, which won't necessarily have visibility into the linker
6072 // dependences. Instead the information is communicated from the LTO link to
6073 // the backends via the combined summary index.
6074 if (!SupportsHotColdNew)
6075 return false;
6076
6077 ModuleCallsiteContextGraph CCG(M, OREGetter);
6078 return CCG.process();
6079}
6080
6081MemProfContextDisambiguation::MemProfContextDisambiguation(
6082    const ModuleSummaryIndex *Summary, bool isSamplePGO)
6083 : ImportSummary(Summary), isSamplePGO(isSamplePGO) {
6084 // Check the dot graph printing options once here, to make sure we have valid
6085 // and expected combinations.
6086 if (DotGraphScope == DotScope::Alloc && !AllocIdForDot.getNumOccurrences())
6087    llvm::report_fatal_error(
6088        "-memprof-dot-scope=alloc requires -memprof-dot-alloc-id");
6089  if (DotGraphScope == DotScope::Context &&
6090      !ContextIdForDot.getNumOccurrences())
6091    llvm::report_fatal_error(
6092        "-memprof-dot-scope=context requires -memprof-dot-context-id");
6093 if (DotGraphScope == DotScope::All && AllocIdForDot.getNumOccurrences() &&
6094 ContextIdForDot.getNumOccurrences())
6095    llvm::report_fatal_error(
6096        "-memprof-dot-scope=all can't have both -memprof-dot-alloc-id and "
6097 "-memprof-dot-context-id");
6098 if (ImportSummary) {
6099 // The MemProfImportSummary should only be used for testing ThinLTO
6100 // distributed backend handling via opt, in which case we don't have a
6101 // summary from the pass pipeline.
6102    assert(MemProfImportSummary.empty());
6103    return;
6104 }
6105 if (MemProfImportSummary.empty())
6106 return;
6107
6108 auto ReadSummaryFile =
6110 if (!ReadSummaryFile) {
6111 logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
6112 "Error loading file '" + MemProfImportSummary +
6113 "': ");
6114 return;
6115 }
6116 auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
6117 if (!ImportSummaryForTestingOrErr) {
6118 logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
6119 "Error parsing file '" + MemProfImportSummary +
6120 "': ");
6121 return;
6122 }
6123 ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
6124 ImportSummary = ImportSummaryForTesting.get();
6125}
6126
6127PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
6128                                                    ModuleAnalysisManager &AM) {
6129  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
6130 auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
6131 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
6132 };
6133 if (!processModule(M, OREGetter))
6134 return PreservedAnalyses::all();
6135 return PreservedAnalyses::none();
6136}
6137
6138void MemProfContextDisambiguation::run(
6139    ModuleSummaryIndex &Index,
6140    llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
6141        isPrevailing) {
6142 // TODO: If/when other types of memprof cloning are enabled beyond just for
6143 // hot and cold, we will need to change this to individually control the
6144 // AllocationType passed to addStackNodesForMIB during CCG construction.
6145 // The index was set from the option, so these should be in sync.
6146 assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
6147 if (!SupportsHotColdNew)
6148 return;
6149
6150 IndexCallsiteContextGraph CCG(Index, isPrevailing);
6151 CCG.process();
6152  CCG.process();
6153}