1//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements support for context disambiguation of allocation
10// calls for profile guided heap optimization. Specifically, it uses Memprof
11// profiles which indicate context specific allocation behavior (currently
12// distinguishing cold vs hot memory allocations). Cloning is performed to
13// expose the cold allocation call contexts, and the allocation calls are
14// subsequently annotated with an attribute for later transformation.
15//
16// The transformations can be performed either directly on IR (regular LTO), or
17// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
18// Both types of LTO operate on the same base graph representation, which
19// uses CRTP to support either IR or Index formats.
20//
21//===----------------------------------------------------------------------===//
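//
// A brief illustrative sketch (not part of this file's code, with made-up
// stack ids and metadata numbering): on IR, an allocation call may carry
// MemProf metadata recording both a cold and a not-cold context, e.g.
//
//   %call = call ptr @_Znam(i64 10), !memprof !0, !callsite !5
//   !0 = !{!1, !3}
//   !1 = !{!2, !"notcold"}
//   !2 = !{i64 123, i64 456}   ; leaf callsite id, then caller A's frame id
//   !3 = !{!4, !"cold"}
//   !4 = !{i64 123, i64 789}   ; leaf callsite id, then caller B's frame id
//   !5 = !{i64 123}
//
// Context disambiguation clones the enclosing function so that the clone
// reached from caller B has its allocation annotated for cold behavior,
// while the path through caller A keeps the default (not cold) behavior.
// A later transformation rewrites the annotated allocations to the hot/cold
// operator new interfaces when linking with a supporting allocator.
//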
22
23#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
24#include "llvm/ADT/DenseMap.h"
25#include "llvm/ADT/DenseSet.h"
26#include "llvm/ADT/MapVector.h"
27#include "llvm/ADT/SetOperations.h"
28#include "llvm/ADT/SmallPtrSet.h"
29#include "llvm/ADT/SmallSet.h"
30#include "llvm/ADT/SmallVector.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/Analysis/MemoryProfileInfo.h"
33#include "llvm/Analysis/ModuleSummaryAnalysis.h"
34#include "llvm/Analysis/OptimizationRemarkEmitter.h"
35#include "llvm/Bitcode/BitcodeReader.h"
36#include "llvm/IR/Instructions.h"
37#include "llvm/IR/Module.h"
38#include "llvm/IR/ModuleSummaryIndex.h"
39#include "llvm/Pass.h"
40#include "llvm/ProfileData/MemProf.h"
41#include "llvm/Support/CommandLine.h"
42#include "llvm/Support/GraphWriter.h"
43#include "llvm/Support/raw_ostream.h"
44#include "llvm/Transforms/IPO.h"
45#include "llvm/Transforms/Utils/CallPromotionUtils.h"
46#include "llvm/Transforms/Utils/Cloning.h"
47#include "llvm/Transforms/Utils/Instrumentation.h"
48#include <deque>
49#include <sstream>
50#include <unordered_map>
51#include <vector>
52using namespace llvm;
53using namespace llvm::memprof;
54
55#define DEBUG_TYPE "memprof-context-disambiguation"
56
57STATISTIC(FunctionClonesAnalysis,
58 "Number of function clones created during whole program analysis");
59STATISTIC(FunctionClonesThinBackend,
60 "Number of function clones created during ThinLTO backend");
61STATISTIC(FunctionsClonedThinBackend,
62 "Number of functions that had clones created during ThinLTO backend");
63STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
64 "cloned) during whole program analysis");
65STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
66 "during whole program analysis");
67STATISTIC(AllocTypeNotColdThinBackend,
68 "Number of not cold static allocations (possibly cloned) during "
69 "ThinLTO backend");
70STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
71 "(possibly cloned) during ThinLTO backend");
72STATISTIC(OrigAllocsThinBackend,
73 "Number of original (not cloned) allocations with memprof profiles "
74 "during ThinLTO backend");
75STATISTIC(
76 AllocVersionsThinBackend,
77 "Number of allocation versions (including clones) during ThinLTO backend");
78STATISTIC(MaxAllocVersionsThinBackend,
79 "Maximum number of allocation versions created for an original "
80 "allocation during ThinLTO backend");
81STATISTIC(UnclonableAllocsThinBackend,
82 "Number of unclonable ambigous allocations during ThinLTO backend");
83STATISTIC(RemovedEdgesWithMismatchedCallees,
84 "Number of edges removed due to mismatched callees (profiled vs IR)");
85STATISTIC(FoundProfiledCalleeCount,
86 "Number of profiled callees found via tail calls");
87STATISTIC(FoundProfiledCalleeDepth,
88 "Aggregate depth of profiled callees found via tail calls");
89STATISTIC(FoundProfiledCalleeMaxDepth,
90 "Maximum depth of profiled callees found via tail calls");
91STATISTIC(FoundProfiledCalleeNonUniquelyCount,
92 "Number of profiled callees found via multiple tail call chains");
93STATISTIC(DeferredBackedges, "Number of backedges with deferred cloning");
94STATISTIC(NewMergedNodes, "Number of new nodes created during merging");
95STATISTIC(NonNewMergedNodes, "Number of non-new nodes used during merging");
96STATISTIC(MissingAllocForContextId,
97 "Number of missing alloc nodes for context ids");
98STATISTIC(SkippedCallsCloning,
99 "Number of calls skipped during cloning due to unexpected operand");
100STATISTIC(MismatchedCloneAssignments,
101 "Number of callsites assigned to call multiple non-matching clones");
102STATISTIC(TotalMergeInvokes, "Number of merge invocations for nodes");
103STATISTIC(TotalMergeIters, "Number of merge iterations for nodes");
104STATISTIC(MaxMergeIters, "Max merge iterations for nodes");
105
107 "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
108 cl::value_desc("filename"),
109 cl::desc("Specify the path prefix of the MemProf dot files."));
110
111static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
113 cl::desc("Export graph to dot files."));
114
115// TODO: Remove this option once new handling is validated more widely.
117 "memprof-merge-iteration", cl::init(true), cl::Hidden,
118 cl::desc("Iteratively apply merging on a node to catch new callers"));
119
120// How much of the graph to export to dot.
121enum DotScope {
122 All, // The full CCG graph.
123 Alloc, // Only contexts for the specified allocation.
124 Context, // Only the specified context.
125};
126
128 "memprof-dot-scope", cl::desc("Scope of graph to export to dot"),
131 clEnumValN(DotScope::All, "all", "Export full callsite graph"),
133 "Export only nodes with contexts feeding given "
134 "-memprof-dot-alloc-id"),
135 clEnumValN(DotScope::Context, "context",
136 "Export only nodes with given -memprof-dot-context-id")));
137
139 AllocIdForDot("memprof-dot-alloc-id", cl::init(0), cl::Hidden,
140 cl::desc("Id of alloc to export if -memprof-dot-scope=alloc "
141 "or to highlight if -memprof-dot-scope=all"));
142
144 "memprof-dot-context-id", cl::init(0), cl::Hidden,
145 cl::desc("Id of context to export if -memprof-dot-scope=context or to "
146 "highlight otherwise"));
147
148static cl::opt<bool>
149 DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
150 cl::desc("Dump CallingContextGraph to stdout after each stage."));
151
152static cl::opt<bool>
153 VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
154 cl::desc("Perform verification checks on CallingContextGraph."));
155
156static cl::opt<bool>
157 VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
158 cl::desc("Perform frequent verification checks on nodes."));
159
161 "memprof-import-summary",
162 cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
163 cl::Hidden);
164
165static cl::opt<unsigned>
166 TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
167 cl::Hidden,
168 cl::desc("Max depth to recursively search for missing "
169 "frames through tail calls."));
170
171// Optionally enable cloning of callsites involved in recursive cycles.
172static cl::opt<bool> AllowRecursiveCallsites(
173 "memprof-allow-recursive-callsites", cl::init(true), cl::Hidden,
174 cl::desc("Allow cloning of callsites involved in recursive cycles"));
175
177 "memprof-clone-recursive-contexts", cl::init(true), cl::Hidden,
178 cl::desc("Allow cloning of contexts through recursive cycles"));
179
180// Generally this is needed for correct assignment of allocation clones to
181// function clones; however, allow it to be disabled for debugging while the
182// functionality is new and being tested more widely.
183static cl::opt<bool>
184 MergeClones("memprof-merge-clones", cl::init(true), cl::Hidden,
185 cl::desc("Merge clones before assigning functions"));
186
187// When disabled, try to detect and prevent cloning of recursive contexts.
188// This is only necessary until we support cloning through recursive cycles.
189// Leave on by default for now: disabling it adds a small amount of compile
190// time overhead and doesn't affect correctness; leaving it on just slightly
191// inflates the cold hinted bytes reporting when -memprof-report-hinted-sizes is enabled.
193 "memprof-allow-recursive-contexts", cl::init(true), cl::Hidden,
194 cl::desc("Allow cloning of contexts having recursive cycles"));
195
196// Set the minimum absolute count threshold for allowing inlining of indirect
197// calls promoted during cloning.
199 "memprof-icp-noinline-threshold", cl::init(2), cl::Hidden,
200 cl::desc("Minimum absolute count for promoted target to be inlinable"));
201
202namespace llvm {
204 "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
205 cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));
206
207// Indicate we are linking with an allocator that supports hot/cold operator
208// new interfaces.
210 "supports-hot-cold-new", cl::init(false), cl::Hidden,
211 cl::desc("Linking with hot/cold operator new interfaces"));
212
214 "memprof-require-definition-for-promotion", cl::init(false), cl::Hidden,
215 cl::desc(
216 "Require target function definition when promoting indirect calls"));
217
218extern cl::opt<bool> MemProfReportHintedSizes;
219extern cl::opt<unsigned> MinClonedColdBytePercent;
220
221} // namespace llvm
222
223namespace {
224/// CRTP base for graphs built from either IR or ThinLTO summary index.
225///
226/// The graph represents the call contexts in all memprof metadata on allocation
227/// calls, with nodes for the allocations themselves, as well as for the calls
228/// in each context. The graph is initially built from the allocation memprof
229/// metadata (or summary) MIBs. It is then updated to match calls with callsite
230/// metadata onto the nodes, updating it to reflect any inlining performed on
231/// those calls.
232///
233/// Each MIB (representing an allocation's call context with allocation
234/// behavior) is assigned a unique context id during the graph build. The edges
235/// and nodes in the graph are decorated with the context ids they carry. This
236/// is used to correctly update the graph when cloning is performed so that we
237/// can uniquify the context for a single (possibly cloned) allocation.
238template <typename DerivedCCG, typename FuncTy, typename CallTy>
239class CallsiteContextGraph {
240public:
241 CallsiteContextGraph() = default;
242 CallsiteContextGraph(const CallsiteContextGraph &) = default;
243 CallsiteContextGraph(CallsiteContextGraph &&) = default;
244
245 /// Main entry point to perform analysis and transformations on graph.
246 bool process();
247
248 /// Perform cloning on the graph necessary to uniquely identify the allocation
249 /// behavior of an allocation based on its context.
250 void identifyClones();
251
252 /// Assign callsite clones to functions, cloning functions as needed to
253 /// accommodate the combinations of their callsite clones reached by callers.
254 /// For regular LTO this clones functions and callsites in the IR, but for
255 /// ThinLTO the cloning decisions are noted in the summaries and later applied
256 /// in applyImport.
257 bool assignFunctions();
258
259 void dump() const;
260 void print(raw_ostream &OS) const;
261 void printTotalSizes(raw_ostream &OS) const;
262
263 friend raw_ostream &operator<<(raw_ostream &OS,
264 const CallsiteContextGraph &CCG) {
265 CCG.print(OS);
266 return OS;
267 }
268
269 friend struct GraphTraits<
270 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
271 friend struct DOTGraphTraits<
272 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
273
274 void exportToDot(std::string Label) const;
275
276 /// Represents a function clone via FuncTy pointer and clone number pair.
277 struct FuncInfo final
278 : public std::pair<FuncTy *, unsigned /*Clone number*/> {
279 using Base = std::pair<FuncTy *, unsigned>;
280 FuncInfo(const Base &B) : Base(B) {}
281 FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
282 explicit operator bool() const { return this->first != nullptr; }
283 FuncTy *func() const { return this->first; }
284 unsigned cloneNo() const { return this->second; }
285 };
286
287 /// Represents a callsite clone via CallTy and clone number pair.
288 struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
289 using Base = std::pair<CallTy, unsigned>;
290 CallInfo(const Base &B) : Base(B) {}
291 CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
292 : Base(Call, CloneNo) {}
293 explicit operator bool() const { return (bool)this->first; }
294 CallTy call() const { return this->first; }
295 unsigned cloneNo() const { return this->second; }
296 void setCloneNo(unsigned N) { this->second = N; }
297 void print(raw_ostream &OS) const {
298 if (!operator bool()) {
299 assert(!cloneNo());
300 OS << "null Call";
301 return;
302 }
303 call()->print(OS);
304 OS << "\t(clone " << cloneNo() << ")";
305 }
306 void dump() const {
307 print(dbgs());
308 dbgs() << "\n";
309 }
310 friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
311 Call.print(OS);
312 return OS;
313 }
314 };
315
316 struct ContextEdge;
317
318 /// Node in the Callsite Context Graph
319 struct ContextNode {
320 // Assigned to nodes as they are created, useful for debugging.
321 unsigned NodeId = 0;
322
323 // Keep this for now since in the IR case where we have an Instruction* it
324 // is not as immediately discoverable. Used for printing richer information
325 // when dumping graph.
326 bool IsAllocation;
327
328 // Keeps track of when the Call was reset to null because there was
329 // recursion.
330 bool Recursive = false;
331
332 // This will be formed by ORing together the AllocationType enum values
333 // for contexts including this node.
334 uint8_t AllocTypes = 0;
335
336 // The corresponding allocation or interior call. This is the primary call
337 // for which we have created this node.
338 CallInfo Call;
339
340 // List of other calls that can be treated the same as the primary call
341 // through cloning. I.e. located in the same function and have the same
342 // (possibly pruned) stack ids. They will be updated the same way as the
343 // primary call when assigning to function clones.
344 SmallVector<CallInfo, 0> MatchingCalls;
345
346 // For alloc nodes this is a unique id assigned when constructed, and for
347 // callsite stack nodes it is the original stack id when the node is
348 // constructed from the memprof MIB metadata on the alloc nodes. Note that
349 // this is only used when matching callsite metadata onto the stack nodes
350 // created when processing the allocation memprof MIBs, and for labeling
351 // nodes in the dot graph. Therefore we don't bother to assign a value for
352 // clones.
353 uint64_t OrigStackOrAllocId = 0;
354
355 // Edges to all callees in the profiled call stacks.
356 // TODO: Should this be a map (from Callee node) for more efficient lookup?
357 std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;
358
359 // Edges to all callers in the profiled call stacks.
360 // TODO: Should this be a map (from Caller node) for more efficient lookup?
361 std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
362
363 // Returns true if we need to look at the callee edges for determining the
364 // node context ids and allocation type.
365 bool useCallerEdgesForContextInfo() const {
366 // Typically if the callee edges are empty either the caller edges are
367 // also empty, or this is an allocation (leaf node). However, if we are
368 // allowing recursive callsites and contexts this will be violated for
369 // incompletely cloned recursive cycles.
370 assert(!CalleeEdges.empty() || CallerEdges.empty() || IsAllocation ||
371 (AllowRecursiveCallsites && AllowRecursiveContexts));
372 // When cloning for a recursive context, during cloning we might be in the
373 // midst of cloning for a recurrence and have moved context ids off of a
374 // caller edge onto the clone but not yet off of the incoming caller
375 // (back) edge. If we don't look at those we miss the fact that this node
376 // still has context ids of interest.
377 return IsAllocation || CloneRecursiveContexts;
378 }
379
380 // Compute the context ids for this node from the union of its edge context
381 // ids.
382 DenseSet<uint32_t> getContextIds() const {
383 unsigned Count = 0;
384 // Compute the number of ids for reserve below. In general we only need to
385 // look at one set of edges, typically the callee edges, since other than
386 // allocations and in some cases during recursion cloning, all the context
387 // ids on the callers should also flow out via callee edges.
388 for (auto &Edge : CalleeEdges.empty() ? CallerEdges : CalleeEdges)
389 Count += Edge->getContextIds().size();
390 DenseSet<uint32_t> ContextIds;
391 ContextIds.reserve(Count);
392 auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
393 CalleeEdges, useCallerEdgesForContextInfo()
394 ? CallerEdges
395 : std::vector<std::shared_ptr<ContextEdge>>());
396 for (const auto &Edge : Edges)
397 ContextIds.insert_range(Edge->getContextIds());
398 return ContextIds;
399 }
400
401 // Compute the allocation type for this node from the OR of its edge
402 // allocation types.
403 uint8_t computeAllocType() const {
404 uint8_t BothTypes =
405 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
406 uint8_t AllocType = (uint8_t)AllocationType::None;
407 auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
408 CalleeEdges, useCallerEdgesForContextInfo()
409 ? CallerEdges
410 : std::vector<std::shared_ptr<ContextEdge>>());
411 for (const auto &Edge : Edges) {
412 AllocType |= Edge->AllocTypes;
413 // Bail early if alloc type reached both, no further refinement.
414 if (AllocType == BothTypes)
415 return AllocType;
416 }
417 return AllocType;
418 }
419
420 // The context ids set for this node is empty if its edge context ids are
421 // also all empty.
422 bool emptyContextIds() const {
423 auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
424 CalleeEdges, useCallerEdgesForContextInfo()
425 ? CallerEdges
426 : std::vector<std::shared_ptr<ContextEdge>>());
427 for (const auto &Edge : Edges) {
428 if (!Edge->getContextIds().empty())
429 return false;
430 }
431 return true;
432 }
433
434 // List of clones of this ContextNode, initially empty.
435 std::vector<ContextNode *> Clones;
436
437 // If a clone, points to the original uncloned node.
438 ContextNode *CloneOf = nullptr;
439
440 ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}
441
442 ContextNode(bool IsAllocation, CallInfo C)
443 : IsAllocation(IsAllocation), Call(C) {}
444
445 void addClone(ContextNode *Clone) {
446 if (CloneOf) {
447 CloneOf->Clones.push_back(Clone);
448 Clone->CloneOf = CloneOf;
449 } else {
450 Clones.push_back(Clone);
451 assert(!Clone->CloneOf);
452 Clone->CloneOf = this;
453 }
454 }
455
456 ContextNode *getOrigNode() {
457 if (!CloneOf)
458 return this;
459 return CloneOf;
460 }
461
462 void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
463 unsigned int ContextId);
464
465 ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
466 ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
467 void eraseCalleeEdge(const ContextEdge *Edge);
468 void eraseCallerEdge(const ContextEdge *Edge);
469
470 void setCall(CallInfo C) { Call = C; }
471
472 bool hasCall() const { return (bool)Call.call(); }
473
474 void printCall(raw_ostream &OS) const { Call.print(OS); }
475
476 // True if this node was effectively removed from the graph, in which case
477 // it should have an allocation type of None and empty context ids.
478 bool isRemoved() const {
479 // Typically if the callee edges are empty either the caller edges are
480 // also empty, or this is an allocation (leaf node). However, if we are
481 // allowing recursive callsites and contexts this will be violated for
482 // incompletely cloned recursive cycles.
483 assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
484 (AllocTypes == (uint8_t)AllocationType::None) ==
485 emptyContextIds());
486 return AllocTypes == (uint8_t)AllocationType::None;
487 }
488
489 void dump() const;
490 void print(raw_ostream &OS) const;
491
492 friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
493 Node.print(OS);
494 return OS;
495 }
496 };
497
498 /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
499 /// callee.
500 struct ContextEdge {
501 ContextNode *Callee;
502 ContextNode *Caller;
503
504 // This will be formed by ORing together the AllocationType enum values
505 // for contexts including this edge.
506 uint8_t AllocTypes = 0;
507
508 // Set just before initiating cloning when cloning of recursive contexts is
509 // enabled. Used to defer cloning of backedges until we have done cloning of
510 // the callee node for non-backedge caller edges. This exposes cloning
511 // opportunities through the backedge of the cycle.
512 // TODO: Note that this is not updated during cloning, and it is unclear
513 // whether that would be needed.
514 bool IsBackedge = false;
515
516 // The set of IDs for contexts including this edge.
517 DenseSet<uint32_t> ContextIds;
518
519 ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
520 DenseSet<uint32_t> ContextIds)
521 : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
522 ContextIds(std::move(ContextIds)) {}
523
524 DenseSet<uint32_t> &getContextIds() { return ContextIds; }
525
526 // Helper to clear the fields of this edge when we are removing it from the
527 // graph.
528 inline void clear() {
529 ContextIds.clear();
530 AllocTypes = (uint8_t)AllocationType::None;
531 Caller = nullptr;
532 Callee = nullptr;
533 }
534
535 // Check if edge was removed from the graph. This is useful while iterating
536 // over a copy of edge lists when performing operations that mutate the
537 // graph in ways that might remove one of the edges.
538 inline bool isRemoved() const {
539 if (Callee || Caller)
540 return false;
541 // Any edges that have been removed from the graph but are still in a
542 // shared_ptr somewhere should have all fields null'ed out by clear()
543 // above.
544 assert(AllocTypes == (uint8_t)AllocationType::None);
545 assert(ContextIds.empty());
546 return true;
547 }
548
549 void dump() const;
550 void print(raw_ostream &OS) const;
551
552 friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
553 Edge.print(OS);
554 return OS;
555 }
556 };
557
558 /// Helpers to remove edges that have allocation type None (due to not
559 /// carrying any context ids) after transformations.
560 void removeNoneTypeCalleeEdges(ContextNode *Node);
561 void removeNoneTypeCallerEdges(ContextNode *Node);
562 void
563 recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
564 DenseSet<const ContextNode *> &Visited);
565
566protected:
567 /// Get a list of nodes corresponding to the stack ids in the given callsite
568 /// context.
569 template <class NodeT, class IteratorT>
570 std::vector<uint64_t>
571 getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);
572
573 /// Adds nodes for the given allocation and any stack ids on its memprof MIB
574 /// metadata (or summary).
575 ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);
576
577 /// Adds nodes for the given MIB stack ids.
578 template <class NodeT, class IteratorT>
579 void addStackNodesForMIB(ContextNode *AllocNode,
580 CallStack<NodeT, IteratorT> &StackContext,
581 CallStack<NodeT, IteratorT> &CallsiteContext,
582 AllocationType AllocType,
583 ArrayRef<ContextTotalSize> ContextSizeInfo);
584
585 /// Matches all callsite metadata (or summary) to the nodes created for
586 /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
587 /// inlining performed on those callsite instructions.
588 void updateStackNodes();
589
590 /// Update graph to conservatively handle any callsite stack nodes that target
591 /// multiple different callee target functions.
592 void handleCallsitesWithMultipleTargets();
593
594 /// Mark backedges via the standard DFS based backedge algorithm.
595 void markBackedges();
596
597 /// Merge clones generated during cloning for different allocations but that
598 /// are called by the same caller node, to ensure proper function assignment.
599 void mergeClones();
600
601 // Try to partition calls on the given node (already placed into the AllCalls
602 // array) by callee function, creating new copies of Node as needed to hold
603 // calls with different callees, and moving the callee edges appropriately.
604 // Returns true if partitioning was successful.
605 bool partitionCallsByCallee(
606 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
607 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode);
608
609 /// Save lists of calls with MemProf metadata in each function, for faster
610 /// iteration.
611 MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
612
613 /// Map from callsite node to the enclosing caller function.
614 std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
615
616 // When exporting to dot, and an allocation id is specified, contains the
617 // context ids on that allocation.
618 DenseSet<uint32_t> DotAllocContextIds;
619
620private:
621 using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;
622
623 // Structure to keep track of information for each call as we are matching
624 // non-allocation callsites onto context nodes created from the allocation
625 // call metadata / summary contexts.
626 struct CallContextInfo {
627 // The callsite we're trying to match.
628 CallTy Call;
629 // The callsites stack ids that have a context node in the graph.
630 std::vector<uint64_t> StackIds;
631 // The function containing this callsite.
632 const FuncTy *Func;
633 // Initially empty, if needed this will be updated to contain the context
634 // ids for use in a new context node created for this callsite.
635 DenseSet<uint32_t> ContextIds;
636 };
637
638 /// Helper to remove edge from graph, updating edge iterator if it is provided
639 /// (in which case CalleeIter indicates which edge list is being iterated).
640 /// This will also perform the necessary clearing of the ContextEdge members
641 /// to enable later checking if the edge has been removed (since we may have
642 /// other copies of the shared_ptr in existence, and in fact rely on this to
643 /// enable removal while iterating over a copy of a node's edge list).
644 void removeEdgeFromGraph(ContextEdge *Edge, EdgeIter *EI = nullptr,
645 bool CalleeIter = true);
646
647 /// Assigns the given Node to calls at or inlined into the location with
648 /// the Node's stack id, after post order traversing and processing its
649 /// caller nodes. Uses the call information recorded in the given
650 /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
651 /// as needed. Called by updateStackNodes which sets up the given
652 /// StackIdToMatchingCalls map.
653 void assignStackNodesPostOrder(
654 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
655 DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls,
656 DenseMap<CallInfo, CallInfo> &CallToMatchingCall);
657
658 /// Duplicates the given set of context ids, updating the provided
659 /// map from each original id with the newly generated context ids,
660 /// and returning the new duplicated id set.
661 DenseSet<uint32_t> duplicateContextIds(
662 const DenseSet<uint32_t> &StackSequenceContextIds,
663 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
664
665 /// Propagates all duplicated context ids across the graph.
666 void propagateDuplicateContextIds(
667 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
668
669 /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
670 /// else to its callers. Also updates OrigNode's edges to remove any context
671 /// ids moved to the newly created edge.
672 void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
673 bool TowardsCallee,
674 DenseSet<uint32_t> RemainingContextIds);
675
676 /// Get the stack id corresponding to the given Id or Index (for IR this will
677 /// return itself, for a summary index this will return the id recorded in the
678 /// index for that stack id index value).
679 uint64_t getStackId(uint64_t IdOrIndex) const {
680 return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
681 }
682
683 /// Returns true if the given call targets the callee of the given edge, or if
684 /// we were able to identify the call chain through intermediate tail calls.
685 /// In the latter case new context nodes are added to the graph for the
686 /// identified tail calls, and their synthesized nodes are added to
687 /// TailCallToContextNodeMap. The EdgeIter is updated in the latter case for
688 /// the updated edges and to prepare it for an increment in the caller.
689 bool
690 calleesMatch(CallTy Call, EdgeIter &EI,
691 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);
692
693 // Return the callee function of the given call, or nullptr if it can't be
694 // determined
695 const FuncTy *getCalleeFunc(CallTy Call) {
696 return static_cast<DerivedCCG *>(this)->getCalleeFunc(Call);
697 }
698
699 /// Returns true if the given call targets the given function, or if we were
700 /// able to identify the call chain through intermediate tail calls (in which
701 /// case FoundCalleeChain will be populated).
702 bool calleeMatchesFunc(
703 CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
704 std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
705 return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
706 Call, Func, CallerFunc, FoundCalleeChain);
707 }
708
709 /// Returns true if both call instructions have the same callee.
710 bool sameCallee(CallTy Call1, CallTy Call2) {
711 return static_cast<DerivedCCG *>(this)->sameCallee(Call1, Call2);
712 }
713
714 /// Get a list of nodes corresponding to the stack ids in the given
715 /// callsite's context.
716 std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
717 return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
718 Call);
719 }
720
721 /// Get the last stack id in the context for callsite.
722 uint64_t getLastStackId(CallTy Call) {
723 return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
724 }
725
726 /// Update the allocation call to record type of allocated memory.
727 void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
728 AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
729 static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
730 }
731
732 /// Get the AllocationType assigned to the given allocation instruction clone.
733 AllocationType getAllocationCallType(const CallInfo &Call) const {
734 return static_cast<const DerivedCCG *>(this)->getAllocationCallType(Call);
735 }
736
737 /// Update non-allocation call to invoke (possibly cloned) function
738 /// CalleeFunc.
739 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
740 static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
741 }
742
743 /// Clone the given function for the given callsite, recording the mapping of
744 /// all of the function's tracked calls to their new versions in the CallMap.
745 /// Assigns new clones to clone number CloneNo.
746 FuncInfo cloneFunctionForCallsite(
747 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
748 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
749 return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
750 Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
751 }
752
753 /// Gets a label to use in the dot graph for the given call clone in the given
754 /// function.
755 std::string getLabel(const FuncTy *Func, const CallTy Call,
756 unsigned CloneNo) const {
757 return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
758 }
759
760 // Create and return a new ContextNode.
761 ContextNode *createNewNode(bool IsAllocation, const FuncTy *F = nullptr,
762 CallInfo C = CallInfo()) {
763 NodeOwner.push_back(std::make_unique<ContextNode>(IsAllocation, C));
764 auto *NewNode = NodeOwner.back().get();
765 if (F)
766 NodeToCallingFunc[NewNode] = F;
767 NewNode->NodeId = NodeOwner.size();
768 return NewNode;
769 }
770
771 /// Helpers to find the node corresponding to the given call or stackid.
772 ContextNode *getNodeForInst(const CallInfo &C);
773 ContextNode *getNodeForAlloc(const CallInfo &C);
774 ContextNode *getNodeForStackId(uint64_t StackId);
775
776 /// Computes the alloc type corresponding to the given context ids, by
777 /// unioning their recorded alloc types.
778 uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds) const;
779
780 /// Returns the allocation type of the intersection of the contexts of two
781 /// nodes (based on their provided context id sets), optimized for the case
782 /// when Node1Ids is smaller than Node2Ids.
783 uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
784 const DenseSet<uint32_t> &Node2Ids) const;
785
786 /// Returns the allocation type of the intersection of the contexts of two
787 /// nodes (based on their provided context id sets).
788 uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
789 const DenseSet<uint32_t> &Node2Ids) const;
790
791 /// Create a clone of Edge's callee and move Edge to that new callee node,
792 /// performing the necessary context id and allocation type updates.
793 /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
794 /// moved to an edge to the new callee.
795 ContextNode *
796 moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
797 DenseSet<uint32_t> ContextIdsToMove = {});
798
799 /// Change the callee of Edge to existing callee clone NewCallee, performing
800 /// the necessary context id and allocation type updates.
801 /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
802 /// moved to an edge to the new callee.
803 void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
804 ContextNode *NewCallee,
805 bool NewClone = false,
806 DenseSet<uint32_t> ContextIdsToMove = {});
807
808 /// Change the caller of the edge at the given callee edge iterator to be
809 /// NewCaller, performing the necessary context id and allocation type
810 /// updates. This is similar to the above moveEdgeToExistingCalleeClone, but
811 /// a simplified version of it as we always move the given edge and all of its
812 /// context ids.
813 void moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
814 ContextNode *NewCaller);
815
816 /// Recursive helper for marking backedges via DFS.
817 void markBackedges(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
818 DenseSet<const ContextNode *> &CurrentStack);
819
820 /// Recursive helper for merging clones.
821 void
822 mergeClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
823 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode);
824 /// Main worker for merging callee clones for a given node.
825 void mergeNodeCalleeClones(
826 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
827 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode);
828 /// Helper to find other callers of the given set of callee edges that can
829 /// share the same callee merge node.
830 void findOtherCallersToShareMerge(
831 ContextNode *Node, std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges,
832 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode,
833 DenseSet<ContextNode *> &OtherCallersToShareMerge);
834
835 /// Recursively perform cloning on the graph for the given Node and its
836 /// callers, in order to uniquely identify the allocation behavior of an
837 /// allocation given its context. The context ids of the allocation being
838 /// processed are given in AllocContextIds.
839 void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
840 const DenseSet<uint32_t> &AllocContextIds);
841
842 /// Map from each context ID to the AllocationType assigned to that context.
843 DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;
844
845 /// Map from each contextID to the profiled full contexts and their total
846 /// sizes (there may be more than one due to context trimming),
847 /// optionally populated when requested (via MemProfReportHintedSizes or
848 /// MinClonedColdBytePercent).
849 DenseMap<uint32_t, std::vector<ContextTotalSize>> ContextIdToContextSizeInfos;
850
851 /// Identifies the context node created for a stack id when adding the MIB
852 /// contexts to the graph. This is used to locate the context nodes when
853 /// trying to assign the corresponding callsites with those stack ids to these
854 /// nodes.
855 DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;
856
857 /// Maps to track the calls to their corresponding nodes in the graph.
858 MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
859 MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;
860
861 /// Owner of all ContextNode unique_ptrs.
862 std::vector<std::unique_ptr<ContextNode>> NodeOwner;
863
864 /// Perform sanity checks on graph when requested.
865 void check() const;
866
867 /// Keeps track of the last unique context id assigned.
868 unsigned int LastContextId = 0;
869};
870
871template <typename DerivedCCG, typename FuncTy, typename CallTy>
872using ContextNode =
873 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
874template <typename DerivedCCG, typename FuncTy, typename CallTy>
875using ContextEdge =
876 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
877template <typename DerivedCCG, typename FuncTy, typename CallTy>
878using FuncInfo =
879 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
880template <typename DerivedCCG, typename FuncTy, typename CallTy>
881using CallInfo =
882 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;
883
884/// CRTP derived class for graphs built from IR (regular LTO).
885class ModuleCallsiteContextGraph
886 : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
887 Instruction *> {
888public:
889 ModuleCallsiteContextGraph(
890 Module &M,
891 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
892
893private:
894 friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
895 Instruction *>;
896
897 uint64_t getStackId(uint64_t IdOrIndex) const;
898 const Function *getCalleeFunc(Instruction *Call);
899 bool calleeMatchesFunc(
900 Instruction *Call, const Function *Func, const Function *CallerFunc,
901 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
902 bool sameCallee(Instruction *Call1, Instruction *Call2);
903 bool findProfiledCalleeThroughTailCalls(
904 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
905 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
906 bool &FoundMultipleCalleeChains);
907 uint64_t getLastStackId(Instruction *Call);
908 std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
909 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
910 AllocationType getAllocationCallType(const CallInfo &Call) const;
911 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
912 CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
913 Instruction *>::FuncInfo
914 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
915 DenseMap<CallInfo, CallInfo> &CallMap,
916 std::vector<CallInfo> &CallsWithMetadataInFunc,
917 unsigned CloneNo);
918 std::string getLabel(const Function *Func, const Instruction *Call,
919 unsigned CloneNo) const;
920
921 const Module &Mod;
922 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
923};
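// A minimal sketch (simplified names, not the actual classes above) of the
// CRTP dispatch pattern used by CallsiteContextGraph: the base class forwards
// each format-specific query to the derived graph via static_cast, so the
// same cloning logic runs over either IR or summary index data.
//
//   template <typename Derived> struct GraphBase {
//     uint64_t stackId(uint64_t IdOrIndex) const {
//       return static_cast<const Derived *>(this)->getStackId(IdOrIndex);
//     }
//   };
//   struct IRGraph : GraphBase<IRGraph> {
//     // For IR the value already is the full stack id, so this is identity.
//     uint64_t getStackId(uint64_t IdOrIndex) const { return IdOrIndex; }
//   };
//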
924
925/// Represents a call in the summary index graph, which can either be an
926/// allocation or an interior callsite node in an allocation's context.
927/// Holds a pointer to the corresponding data structure in the index.
928struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
929 IndexCall() : PointerUnion() {}
930 IndexCall(std::nullptr_t) : IndexCall() {}
931 IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
932 IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
933 IndexCall(PointerUnion PT) : PointerUnion(PT) {}
934
935 IndexCall *operator->() { return this; }
936
937 void print(raw_ostream &OS) const {
938 PointerUnion<CallsiteInfo *, AllocInfo *> Base = *this;
939 if (auto *AI = dyn_cast<AllocInfo *>(Base)) {
940 OS << *AI;
941 } else {
942 auto *CI = dyn_cast<CallsiteInfo *>(Base);
943 assert(CI);
944 OS << *CI;
945 }
946 }
947};
948} // namespace
949
950namespace llvm {
951template <> struct simplify_type<IndexCall> {
952 using SimpleType = PointerUnion<CallsiteInfo *, AllocInfo *>;
953 static SimpleType getSimplifiedValue(IndexCall &Val) { return Val; }
954};
955template <> struct simplify_type<const IndexCall> {
956 using SimpleType = const PointerUnion<CallsiteInfo *, AllocInfo *>;
957 static SimpleType getSimplifiedValue(const IndexCall &Val) { return Val; }
958};
959} // namespace llvm
960
961namespace {
962/// CRTP derived class for graphs built from summary index (ThinLTO).
963class IndexCallsiteContextGraph
964 : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
965 IndexCall> {
966public:
967 IndexCallsiteContextGraph(
968 ModuleSummaryIndex &Index,
969 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
970 isPrevailing);
971
972 ~IndexCallsiteContextGraph() {
973 // Now that we are done with the graph it is safe to add the new
974 // CallsiteInfo structs to the function summary vectors. The graph nodes
975 // point into locations within these vectors, so we don't want to add them
976 // any earlier.
977 for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
978 auto *FS = I.first;
979 for (auto &Callsite : I.second)
980 FS->addCallsite(*Callsite.second);
981 }
982 }
983
984private:
985 friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
986 IndexCall>;
987
988 uint64_t getStackId(uint64_t IdOrIndex) const;
989 const FunctionSummary *getCalleeFunc(IndexCall &Call);
990 bool calleeMatchesFunc(
991 IndexCall &Call, const FunctionSummary *Func,
992 const FunctionSummary *CallerFunc,
993 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
994 bool sameCallee(IndexCall &Call1, IndexCall &Call2);
995 bool findProfiledCalleeThroughTailCalls(
996 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
997 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
998 bool &FoundMultipleCalleeChains);
999 uint64_t getLastStackId(IndexCall &Call);
1000 std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
1001 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
1002 AllocationType getAllocationCallType(const CallInfo &Call) const;
1003 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
1004 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
1005 IndexCall>::FuncInfo
1006 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
1007 DenseMap<CallInfo, CallInfo> &CallMap,
1008 std::vector<CallInfo> &CallsWithMetadataInFunc,
1009 unsigned CloneNo);
1010 std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
1011 unsigned CloneNo) const;
1012
1013 // Saves the mapping from function summaries containing memprof records back
1014 // to their VIs, for use in checking and debugging.
1015 std::map<const FunctionSummary *, ValueInfo> FSToVIMap;
1016
1017 const ModuleSummaryIndex &Index;
1018 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
1019 isPrevailing;
1020
1021 // Saves/owns the callsite info structures synthesized for missing tail call
1022 // frames that we discover while building the graph.
1023 // It maps from the summary of the function making the tail call, to a map
1024 // of callee ValueInfo to corresponding synthesized callsite info.
1025 std::unordered_map<FunctionSummary *,
1026 std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
1027 FunctionCalleesToSynthesizedCallsiteInfos;
1028};
1029} // namespace
1030
1031namespace llvm {
1032template <>
1033struct DenseMapInfo<typename CallsiteContextGraph<
1034 ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
1035 : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
1036template <>
1037struct DenseMapInfo<typename CallsiteContextGraph<
1038 IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
1039 : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
1040template <>
1041struct DenseMapInfo<IndexCall>
1042 : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
1043} // end namespace llvm
1044
1045namespace {
1046
1047// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
1048// type we should actually use on the corresponding allocation.
1049// If we can't clone a node that has NotCold+Cold alloc type, we will fall
1050// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
1051// from NotCold.
1052AllocationType allocTypeToUse(uint8_t AllocTypes) {
1053 assert(AllocTypes != (uint8_t)AllocationType::None);
1054 if (AllocTypes ==
1055 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
1056 return AllocationType::NotCold;
1057 else
1058 return (AllocationType)AllocTypes;
1059}
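// For example, assuming the usual AllocationType bit encoding (NotCold and
// Cold as distinct bits), allocTypeToUse((uint8_t)AllocationType::NotCold |
// (uint8_t)AllocationType::Cold) yields AllocationType::NotCold, while a
// purely cold value is returned unchanged as AllocationType::Cold.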
1060
1061// Helper to check if the alloc types for all edges recorded in the
1062// InAllocTypes vector match the alloc types for all edges in the Edges
1063// vector.
1064template <typename DerivedCCG, typename FuncTy, typename CallTy>
1065bool allocTypesMatch(
1066 const std::vector<uint8_t> &InAllocTypes,
1067 const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
1068 &Edges) {
1069 // This should be called only when the InAllocTypes vector was computed for
1070 // this set of Edges. Make sure the sizes are the same.
1071 assert(InAllocTypes.size() == Edges.size());
1072 return std::equal(
1073 InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(), Edges.end(),
1074 [](const uint8_t &l,
1075 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
1076 // Can share if one of the edges is None type - don't
1077 // care about the type along that edge as it doesn't
1078 // exist for those context ids.
1079 if (l == (uint8_t)AllocationType::None ||
1080 r->AllocTypes == (uint8_t)AllocationType::None)
1081 return true;
1082 return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
1083 });
1084}
1085
1086// Helper to check if the alloc types for all edges recorded in the
1087// InAllocTypes vector match the alloc types for callee edges in the given
1088// clone. Because the InAllocTypes were computed from the original node's callee
1089// edges, and other cloning could have happened after this clone was created, we
1090// need to find the matching clone callee edge, which may or may not exist.
1091template <typename DerivedCCG, typename FuncTy, typename CallTy>
1092bool allocTypesMatchClone(
1093 const std::vector<uint8_t> &InAllocTypes,
1094 const ContextNode<DerivedCCG, FuncTy, CallTy> *Clone) {
1095 const ContextNode<DerivedCCG, FuncTy, CallTy> *Node = Clone->CloneOf;
1096 assert(Node);
1097 // InAllocTypes should have been computed for the original node's callee
1098 // edges.
1099 assert(InAllocTypes.size() == Node->CalleeEdges.size());
1100 // First create a map of the clone callee edge callees to the edge alloc type.
1101 DenseMap<const ContextNode<DerivedCCG, FuncTy, CallTy> *, uint8_t>
1102 EdgeCalleeMap;
1103 for (const auto &E : Clone->CalleeEdges) {
1104 assert(!EdgeCalleeMap.contains(E->Callee));
1105 EdgeCalleeMap[E->Callee] = E->AllocTypes;
1106 }
1107 // Next, walk the original node's callees, and look for the corresponding
1108 // clone edge to that callee.
1109 for (unsigned I = 0; I < Node->CalleeEdges.size(); I++) {
1110 auto Iter = EdgeCalleeMap.find(Node->CalleeEdges[I]->Callee);
1111 // Not found is ok, we will simply add an edge if we use this clone.
1112 if (Iter == EdgeCalleeMap.end())
1113 continue;
1114 // Can share if one of the edges is None type - don't
1115 // care about the type along that edge as it doesn't
1116 // exist for those context ids.
1117 if (InAllocTypes[I] == (uint8_t)AllocationType::None ||
1118 Iter->second == (uint8_t)AllocationType::None)
1119 continue;
1120 if (allocTypeToUse(Iter->second) != allocTypeToUse(InAllocTypes[I]))
1121 return false;
1122 }
1123 return true;
1124}
1125
1126} // end anonymous namespace
1127
1128template <typename DerivedCCG, typename FuncTy, typename CallTy>
1129typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1130CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
1131 const CallInfo &C) {
1132 ContextNode *Node = getNodeForAlloc(C);
1133 if (Node)
1134 return Node;
1135
1136 return NonAllocationCallToContextNodeMap.lookup(C);
1137}
1138
1139template <typename DerivedCCG, typename FuncTy, typename CallTy>
1140typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1141CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
1142 const CallInfo &C) {
1143 return AllocationCallToContextNodeMap.lookup(C);
1144}
1145
1146template <typename DerivedCCG, typename FuncTy, typename CallTy>
1147typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1148CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
1149 uint64_t StackId) {
1150 auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
1151 if (StackEntryNode != StackEntryIdToContextNodeMap.end())
1152 return StackEntryNode->second;
1153 return nullptr;
1154}
1155
1156template <typename DerivedCCG, typename FuncTy, typename CallTy>
1157void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1158 addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
1159 unsigned int ContextId) {
1160 for (auto &Edge : CallerEdges) {
1161 if (Edge->Caller == Caller) {
1162 Edge->AllocTypes |= (uint8_t)AllocType;
1163 Edge->getContextIds().insert(ContextId);
1164 return;
1165 }
1166 }
1167 std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
1168 this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
1169 CallerEdges.push_back(Edge);
1170 Caller->CalleeEdges.push_back(Edge);
1171}
1172
1173template <typename DerivedCCG, typename FuncTy, typename CallTy>
1174void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::removeEdgeFromGraph(
1175 ContextEdge *Edge, EdgeIter *EI, bool CalleeIter) {
1176 assert(!EI || (*EI)->get() == Edge);
1177 assert(!Edge->isRemoved());
1178 // Save the Caller and Callee pointers so we can erase Edge from their edge
1179 // lists after clearing Edge below. We do the clearing first in case it is
1180 // destructed after removing from the edge lists (if those were the last
1181 // shared_ptr references to Edge).
1182 auto *Callee = Edge->Callee;
1183 auto *Caller = Edge->Caller;
1184
1185 // Make sure the edge fields are cleared out so we can properly detect
1186 // removed edges if Edge is not destructed because there is still a shared_ptr
1187 // reference.
1188 Edge->clear();
1189
1190#ifndef NDEBUG
1191 auto CalleeCallerCount = Callee->CallerEdges.size();
1192 auto CallerCalleeCount = Caller->CalleeEdges.size();
1193#endif
1194 if (!EI) {
1195 Callee->eraseCallerEdge(Edge);
1196 Caller->eraseCalleeEdge(Edge);
1197 } else if (CalleeIter) {
1198 Callee->eraseCallerEdge(Edge);
1199 *EI = Caller->CalleeEdges.erase(*EI);
1200 } else {
1201 Caller->eraseCalleeEdge(Edge);
1202 *EI = Callee->CallerEdges.erase(*EI);
1203 }
1204 assert(Callee->CallerEdges.size() < CalleeCallerCount);
1205 assert(Caller->CalleeEdges.size() < CallerCalleeCount);
1206}
1207
1208template <typename DerivedCCG, typename FuncTy, typename CallTy>
1209void CallsiteContextGraph<
1210 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
1211 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
1212 auto Edge = *EI;
1213 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1214 assert(Edge->ContextIds.empty());
1215 removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
1216 } else
1217 ++EI;
1218 }
1219}
1220
1221template <typename DerivedCCG, typename FuncTy, typename CallTy>
1222void CallsiteContextGraph<
1223 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCallerEdges(ContextNode *Node) {
1224 for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
1225 auto Edge = *EI;
1226 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1227 assert(Edge->ContextIds.empty());
1228 Edge->Caller->eraseCalleeEdge(Edge.get());
1229 EI = Node->CallerEdges.erase(EI);
1230 } else
1231 ++EI;
1232 }
1233}
1234
1235template <typename DerivedCCG, typename FuncTy, typename CallTy>
1236typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1237CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1238 findEdgeFromCallee(const ContextNode *Callee) {
1239 for (const auto &Edge : CalleeEdges)
1240 if (Edge->Callee == Callee)
1241 return Edge.get();
1242 return nullptr;
1243}
1244
1245template <typename DerivedCCG, typename FuncTy, typename CallTy>
1246typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1247CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1248 findEdgeFromCaller(const ContextNode *Caller) {
1249 for (const auto &Edge : CallerEdges)
1250 if (Edge->Caller == Caller)
1251 return Edge.get();
1252 return nullptr;
1253}
1254
1255template <typename DerivedCCG, typename FuncTy, typename CallTy>
1256void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1257 eraseCalleeEdge(const ContextEdge *Edge) {
1258 auto EI = llvm::find_if(
1259 CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
1260 return CalleeEdge.get() == Edge;
1261 });
1262 assert(EI != CalleeEdges.end());
1263 CalleeEdges.erase(EI);
1264}
1265
1266template <typename DerivedCCG, typename FuncTy, typename CallTy>
1267void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1268 eraseCallerEdge(const ContextEdge *Edge) {
1269 auto EI = llvm::find_if(
1270 CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
1271 return CallerEdge.get() == Edge;
1272 });
1273 assert(EI != CallerEdges.end());
1274 CallerEdges.erase(EI);
1275}
1276
1277template <typename DerivedCCG, typename FuncTy, typename CallTy>
1278uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
1279 DenseSet<uint32_t> &ContextIds) const {
1280 uint8_t BothTypes =
1281 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1282 uint8_t AllocType = (uint8_t)AllocationType::None;
1283 for (auto Id : ContextIds) {
1284 AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
1285 // Bail early if alloc type reached both, no further refinement.
1286 if (AllocType == BothTypes)
1287 return AllocType;
1288 }
1289 return AllocType;
1290}
1291
1292template <typename DerivedCCG, typename FuncTy, typename CallTy>
1293uint8_t
1294CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
1295 const DenseSet<uint32_t> &Node1Ids,
1296 const DenseSet<uint32_t> &Node2Ids) const {
1297 uint8_t BothTypes =
1298 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1299 uint8_t AllocType = (uint8_t)AllocationType::None;
1300 for (auto Id : Node1Ids) {
1301 if (!Node2Ids.count(Id))
1302 continue;
1303 AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
1304 // Bail early if alloc type reached both, no further refinement.
1305 if (AllocType == BothTypes)
1306 return AllocType;
1307 }
1308 return AllocType;
1309}
1310
1311template <typename DerivedCCG, typename FuncTy, typename CallTy>
1312uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
1313 const DenseSet<uint32_t> &Node1Ids,
1314 const DenseSet<uint32_t> &Node2Ids) const {
1315 if (Node1Ids.size() < Node2Ids.size())
1316 return intersectAllocTypesImpl(Node1Ids, Node2Ids);
1317 else
1318 return intersectAllocTypesImpl(Node2Ids, Node1Ids);
1319}
1320
1321template <typename DerivedCCG, typename FuncTy, typename CallTy>
1322typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1323CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
1324 CallInfo Call, const FuncTy *F) {
1325 assert(!getNodeForAlloc(Call));
1326 ContextNode *AllocNode = createNewNode(/*IsAllocation=*/true, F, Call);
1327 AllocationCallToContextNodeMap[Call] = AllocNode;
1328 // Use LastContextId as a unique id for MIB allocation nodes.
1329 AllocNode->OrigStackOrAllocId = LastContextId;
1330 // Alloc type should be updated as we add in the MIBs. We should assert
1331 // afterwards that it is not still None.
1332 AllocNode->AllocTypes = (uint8_t)AllocationType::None;
1333
1334 return AllocNode;
1335}
1336
1337static std::string getAllocTypeString(uint8_t AllocTypes) {
1338 if (!AllocTypes)
1339 return "None";
1340 std::string Str;
1341 if (AllocTypes & (uint8_t)AllocationType::NotCold)
1342 Str += "NotCold";
1343 if (AllocTypes & (uint8_t)AllocationType::Cold)
1344 Str += "Cold";
1345 return Str;
1346}
1347
1348template <typename DerivedCCG, typename FuncTy, typename CallTy>
1349template <class NodeT, class IteratorT>
1350void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
1351 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
1352 CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
1353 ArrayRef<ContextTotalSize> ContextSizeInfo) {
1354 // Treat the Hot alloc type as NotCold until the disambiguation for "hot"
1355 // is implemented.
1356 if (AllocType == AllocationType::Hot)
1357 AllocType = AllocationType::NotCold;
1358
1359 ContextIdToAllocationType[++LastContextId] = AllocType;
1360
1361 if (!ContextSizeInfo.empty()) {
1362 auto &Entry = ContextIdToContextSizeInfos[LastContextId];
1363 Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end());
1364 }
1365
1366 // Update alloc type and context ids for this MIB.
1367 AllocNode->AllocTypes |= (uint8_t)AllocType;
1368
1369 // Now add or update nodes for each stack id in alloc's context.
1370 // Later when processing the stack ids on non-alloc callsites we will adjust
1371 // for any inlining in the context.
1372 ContextNode *PrevNode = AllocNode;
1373 // Look for recursion (direct recursion should have been collapsed by
1374 // module summary analysis; here we should just be detecting mutual
1375 // recursion). Mark these nodes so we don't try to clone.
1376 SmallSet<uint64_t, 8> StackIdSet;
1377 // Skip any stack ids already on the allocation call (due to inlining).
1378 for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
1379 ContextIter != StackContext.end(); ++ContextIter) {
1380 auto StackId = getStackId(*ContextIter);
1381 ContextNode *StackNode = getNodeForStackId(StackId);
1382 if (!StackNode) {
1383 StackNode = createNewNode(/*IsAllocation=*/false);
1384 StackEntryIdToContextNodeMap[StackId] = StackNode;
1385 StackNode->OrigStackOrAllocId = StackId;
1386 }
1387 // Marking a node recursive will prevent its cloning completely, even for
1388 // non-recursive contexts flowing through it.
1389 if (!AllowRecursiveCallsites) {
1390 auto Ins = StackIdSet.insert(StackId);
1391 if (!Ins.second)
1392 StackNode->Recursive = true;
1393 }
1394 StackNode->AllocTypes |= (uint8_t)AllocType;
1395 PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
1396 PrevNode = StackNode;
1397 }
1398}
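// Example (illustrative, not from the original source): for a cold MIB whose
// context beyond the allocation call has stack ids S1 (closest to the alloc)
// and S2, the loop above creates or reuses stack nodes N(S1) and N(S2) and
// records the new context id on the caller-edge chain
//   AllocNode <- N(S1) <- N(S2)
// with AllocationType::Cold ORed into each node's and edge's AllocTypes.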
1399
1400template <typename DerivedCCG, typename FuncTy, typename CallTy>
1401DenseSet<uint32_t>
1402CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
1403 const DenseSet<uint32_t> &StackSequenceContextIds,
1404 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1405 DenseSet<uint32_t> NewContextIds;
1406 for (auto OldId : StackSequenceContextIds) {
1407 NewContextIds.insert(++LastContextId);
1408 OldToNewContextIds[OldId].insert(LastContextId);
1409 assert(ContextIdToAllocationType.count(OldId));
1410 // The new context has the same allocation type as original.
1411 ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
1412 if (DotAllocContextIds.contains(OldId))
1413 DotAllocContextIds.insert(LastContextId);
1414 }
1415 return NewContextIds;
1416}
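// Example (illustrative, not from the original source): if context id 5 is
// cold and appears in StackSequenceContextIds, a fresh id (the next
// LastContextId, say 12) is created with the same Cold type, and
// OldToNewContextIds[5] gains 12 so the duplicate can later be propagated
// along the callers of the original context.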
1417
1418template <typename DerivedCCG, typename FuncTy, typename CallTy>
1419void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1420 propagateDuplicateContextIds(
1421 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1422 // Build a set of duplicated context ids corresponding to the input id set.
1423 auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
1424 DenseSet<uint32_t> NewIds;
1425 for (auto Id : ContextIds)
1426 if (auto NewId = OldToNewContextIds.find(Id);
1427 NewId != OldToNewContextIds.end())
1428 NewIds.insert_range(NewId->second);
1429 return NewIds;
1430 };
1431
1432 // Recursively update context ids sets along caller edges.
1433 auto UpdateCallers = [&](ContextNode *Node,
1434 DenseSet<const ContextEdge *> &Visited,
1435 auto &&UpdateCallers) -> void {
1436 for (const auto &Edge : Node->CallerEdges) {
1437 auto Inserted = Visited.insert(Edge.get());
1438 if (!Inserted.second)
1439 continue;
1440 ContextNode *NextNode = Edge->Caller;
1441 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
1442 // Only need to recursively iterate to NextNode via this caller edge if
1443 // it resulted in any added ids to NextNode.
1444 if (!NewIdsToAdd.empty()) {
1445 Edge->getContextIds().insert_range(NewIdsToAdd);
1446 UpdateCallers(NextNode, Visited, UpdateCallers);
1447 }
1448 }
1449 };
1450
1451 DenseSet<const ContextEdge *> Visited;
1452 for (auto &Entry : AllocationCallToContextNodeMap) {
1453 auto *Node = Entry.second;
1454 UpdateCallers(Node, Visited, UpdateCallers);
1455 }
1456}
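// Illustrative sketch (not from the original source): after duplication, each
// allocation node is walked towards its callers; any caller edge whose context
// ids include an old id from OldToNewContextIds also receives the
// corresponding new ids, and the walk only continues past edges that actually
// gained ids.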
1457
1458template <typename DerivedCCG, typename FuncTy, typename CallTy>
1459void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
1460 ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee,
1461 // This must be passed by value to make a copy since it will be adjusted
1462 // as ids are moved.
1463 DenseSet<uint32_t> RemainingContextIds) {
1464 auto &OrigEdges =
1465 TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
1466 DenseSet<uint32_t> RecursiveContextIds;
1467 DenseSet<uint32_t> AllCallerContextIds;
1468 if (AllowRecursiveCallsites) {
1469 // Identify which context ids are recursive, which is needed to properly
1470 // update the RemainingContextIds set. The relevant recursive context ids
1471 // are those that are in multiple edges.
1472 for (auto &CE : OrigEdges) {
1473 AllCallerContextIds.reserve(CE->getContextIds().size());
1474 for (auto Id : CE->getContextIds())
1475 if (!AllCallerContextIds.insert(Id).second)
1476 RecursiveContextIds.insert(Id);
1477 }
1478 }
1479 // Increment iterator in loop so that we can remove edges as needed.
1480 for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
1481 auto Edge = *EI;
1482 DenseSet<uint32_t> NewEdgeContextIds;
1483 DenseSet<uint32_t> NotFoundContextIds;
1484 // Remove any matching context ids from Edge and return the set that were
1485 // found and removed; these become the new edge's context ids. Also update
1486 // the remaining (not found) ids.
1487 set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
1488 NotFoundContextIds);
1489 // Update the remaining context ids set for the later edges. This is a
1490 // compile time optimization.
1491 if (RecursiveContextIds.empty()) {
1492 // No recursive ids, so all of the previously remaining context ids that
1493 // were not seen on this edge are the new remaining set.
1494 RemainingContextIds.swap(NotFoundContextIds);
1495 } else {
1496 // Keep the recursive ids in the remaining set as we expect to see those
1497 // on another edge. We can remove the non-recursive remaining ids that
1498 // were seen on this edge, however. We already have the set of remaining
1499 // ids that were on this edge (in NewEdgeContextIds). Figure out which are
1500 // non-recursive and only remove those. Note that despite the higher
1501 // overhead of updating the remaining context ids set when recursion
1502 // handling is enabled, it was found to be at worst performance neutral
1503 // and in one case a clear win.
1504 DenseSet<uint32_t> NonRecursiveRemainingCurEdgeIds =
1505 set_difference(NewEdgeContextIds, RecursiveContextIds);
1506 set_subtract(RemainingContextIds, NonRecursiveRemainingCurEdgeIds);
1507 }
1508 // If no matching context ids for this edge, skip it.
1509 if (NewEdgeContextIds.empty()) {
1510 ++EI;
1511 continue;
1512 }
1513 if (TowardsCallee) {
1514 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1515 auto NewEdge = std::make_shared<ContextEdge>(
1516 Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds));
1517 NewNode->CalleeEdges.push_back(NewEdge);
1518 NewEdge->Callee->CallerEdges.push_back(NewEdge);
1519 } else {
1520 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1521 auto NewEdge = std::make_shared<ContextEdge>(
1522 NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds));
1523 NewNode->CallerEdges.push_back(NewEdge);
1524 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
1525 }
1526 // Remove old edge if context ids empty.
1527 if (Edge->getContextIds().empty()) {
1528 removeEdgeFromGraph(Edge.get(), &EI, TowardsCallee);
1529 continue;
1530 }
1531 ++EI;
1532 }
1533}
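// Example (illustrative, not from the original source): when an inlined
// callsite gets its own NewNode, each of OrigNode's callee (or caller) edges
// is split: the ids in RemainingContextIds move onto a new edge attached to
// NewNode, the original edge keeps the rest, and any edge left with no
// context ids is removed from the graph.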
1534
1535template <typename DerivedCCG, typename FuncTy, typename CallTy>
1536static void checkEdge(
1537 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
1538 // Confirm that alloc type is not None and that we have at least one context
1539 // id.
1540 assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
1541 assert(!Edge->ContextIds.empty());
1542}
1543
1544template <typename DerivedCCG, typename FuncTy, typename CallTy>
1545static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
1546 bool CheckEdges = true) {
1547 if (Node->isRemoved())
1548 return;
1549#ifndef NDEBUG
1550 // Compute node's context ids once for use in asserts.
1551 auto NodeContextIds = Node->getContextIds();
1552#endif
1553 // Node's context ids should be the union of both its callee and caller edge
1554 // context ids.
1555 if (Node->CallerEdges.size()) {
1556 DenseSet<uint32_t> CallerEdgeContextIds(
1557 Node->CallerEdges.front()->ContextIds);
1558 for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) {
1559 if (CheckEdges)
1560 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1561 set_union(CallerEdgeContextIds, Edge->ContextIds);
1562 }
1563 // Node can have more context ids than callers if some contexts terminate at
1564 // node and some are longer. If we are allowing recursive callsites and
1565 // contexts this will be violated for incompletely cloned recursive cycles,
1566 // so skip the checking in that case.
1567 assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
1568 NodeContextIds == CallerEdgeContextIds ||
1569 set_is_subset(CallerEdgeContextIds, NodeContextIds));
1570 }
1571 if (Node->CalleeEdges.size()) {
1572 DenseSet<uint32_t> CalleeEdgeContextIds(
1573 Node->CalleeEdges.front()->ContextIds);
1574 for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) {
1575 if (CheckEdges)
1576 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1577 set_union(CalleeEdgeContextIds, Edge->getContextIds());
1578 }
1579 // If we are allowing recursive callsites and contexts this will be violated
1580 // for incompletely cloned recursive cycles, so skip the checking in that
1581 // case.
1582 assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
1583 NodeContextIds == CalleeEdgeContextIds);
1584 }
1585 // FIXME: Since this checking is only invoked under an option, we should
1586 // change the error checking from using assert to something that will trigger
1587 // an error on a release build.
1588#ifndef NDEBUG
1589 // Make sure we don't end up with duplicate edges between the same caller and
1590 // callee.
1591 DenseSet<const ContextNode<DerivedCCG, FuncTy, CallTy> *> NodeSet;
1592 for (const auto &E : Node->CalleeEdges)
1593 NodeSet.insert(E->Callee);
1594 assert(NodeSet.size() == Node->CalleeEdges.size());
1595#endif
1596}
1597
1598template <typename DerivedCCG, typename FuncTy, typename CallTy>
1599void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1600 assignStackNodesPostOrder(
1601 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
1602 DenseMap<uint64_t, std::vector<CallContextInfo>>
1603 &StackIdToMatchingCalls,
1604 DenseMap<CallInfo, CallInfo> &CallToMatchingCall) {
1605 auto Inserted = Visited.insert(Node);
1606 if (!Inserted.second)
1607 return;
1608 // Post order traversal. Iterate over a copy since we may add nodes and
1609 // therefore new callers during the recursive call, invalidating any
1610 // iterator over the original edge vector. We don't need to process these
1611 // new nodes as they were already processed on creation.
1612 auto CallerEdges = Node->CallerEdges;
1613 for (auto &Edge : CallerEdges) {
1614 // Skip any that have been removed during the recursion.
1615 if (Edge->isRemoved()) {
1616 assert(!is_contained(Node->CallerEdges, Edge));
1617 continue;
1618 }
1619 assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls,
1620 CallToMatchingCall);
1621 }
1622
1623 // If this node's stack id is in the map, update the graph to contain new
1624 // nodes representing any inlining at interior callsites. Note we move the
1625 // associated context ids over to the new nodes.
1626
1627 // Ignore this node if it is for an allocation or we didn't record any
1628 // stack id lists ending at it.
1629 if (Node->IsAllocation ||
1630 !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
1631 return;
1632
1633 auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
1634 // Handle the simple case first. A single call with a single stack id.
1635 // In this case there is no need to create any new context nodes, simply
1636 // assign the context node for stack id to this Call.
1637 if (Calls.size() == 1) {
1638 auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
1639 if (Ids.size() == 1) {
1640 assert(SavedContextIds.empty());
1641 // It should be this Node
1642 assert(Node == getNodeForStackId(Ids[0]));
1643 if (Node->Recursive)
1644 return;
1645 Node->setCall(Call);
1646 NonAllocationCallToContextNodeMap[Call] = Node;
1647 NodeToCallingFunc[Node] = Func;
1648 return;
1649 }
1650 }
1651
1652#ifndef NDEBUG
1653 // Find the node for the last stack id, which should be the same
1654 // across all calls recorded for this id, and is this node's id.
1655 uint64_t LastId = Node->OrigStackOrAllocId;
1656 ContextNode *LastNode = getNodeForStackId(LastId);
1657 // We should only have kept stack ids that had nodes.
1658 assert(LastNode);
1659 assert(LastNode == Node);
1660#else
1661 ContextNode *LastNode = Node;
1662#endif
1663
1664 // Compute the last node's context ids once, as it is shared by all calls in
1665 // this entry.
1666 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1667
1668 [[maybe_unused]] bool PrevIterCreatedNode = false;
1669 bool CreatedNode = false;
1670 for (unsigned I = 0; I < Calls.size();
1671 I++, PrevIterCreatedNode = CreatedNode) {
1672 CreatedNode = false;
1673 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1674 // Skip any for which we didn't assign any ids, these don't get a node in
1675 // the graph.
1676 if (SavedContextIds.empty()) {
1677 // If this call has a matching call (located in the same function and
1678 // having the same stack ids), simply add it to the context node created
1679 // for its matching call earlier. These can be treated the same through
1680 // cloning and get updated at the same time.
1681 if (!CallToMatchingCall.contains(Call))
1682 continue;
1683 auto MatchingCall = CallToMatchingCall[Call];
1684 if (!NonAllocationCallToContextNodeMap.contains(MatchingCall)) {
1685 // This should only happen if we had a prior iteration, and it didn't
1686 // create a node because of the below recomputation of context ids
1687 // finding none remaining and continuing early.
1688 assert(I > 0 && !PrevIterCreatedNode);
1689 continue;
1690 }
1691 NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back(
1692 Call);
1693 continue;
1694 }
1695
1696 assert(LastId == Ids.back());
1697
1698 // Recompute the context ids for this stack id sequence (the
1699 // intersection of the context ids of the corresponding nodes).
1700 // Start with the ids we saved in the map for this call, which could be
1701 // duplicated context ids. We have to recompute as we might have overlap
1702 // between the saved context ids for different last nodes, and may have
1703 // already removed some of them during the post order traversal.
1704 set_intersect(SavedContextIds, LastNodeContextIds);
1705 ContextNode *PrevNode = LastNode;
1706 bool Skip = false;
1707 // Iterate backwards through the stack Ids, starting after the last Id
1708 // in the list, which was handled once outside for all Calls.
1709 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1710 auto Id = *IdIter;
1711 ContextNode *CurNode = getNodeForStackId(Id);
1712 // We should only have kept stack ids that had nodes and weren't
1713 // recursive.
1714 assert(CurNode);
1715 assert(!CurNode->Recursive);
1716
1717 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1718 if (!Edge) {
1719 Skip = true;
1720 break;
1721 }
1722 PrevNode = CurNode;
1723
1724 // Update the context ids, which is the intersection of the ids along
1725 // all edges in the sequence.
1726 set_intersect(SavedContextIds, Edge->getContextIds());
1727
1728 // If we now have no context ids for clone, skip this call.
1729 if (SavedContextIds.empty()) {
1730 Skip = true;
1731 break;
1732 }
1733 }
1734 if (Skip)
1735 continue;
1736
1737 // Create new context node.
1738 ContextNode *NewNode = createNewNode(/*IsAllocation=*/false, Func, Call);
1739 NonAllocationCallToContextNodeMap[Call] = NewNode;
1740 CreatedNode = true;
1741 NewNode->AllocTypes = computeAllocType(SavedContextIds);
1742
1743 ContextNode *FirstNode = getNodeForStackId(Ids[0]);
1744 assert(FirstNode);
1745
1746 // Connect to callees of innermost stack frame in inlined call chain.
1747 // This updates context ids for FirstNode's callees to reflect those
1748 // moved to NewNode.
1749 connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true, SavedContextIds);
1750
1751 // Connect to callers of outermost stack frame in inlined call chain.
1752 // This updates context ids for LastNode's callers to reflect those
1753 // moved to NewNode.
1754 connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false, SavedContextIds);
1755
1756 // Now we need to remove context ids from edges/nodes between First and
1757 // Last Node.
1758 PrevNode = nullptr;
1759 for (auto Id : Ids) {
1760 ContextNode *CurNode = getNodeForStackId(Id);
1761 // We should only have kept stack ids that had nodes.
1762 assert(CurNode);
1763
1764 // Remove the context ids moved to NewNode from CurNode, and the
1765 // edge from the prior node.
1766 if (PrevNode) {
1767 auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
1768 // If the sequence contained recursion, we might have already removed
1769 // some edges during the connectNewNode calls above.
1770 if (!PrevEdge) {
1771 PrevNode = CurNode;
1772 continue;
1773 }
1774 set_subtract(PrevEdge->getContextIds(), SavedContextIds);
1775 if (PrevEdge->getContextIds().empty())
1776 removeEdgeFromGraph(PrevEdge);
1777 }
1778 // Since we update the edges from leaf to tail, only look at the callee
1779 // edges. This isn't an alloc node, so if there are no callee edges, the
1780 // alloc type is None.
1781 CurNode->AllocTypes = CurNode->CalleeEdges.empty()
1782 ? (uint8_t)AllocationType::None
1783 : CurNode->computeAllocType();
1784 PrevNode = CurNode;
1785 }
1786 if (VerifyNodes) {
1787 checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true);
1788 for (auto Id : Ids) {
1789 ContextNode *CurNode = getNodeForStackId(Id);
1790 // We should only have kept stack ids that had nodes.
1791 assert(CurNode);
1792 checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true);
1793 }
1794 }
1795 }
1796}
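// Worked example (illustrative, not from the original source): suppose a call
// C was inlined so its callsite metadata lists stack ids {S1, S2}, with S1
// innermost. The code above creates one NewNode for C, gives it copies of
// N(S1)'s callee edges and N(S2)'s caller edges restricted to SavedContextIds,
// and then strips those ids from the original nodes and edges between S1 and
// S2.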
1797
1798template <typename DerivedCCG, typename FuncTy, typename CallTy>
1799void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
1800 // Map of stack id to all calls with that as the last (outermost caller)
1801 // callsite id that has a context node (some might not due to pruning
1802 // performed during matching of the allocation profile contexts).
1803 // The CallContextInfo contains the Call and a list of its stack ids with
1804 // ContextNodes, the function containing Call, and the set of context ids
1805 // the analysis will eventually identify for use in any new node created
1806 // for that callsite.
1807 DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
1808 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
1809 for (auto &Call : CallsWithMetadata) {
1810 // Ignore allocations, already handled.
1811 if (AllocationCallToContextNodeMap.count(Call))
1812 continue;
1813 auto StackIdsWithContextNodes =
1814 getStackIdsWithContextNodesForCall(Call.call());
1815 // If there were no nodes created for MIBs on allocs (maybe this was in
1816 // the unambiguous part of the MIB stack that was pruned), ignore.
1817 if (StackIdsWithContextNodes.empty())
1818 continue;
1819 // Otherwise, record this Call along with the list of ids for the last
1820 // (outermost caller) stack id with a node.
1821 StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
1822 {Call.call(), StackIdsWithContextNodes, Func, {}});
1823 }
1824 }
1825
1826 // First make a pass through all stack ids that correspond to a call,
1827 // as identified in the above loop. Compute the context ids corresponding to
1828 // each of these calls when they correspond to multiple stack ids due to
1829 // inlining. Perform any duplication of context ids required when
1830 // there is more than one call with the same stack ids. Their (possibly newly
1831 // duplicated) context ids are saved in the StackIdToMatchingCalls map.
1832 DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
1833 // Save a map from each call to any that are found to match it. I.e. located
1834 // in the same function and have the same (possibly pruned) stack ids. We use
1835 // this to avoid creating extra graph nodes as they can be treated the same.
1836 DenseMap<CallInfo, CallInfo> CallToMatchingCall;
1837 for (auto &It : StackIdToMatchingCalls) {
1838 auto &Calls = It.getSecond();
1839 // Skip single calls with a single stack id. These don't need a new node.
1840 if (Calls.size() == 1) {
1841 auto &Ids = Calls[0].StackIds;
1842 if (Ids.size() == 1)
1843 continue;
1844 }
1845 // In order to do the best and maximal matching of inlined calls to context
1846 // node sequences we will sort the vectors of stack ids in descending order
1847 // of length, and within each length, lexicographically by stack id. The
1848 // latter is so that we can specially handle calls that have identical stack
1849 // id sequences (either due to cloning or artificially because of the MIB
1850 // context pruning). Those with the same Ids are then sorted by function to
1851 // facilitate efficiently mapping them to the same context node.
1852 // Because the functions are pointers, to ensure a stable sort first assign
1853 // each function pointer to its first index in the Calls array, and then use
1854 // that to sort by.
1855 DenseMap<const FuncTy *, unsigned> FuncToIndex;
1856 for (const auto &[Idx, CallCtxInfo] : enumerate(Calls))
1857 FuncToIndex.insert({CallCtxInfo.Func, Idx});
1858 llvm::stable_sort(
1859 Calls,
1860 [&FuncToIndex](const CallContextInfo &A, const CallContextInfo &B) {
1861 return A.StackIds.size() > B.StackIds.size() ||
1862 (A.StackIds.size() == B.StackIds.size() &&
1863 (A.StackIds < B.StackIds ||
1864 (A.StackIds == B.StackIds &&
1865 FuncToIndex[A.Func] < FuncToIndex[B.Func])));
1866 });
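// Example of the resulting order (illustrative, not from the original
// source): stack id lists {8,3,2} and {8,3} sort with {8,3,2} first (longer
// lists first), and two calls with identical ids {8,3} are ordered by the
// first index at which their containing function appeared in Calls, keeping
// same-function duplicates adjacent for the matching-call handling below.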
1867
1868 // Find the node for the last stack id, which should be the same
1869 // across all calls recorded for this id, and is the id for this
1870 // entry in the StackIdToMatchingCalls map.
1871 uint64_t LastId = It.getFirst();
1872 ContextNode *LastNode = getNodeForStackId(LastId);
1873 // We should only have kept stack ids that had nodes.
1874 assert(LastNode);
1875
1876 if (LastNode->Recursive)
1877 continue;
1878
1879 // Initialize the context ids with the last node's. We will subsequently
1880 // refine the context ids by computing the intersection along all edges.
1881 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1882 assert(!LastNodeContextIds.empty());
1883
1884#ifndef NDEBUG
1885 // Save the set of functions seen for a particular set of the same stack
1886 // ids. This is used to ensure that they have been correctly sorted to be
1887 // adjacent in the Calls list, since we rely on that to efficiently place
1888 // all such matching calls onto the same context node.
1889 DenseSet<const FuncTy *> MatchingIdsFuncSet;
1890#endif
1891
1892 for (unsigned I = 0; I < Calls.size(); I++) {
1893 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1894 assert(SavedContextIds.empty());
1895 assert(LastId == Ids.back());
1896
1897#ifndef NDEBUG
1898 // If this call has a different set of ids than the last one, clear the
1899 // set used to ensure they are sorted properly.
1900 if (I > 0 && Ids != Calls[I - 1].StackIds)
1901 MatchingIdsFuncSet.clear();
1902#endif
1903
1904 // First compute the context ids for this stack id sequence (the
1905 // intersection of the context ids of the corresponding nodes).
1906 // Start with the remaining saved ids for the last node.
1907 assert(!LastNodeContextIds.empty());
1908 DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
1909
1910 ContextNode *PrevNode = LastNode;
1911 ContextNode *CurNode = LastNode;
1912 bool Skip = false;
1913
1914 // Iterate backwards through the stack Ids, starting after the last Id
1915 // in the list, which was handled once outside for all Calls.
1916 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1917 auto Id = *IdIter;
1918 CurNode = getNodeForStackId(Id);
1919 // We should only have kept stack ids that had nodes.
1920 assert(CurNode);
1921
1922 if (CurNode->Recursive) {
1923 Skip = true;
1924 break;
1925 }
1926
1927 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1928 // If there is no edge then the nodes belong to different MIB contexts,
1929 // and we should skip this inlined context sequence. For example, this
1930 // particular inlined context may include stack ids A->B, and we may
1931 // indeed have nodes for both A and B, but it is possible that they were
1932 // never profiled in sequence in a single MIB for any allocation (i.e.
1933 // we might have profiled an allocation that involves the callsite A,
1934 // but through a different one of its callee callsites, and we might
1935 // have profiled an allocation that involves callsite B, but reached
1936 // from a different caller callsite).
1937 if (!Edge) {
1938 Skip = true;
1939 break;
1940 }
1941 PrevNode = CurNode;
1942
1943 // Update the context ids, which is the intersection of the ids along
1944 // all edges in the sequence.
1945 set_intersect(StackSequenceContextIds, Edge->getContextIds());
1946
1947 // If we now have no context ids for clone, skip this call.
1948 if (StackSequenceContextIds.empty()) {
1949 Skip = true;
1950 break;
1951 }
1952 }
1953 if (Skip)
1954 continue;
1955
1956 // If some of this call's stack ids did not have corresponding nodes (due
1957 // to pruning), don't include any context ids for contexts that extend
1958 // beyond these nodes. Otherwise we would be matching part of unrelated /
1959 // not fully matching stack contexts. To do this, subtract any context ids
1960 // found in caller nodes of the last node found above.
1961 if (Ids.back() != getLastStackId(Call)) {
1962 for (const auto &PE : LastNode->CallerEdges) {
1963 set_subtract(StackSequenceContextIds, PE->getContextIds());
1964 if (StackSequenceContextIds.empty())
1965 break;
1966 }
1967 // If we now have no context ids for clone, skip this call.
1968 if (StackSequenceContextIds.empty())
1969 continue;
1970 }
1971
1972#ifndef NDEBUG
1973 // If the prior call had the same stack ids this set would not be empty.
1974 // Check if we already have a call that "matches" because it is located
1975 // in the same function. If the Calls list was sorted properly we should
1976 // not encounter this situation as all such entries should be adjacent
1977 // and processed in bulk further below.
1978 assert(!MatchingIdsFuncSet.contains(Func));
1979
1980 MatchingIdsFuncSet.insert(Func);
1981#endif
1982
1983 // Check if the next set of stack ids is the same (since the Calls vector
1984 // of tuples is sorted by the stack ids we can just look at the next one).
1985 // If so, save them in the CallToMatchingCall map so that they get
1986 // assigned to the same context node, and skip them.
1987 bool DuplicateContextIds = false;
1988 for (unsigned J = I + 1; J < Calls.size(); J++) {
1989 auto &CallCtxInfo = Calls[J];
1990 auto &NextIds = CallCtxInfo.StackIds;
1991 if (NextIds != Ids)
1992 break;
1993 auto *NextFunc = CallCtxInfo.Func;
1994 if (NextFunc != Func) {
1995 // We have another Call with the same ids but that cannot share this
1996 // node, must duplicate ids for it.
1997 DuplicateContextIds = true;
1998 break;
1999 }
2000 auto &NextCall = CallCtxInfo.Call;
2001 CallToMatchingCall[NextCall] = Call;
2002 // Update I so that it gets incremented correctly to skip this call.
2003 I = J;
2004 }
2005
2006 // If we don't have duplicate context ids, then we can assign all the
2007 // context ids computed for the original node sequence to this call.
2008 // If there are duplicate calls with the same stack ids then we synthesize
2009 // new context ids that are duplicates of the originals. These are
2010 // assigned to SavedContextIds, which is a reference into the map entry
2011 // for this call, allowing us to access these ids later on.
2012 OldToNewContextIds.reserve(OldToNewContextIds.size() +
2013 StackSequenceContextIds.size());
2014 SavedContextIds =
2015 DuplicateContextIds
2016 ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
2017 : StackSequenceContextIds;
2018 assert(!SavedContextIds.empty());
2019
2020 if (!DuplicateContextIds) {
2021 // Update saved last node's context ids to remove those that are
2022 // assigned to other calls, so that it is ready for the next call at
2023 // this stack id.
2024 set_subtract(LastNodeContextIds, StackSequenceContextIds);
2025 if (LastNodeContextIds.empty())
2026 break;
2027 }
2028 }
2029 }
2030
2031 // Propagate the duplicate context ids over the graph.
2032 propagateDuplicateContextIds(OldToNewContextIds);
2033
2034 if (VerifyCCG)
2035 check();
2036
2037 // Now perform a post-order traversal over the graph, starting with the
2038 // allocation nodes, essentially processing nodes from callers to callees.
2039 // For any that contains an id in the map, update the graph to contain new
2040 // nodes representing any inlining at interior callsites. Note we move the
2041 // associated context ids over to the new nodes.
2042 DenseSet<const ContextNode *> Visited;
2043 for (auto &Entry : AllocationCallToContextNodeMap)
2044 assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls,
2045 CallToMatchingCall);
2046 if (VerifyCCG)
2047 check();
2048}
2049
2050uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
2051 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
2052 Call->getMetadata(LLVMContext::MD_callsite));
2053 return CallsiteContext.back();
2054}
2055
2056uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
2058 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
2059 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
2060 // Need to convert index into stack id.
2061 return Index.getStackIdAtIndex(CallsiteContext.back());
2062}
2063
2064static const std::string MemProfCloneSuffix = ".memprof.";
2065
2066static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
2067 // We use CloneNo == 0 to refer to the original version, which doesn't get
2068 // renamed with a suffix.
2069 if (!CloneNo)
2070 return Base.str();
2071 return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
2072}
2073
2074static bool isMemProfClone(const Function &F) {
2075 return F.getName().contains(MemProfCloneSuffix);
2076}
2077
2078// Return the clone number of the given function by extracting it from the
2079// memprof suffix. Assumes the caller has already confirmed it is a memprof
2080// clone.
2081static unsigned getMemProfCloneNum(const Function &F) {
2082 assert(isMemProfClone(F));
2083 auto Pos = F.getName().find_last_of('.');
2084 assert(Pos > 0);
2085 unsigned CloneNo;
2086 bool Err = F.getName().drop_front(Pos + 1).getAsInteger(10, CloneNo);
2087 assert(!Err);
2088 (void)Err;
2089 return CloneNo;
2090}
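// Illustrative usage (sketch, not from the original source):
//   getMemProfFuncName("foo", 0) == "foo"
//   getMemProfFuncName("foo", 2) == "foo.memprof.2"
// and for a Function F named "foo.memprof.2", isMemProfClone(F) is true and
// getMemProfCloneNum(F) returns 2.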
2091
2092std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
2093 const Instruction *Call,
2094 unsigned CloneNo) const {
2095 return (Twine(Call->getFunction()->getName()) + " -> " +
2096 cast<CallBase>(Call)->getCalledFunction()->getName())
2097 .str();
2098}
2099
2100std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
2101 const IndexCall &Call,
2102 unsigned CloneNo) const {
2103 auto VI = FSToVIMap.find(Func);
2104 assert(VI != FSToVIMap.end());
2105 std::string CallerName = getMemProfFuncName(VI->second.name(), CloneNo);
2106 if (isa<AllocInfo *>(Call))
2107 return CallerName + " -> alloc";
2108 else {
2109 auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call);
2110 return CallerName + " -> " +
2111 getMemProfFuncName(Callsite->Callee.name(),
2112 Callsite->Clones[CloneNo]);
2113 }
2114}
2115
2116std::vector<uint64_t>
2117ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
2118 Instruction *Call) {
2119 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
2120 Call->getMetadata(LLVMContext::MD_callsite));
2121 return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
2122 CallsiteContext);
2123}
2124
2125std::vector<uint64_t>
2126IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
2128 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
2129 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
2130 return getStackIdsWithContextNodes<CallsiteInfo,
2131 SmallVector<unsigned>::const_iterator>(
2132 CallsiteContext);
2133}
2134
2135template <typename DerivedCCG, typename FuncTy, typename CallTy>
2136template <class NodeT, class IteratorT>
2137std::vector<uint64_t>
2138CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
2139 CallStack<NodeT, IteratorT> &CallsiteContext) {
2140 std::vector<uint64_t> StackIds;
2141 for (auto IdOrIndex : CallsiteContext) {
2142 auto StackId = getStackId(IdOrIndex);
2143 ContextNode *Node = getNodeForStackId(StackId);
2144 if (!Node)
2145 break;
2146 StackIds.push_back(StackId);
2147 }
2148 return StackIds;
2149}
2150
2151ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
2152 Module &M,
2153 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
2154 : Mod(M), OREGetter(OREGetter) {
2155 for (auto &F : M) {
2156 std::vector<CallInfo> CallsWithMetadata;
2157 for (auto &BB : F) {
2158 for (auto &I : BB) {
2159 if (!isa<CallBase>(I))
2160 continue;
2161 if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
2162 CallsWithMetadata.push_back(&I);
2163 auto *AllocNode = addAllocNode(&I, &F);
2164 auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
2165 assert(CallsiteMD);
2166 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
2167 // Add all of the MIBs and their stack nodes.
2168 for (auto &MDOp : MemProfMD->operands()) {
2169 auto *MIBMD = cast<const MDNode>(MDOp);
2170 std::vector<ContextTotalSize> ContextSizeInfo;
2171 // Collect the context size information if it exists.
2172 if (MIBMD->getNumOperands() > 2) {
2173 for (unsigned I = 2; I < MIBMD->getNumOperands(); I++) {
2174 MDNode *ContextSizePair =
2175 dyn_cast<MDNode>(MIBMD->getOperand(I));
2176 assert(ContextSizePair->getNumOperands() == 2);
2177 uint64_t FullStackId = mdconst::dyn_extract<ConstantInt>(
2178 ContextSizePair->getOperand(0))
2179 ->getZExtValue();
2180 uint64_t TotalSize = mdconst::dyn_extract<ConstantInt>(
2181 ContextSizePair->getOperand(1))
2182 ->getZExtValue();
2183 ContextSizeInfo.push_back({FullStackId, TotalSize});
2184 }
2185 }
2186 MDNode *StackNode = getMIBStackNode(MIBMD);
2187 assert(StackNode);
2188 CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
2189 addStackNodesForMIB<MDNode, MDNode::op_iterator>(
2190 AllocNode, StackContext, CallsiteContext,
2191 getMIBAllocType(MIBMD), ContextSizeInfo);
2192 }
2193 // If exporting the graph to dot and an allocation id of interest was
2194 // specified, record all the context ids for this allocation node.
2195 if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
2196 DotAllocContextIds = AllocNode->getContextIds();
2197 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2198 // Memprof and callsite metadata on memory allocations no longer
2199 // needed.
2200 I.setMetadata(LLVMContext::MD_memprof, nullptr);
2201 I.setMetadata(LLVMContext::MD_callsite, nullptr);
2202 }
2203 // For callsite metadata, add to list for this function for later use.
2204 else if (I.getMetadata(LLVMContext::MD_callsite)) {
2205 CallsWithMetadata.push_back(&I);
2206 }
2207 }
2208 }
2209 if (!CallsWithMetadata.empty())
2210 FuncToCallsWithMetadata[&F] = CallsWithMetadata;
2211 }
2212
2213 if (DumpCCG) {
2214 dbgs() << "CCG before updating call stack chains:\n";
2215 dbgs() << *this;
2216 }
2217
2218 if (ExportToDot)
2219 exportToDot("prestackupdate");
2220
2221 updateStackNodes();
2222
2223 if (ExportToDot)
2224 exportToDot("poststackupdate");
2225
2226 handleCallsitesWithMultipleTargets();
2227
2228 markBackedges();
2229
2230 // Strip off remaining callsite metadata, no longer needed.
2231 for (auto &FuncEntry : FuncToCallsWithMetadata)
2232 for (auto &Call : FuncEntry.second)
2233 Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
2234}
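// Illustrative IR shape consumed by this constructor (a rough sketch of the
// metadata format, not copied from this file):
//   %call = call ptr @malloc(i64 8), !memprof !1, !callsite !5
//   !1 = !{!2}                ; list of MIB nodes
//   !2 = !{!3, !"cold"}       ; MIB: stack id list + allocation type string
//   !3 = !{i64 123, i64 456}  ; stack ids from the alloc call outward
//   !5 = !{i64 123}           ; stack id(s) of the (possibly inlined) callsite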
2235
2236IndexCallsiteContextGraph::IndexCallsiteContextGraph(
2237 ModuleSummaryIndex &Index,
2238 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
2239 isPrevailing)
2240 : Index(Index), isPrevailing(isPrevailing) {
2241 for (auto &I : Index) {
2242 auto VI = Index.getValueInfo(I);
2243 for (auto &S : VI.getSummaryList()) {
2244 // We should only add the prevailing nodes. Otherwise we may try to clone
2245 // in a weak copy that won't be linked (and may be different than the
2246 // prevailing version).
2247 // We only keep the memprof summary on the prevailing copy now when
2248 // building the combined index, as a space optimization, however don't
2249 // rely on this optimization. The linker doesn't resolve local linkage
2250 // values so don't check whether those are prevailing.
2251 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2252 !isPrevailing(VI.getGUID(), S.get()))
2253 continue;
2254 auto *FS = dyn_cast<FunctionSummary>(S.get());
2255 if (!FS)
2256 continue;
2257 std::vector<CallInfo> CallsWithMetadata;
2258 if (!FS->allocs().empty()) {
2259 for (auto &AN : FS->mutableAllocs()) {
2260 // This can happen because of recursion elimination handling that
2261 // currently exists in ModuleSummaryAnalysis. Skip these for now.
2262 // We still added them to the summary because we need to be able to
2263 // correlate properly in applyImport in the backends.
2264 if (AN.MIBs.empty())
2265 continue;
2266 IndexCall AllocCall(&AN);
2267 CallsWithMetadata.push_back(AllocCall);
2268 auto *AllocNode = addAllocNode(AllocCall, FS);
2269 // Pass an empty CallStack to the CallsiteContext (second)
2270 // parameter, since for ThinLTO we already collapsed out the inlined
2271 // stack ids on the allocation call during ModuleSummaryAnalysis.
2272 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
2273 EmptyContext;
2274 unsigned I = 0;
2275 assert(AN.ContextSizeInfos.empty() ||
2276 AN.ContextSizeInfos.size() == AN.MIBs.size());
2277 // Now add all of the MIBs and their stack nodes.
2278 for (auto &MIB : AN.MIBs) {
2279 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
2280 StackContext(&MIB);
2281 std::vector<ContextTotalSize> ContextSizeInfo;
2282 if (!AN.ContextSizeInfos.empty()) {
2283 for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I])
2284 ContextSizeInfo.push_back({FullStackId, TotalSize});
2285 }
2286 addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
2287 AllocNode, StackContext, EmptyContext, MIB.AllocType,
2288 ContextSizeInfo);
2289 I++;
2290 }
2291 // If exporting the graph to dot and an allocation id of interest was
2292 // specified, record all the context ids for this allocation node.
2293 if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
2294 DotAllocContextIds = AllocNode->getContextIds();
2295 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2296 // Initialize version 0 on the summary alloc node to the current alloc
2297 // type, unless it has both types, in which case make it default. That
2298 // way, if we aren't able to clone, the original version always ends up
2299 // with the default allocation behavior.
2300 AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
2301 }
2302 }
2303 // For callsite metadata, add to list for this function for later use.
2304 if (!FS->callsites().empty())
2305 for (auto &SN : FS->mutableCallsites()) {
2306 IndexCall StackNodeCall(&SN);
2307 CallsWithMetadata.push_back(StackNodeCall);
2308 }
2309
2310 if (!CallsWithMetadata.empty())
2311 FuncToCallsWithMetadata[FS] = CallsWithMetadata;
2312
2313 if (!FS->allocs().empty() || !FS->callsites().empty())
2314 FSToVIMap[FS] = VI;
2315 }
2316 }
2317
2318 if (DumpCCG) {
2319 dbgs() << "CCG before updating call stack chains:\n";
2320 dbgs() << *this;
2321 }
2322
2323 if (ExportToDot)
2324 exportToDot("prestackupdate");
2325
2326 updateStackNodes();
2327
2328 if (ExportToDot)
2329 exportToDot("poststackupdate");
2330
2331 handleCallsitesWithMultipleTargets();
2332
2333 markBackedges();
2334}
2335
2336template <typename DerivedCCG, typename FuncTy, typename CallTy>
2337void CallsiteContextGraph<DerivedCCG, FuncTy,
2338 CallTy>::handleCallsitesWithMultipleTargets() {
2339 // Look for and workaround callsites that call multiple functions.
2340 // This can happen for indirect calls, which need better handling, and in
2341 // more rare cases (e.g. macro expansion).
2342 // TODO: To fix this for indirect calls we will want to perform speculative
2343 // devirtualization using either the normal PGO info with ICP, or using the
2344 // information in the profiled MemProf contexts. We can do this prior to
2345 // this transformation for regular LTO, and for ThinLTO we can simulate that
2346 // effect in the summary and perform the actual speculative devirtualization
2347 // while cloning in the ThinLTO backend.
2348
2349 // Keep track of the new nodes synthesized for discovered tail calls missing
2350 // from the profiled contexts.
2351 MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;
2352
2353 std::vector<std::pair<CallInfo, ContextNode *>> NewCallToNode;
2354 for (auto &Entry : NonAllocationCallToContextNodeMap) {
2355 auto *Node = Entry.second;
2356 assert(Node->Clones.empty());
2357 // Check all node callees and see if in the same function.
2358 // We need to check all of the calls recorded in this Node, because in some
2359 // cases we may have had multiple calls with the same debug info calling
2360 // different callees. This can happen, for example, when an object is
2361 // constructed in the parameter list - the destructor call of the object has
2362 // the same debug info (line/col) as the call the object was passed to.
2363 // Here we will prune any that don't match all callee nodes.
2364 std::vector<CallInfo> AllCalls;
2365 AllCalls.reserve(Node->MatchingCalls.size() + 1);
2366 AllCalls.push_back(Node->Call);
2367 llvm::append_range(AllCalls, Node->MatchingCalls);
2368
2369 // First see if we can partition the calls by callee function, creating new
2370 // nodes to host each set of calls calling the same callees. This is
2371 // necessary for supporting indirect calls with ThinLTO, for which we
2372 // synthesized CallsiteInfo records for each target. They will all have the
2373 // same callsite stack ids and would be sharing a context node at this
2374 // point. We need to perform separate cloning for each, which will be
2375 // applied along with speculative devirtualization in the ThinLTO backends
2376 // as needed. Note this does not currently support looking through tail
2377 // calls; it is unclear if we need that for indirect call targets.
2378 // First partition calls by callee func. Map indexed by func, value is
2379 // struct with list of matching calls, assigned node.
2380 if (partitionCallsByCallee(Node, AllCalls, NewCallToNode))
2381 continue;
2382
2383 auto It = AllCalls.begin();
2384 // Iterate through the calls until we find the first that matches.
2385 for (; It != AllCalls.end(); ++It) {
2386 auto ThisCall = *It;
2387 bool Match = true;
2388 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();
2389 ++EI) {
2390 auto Edge = *EI;
2391 if (!Edge->Callee->hasCall())
2392 continue;
2393 assert(NodeToCallingFunc.count(Edge->Callee));
2394 // Check if the called function matches that of the callee node.
2395 if (!calleesMatch(ThisCall.call(), EI, TailCallToContextNodeMap)) {
2396 Match = false;
2397 break;
2398 }
2399 }
2400 // Found a call that matches the callee nodes, we can quit now.
2401 if (Match) {
2402 // If the first match is not the primary call on the Node, update it
2403 // now. We will update the list of matching calls further below.
2404 if (Node->Call != ThisCall) {
2405 Node->setCall(ThisCall);
2406 // We need to update the NonAllocationCallToContextNodeMap, but don't
2407 // want to do this during iteration over that map, so save the calls
2408 // that need updated entries.
2409 NewCallToNode.push_back({ThisCall, Node});
2410 }
2411 break;
2412 }
2413 }
2414 // We will update this list below (or leave it cleared if there was no
2415 // match found above).
2416 Node->MatchingCalls.clear();
2417 // If we hit the end of the AllCalls vector, no call matching the callee
2418 // nodes was found, clear the call information in the node.
2419 if (It == AllCalls.end()) {
2420 RemovedEdgesWithMismatchedCallees++;
2421 // Work around by setting Node to have a null call, so it gets
2422 // skipped during cloning. Otherwise assignFunctions will assert
2423 // because its data structures are not designed to handle this case.
2424 Node->setCall(CallInfo());
2425 continue;
2426 }
2427 // Now add back any matching calls that call the same function as the
2428 // matching primary call on Node.
2429 for (++It; It != AllCalls.end(); ++It) {
2430 auto ThisCall = *It;
2431 if (!sameCallee(Node->Call.call(), ThisCall.call()))
2432 continue;
2433 Node->MatchingCalls.push_back(ThisCall);
2434 }
2435 }
2436
2437 // Remove all mismatched nodes identified in the above loop from the node map
2438 // (checking whether they have a null call which is set above). For a
2439 // MapVector like NonAllocationCallToContextNodeMap it is much more efficient
2440 // to do the removal via remove_if than by individually erasing entries above.
2441 // Also remove any entries if we updated the node's primary call above.
2442 NonAllocationCallToContextNodeMap.remove_if([](const auto &it) {
2443 return !it.second->hasCall() || it.second->Call != it.first;
2444 });
2445
2446 // Add entries for any new primary calls recorded above.
2447 for (auto &[Call, Node] : NewCallToNode)
2448 NonAllocationCallToContextNodeMap[Call] = Node;
2449
2450 // Add the new nodes after the above loop so that the iteration is not
2451 // invalidated.
2452 for (auto &[Call, Node] : TailCallToContextNodeMap)
2453 NonAllocationCallToContextNodeMap[Call] = Node;
2454}
2455
2456template <typename DerivedCCG, typename FuncTy, typename CallTy>
2457bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::partitionCallsByCallee(
2458 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
2459 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode) {
2460 // Struct to keep track of all the calls having the same callee function,
2461 // and the node we eventually assign to them. Eventually we will record the
2462 // context node assigned to this group of calls.
2463 struct CallsWithSameCallee {
2464 std::vector<CallInfo> Calls;
2465 ContextNode *Node = nullptr;
2466 };
2467
2468 // First partition calls by callee function. Build map from each function
2469 // to the list of matching calls.
2470 DenseMap<const FuncTy *, CallsWithSameCallee> CalleeFuncToCallInfo;
2471 for (auto ThisCall : AllCalls) {
2472 auto *F = getCalleeFunc(ThisCall.call());
2473 if (F)
2474 CalleeFuncToCallInfo[F].Calls.push_back(ThisCall);
2475 }
2476
2477 // Next, walk through all callee edges. For each callee node, get its
2478 // containing function and see if it was recorded in the above map (meaning we
2479 // have at least one matching call). Build another map from each callee node
2480 // with a matching call to the structure instance created above containing all
2481 // the calls.
2482 DenseMap<const ContextNode *, CallsWithSameCallee *> CalleeNodeToCallInfo;
2483 for (const auto &Edge : Node->CalleeEdges) {
2484 if (!Edge->Callee->hasCall())
2485 continue;
2486 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2487 if (CalleeFuncToCallInfo.contains(ProfiledCalleeFunc))
2488 CalleeNodeToCallInfo[Edge->Callee] =
2489 &CalleeFuncToCallInfo[ProfiledCalleeFunc];
2490 }
2491
2492 // If there are no entries in the second map, then there were no matching
2493 // calls/callees; nothing to do here. Return so we can go to the handling that
2494 // looks through tail calls.
2495 if (CalleeNodeToCallInfo.empty())
2496 return false;
2497
2498 // Walk through all callee edges again. Any and all callee edges that didn't
2499 // match any calls (callee not in the CalleeNodeToCallInfo map) are moved to a
2500 // new caller node (UnmatchedCalleesNode) which gets a null call so that it is
2501 // ignored during cloning. If it is in the map, then we use the node recorded
2502 // in that entry (creating it if needed), and move the callee edge to it.
2503 // The first callee will use the original node instead of creating a new one.
2504 // Note that any of the original calls on this node (in AllCalls) that didn't
2505 // have a callee function automatically get dropped from the node as part of
2506 // this process.
2507 ContextNode *UnmatchedCalleesNode = nullptr;
2508 // Track whether we already assigned original node to a callee.
2509 bool UsedOrigNode = false;
2510 assert(NodeToCallingFunc[Node]);
2511 // Iterate over a copy of Node's callee edges, since we may need to remove
2512 // edges in moveCalleeEdgeToNewCaller, and this simplifies the handling and
2513 // makes it less error-prone.
2514 auto CalleeEdges = Node->CalleeEdges;
2515 for (auto &Edge : CalleeEdges) {
2516 if (!Edge->Callee->hasCall())
2517 continue;
2518
2519 // Will be updated below to point to whatever (caller) node this callee edge
2520 // should be moved to.
2521 ContextNode *CallerNodeToUse = nullptr;
2522
2523 // Handle the case where there were no matching calls first. Move this
2524 // callee edge to the UnmatchedCalleesNode, creating it if needed.
2525 if (!CalleeNodeToCallInfo.contains(Edge->Callee)) {
2526 if (!UnmatchedCalleesNode)
2527 UnmatchedCalleesNode =
2528 createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2529 CallerNodeToUse = UnmatchedCalleesNode;
2530 } else {
2531 // Look up the information recorded for this callee node, and use the
2532 // recorded caller node (creating it if needed).
2533 auto *Info = CalleeNodeToCallInfo[Edge->Callee];
2534 if (!Info->Node) {
2535 // If we haven't assigned any callees to the original node use it.
2536 if (!UsedOrigNode) {
2537 Info->Node = Node;
2538 // Clear the set of matching calls which will be updated below.
2539 Node->MatchingCalls.clear();
2540 UsedOrigNode = true;
2541 } else
2542 Info->Node =
2543 createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2544 assert(!Info->Calls.empty());
2545 // The first call becomes the primary call for this caller node, and the
2546 // rest go in the matching calls list.
2547 Info->Node->setCall(Info->Calls.front());
2548 llvm::append_range(Info->Node->MatchingCalls,
2549 llvm::drop_begin(Info->Calls));
2550 // Save the primary call to node correspondence so that we can update
2551 // the NonAllocationCallToContextNodeMap, which is being iterated in the
2552 // caller of this function.
2553 NewCallToNode.push_back({Info->Node->Call, Info->Node});
2554 }
2555 CallerNodeToUse = Info->Node;
2556 }
2557
2558 // Don't need to move the edge if we are using the original node.
2559 if (CallerNodeToUse == Node)
2560 continue;
2561
2562 moveCalleeEdgeToNewCaller(Edge, CallerNodeToUse);
2563 }
2564 // Now that we are done moving edges, clean up any caller edges that ended
2565 // up with no type or context ids. During moveCalleeEdgeToNewCaller all
2566 // caller edges from Node are replicated onto the new callers, and it
2567 // simplifies the handling to leave them until we have moved all
2568 // edges/context ids.
2569 for (auto &I : CalleeNodeToCallInfo)
2570 removeNoneTypeCallerEdges(I.second->Node);
2571 if (UnmatchedCalleesNode)
2572 removeNoneTypeCallerEdges(UnmatchedCalleesNode);
2573 removeNoneTypeCallerEdges(Node);
2574
2575 return true;
2576}
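// Example (illustrative, not from the original source): if a node's calls
// {C1, C2, C3} resolve so that C1 and C2 call F1 while C3 calls F2, one group
// reuses the original node (its first call becoming the primary call, the
// rest going in MatchingCalls) and the other group gets a newly created node;
// callee edges whose callee matches neither F1 nor F2 are moved to
// UnmatchedCalleesNode, which keeps a null call so cloning ignores it.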
2577
2578uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2579 // In the Module (IR) case this is already the Id.
2580 return IdOrIndex;
2581}
2582
2583uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2584 // In the Index case this is an index into the stack id list in the summary
2585 // index, convert it to an Id.
2586 return Index.getStackIdAtIndex(IdOrIndex);
2587}
2588
2589template <typename DerivedCCG, typename FuncTy, typename CallTy>
2590bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
2591 CallTy Call, EdgeIter &EI,
2592 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {
2593 auto Edge = *EI;
2594 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2595 const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];
2596 // Will be populated in order of callee to caller if we find a chain of tail
2597 // calls between the profiled caller and callee.
2598 std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;
2599 if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc,
2600 FoundCalleeChain))
2601 return false;
2602
2603 // The usual case where the profiled callee matches that of the IR/summary.
2604 if (FoundCalleeChain.empty())
2605 return true;
2606
2607 auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {
2608 auto *CurEdge = Callee->findEdgeFromCaller(Caller);
2609 // If there is already an edge between these nodes, simply update it and
2610 // return.
2611 if (CurEdge) {
2612 CurEdge->ContextIds.insert_range(Edge->ContextIds);
2613 CurEdge->AllocTypes |= Edge->AllocTypes;
2614 return;
2615 }
2616 // Otherwise, create a new edge and insert it into the caller and callee
2617 // lists.
2618 auto NewEdge = std::make_shared<ContextEdge>(
2619 Callee, Caller, Edge->AllocTypes, Edge->ContextIds);
2620 Callee->CallerEdges.push_back(NewEdge);
2621 if (Caller == Edge->Caller) {
2622 // If we are inserting the new edge into the current edge's caller, insert
2623 // the new edge before the current iterator position, and then increment
2624 // back to the current edge.
2625 EI = Caller->CalleeEdges.insert(EI, NewEdge);
2626 ++EI;
2627 assert(*EI == Edge &&
2628 "Iterator position not restored after insert and increment");
2629 } else
2630 Caller->CalleeEdges.push_back(NewEdge);
2631 };
2632
2633 // Create new nodes for each found callee and connect in between the profiled
2634 // caller and callee.
2635 auto *CurCalleeNode = Edge->Callee;
2636 for (auto &[NewCall, Func] : FoundCalleeChain) {
2637 ContextNode *NewNode = nullptr;
2638 // First check if we have already synthesized a node for this tail call.
2639 if (TailCallToContextNodeMap.count(NewCall)) {
2640 NewNode = TailCallToContextNodeMap[NewCall];
2641 NewNode->AllocTypes |= Edge->AllocTypes;
2642 } else {
2643 FuncToCallsWithMetadata[Func].push_back({NewCall});
2644 // Create Node and record node info.
2645 NewNode = createNewNode(/*IsAllocation=*/false, Func, NewCall);
2646 TailCallToContextNodeMap[NewCall] = NewNode;
2647 NewNode->AllocTypes = Edge->AllocTypes;
2648 }
2649
2650 // Hook up node to its callee node
2651 AddEdge(NewNode, CurCalleeNode);
2652
2653 CurCalleeNode = NewNode;
2654 }
2655
2656 // Hook up edge's original caller to new callee node.
2657 AddEdge(Edge->Caller, CurCalleeNode);
2658
2659#ifndef NDEBUG
2660 // Save this because Edge's fields get cleared below when removed.
2661 auto *Caller = Edge->Caller;
2662#endif
2663
2664 // Remove old edge
2665 removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
2666
2667 // To simplify the increment of EI in the caller, subtract one from EI.
2668 // In the final AddEdge call we would have either added a new callee edge,
2669 // to Edge->Caller, or found an existing one. Either way we are guaranteed
2670 // that there is at least one callee edge.
2671 assert(!Caller->CalleeEdges.empty());
2672 --EI;
2673
2674 return true;
2675}
2676
2677bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2678 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
2679 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
2680 bool &FoundMultipleCalleeChains) {
2681 // Stop recursive search if we have already explored the maximum specified
2682 // depth.
2683 if (Depth > TailCallSearchDepth)
2684 return false;
2685
2686 auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {
2687 FoundCalleeChain.push_back({Callsite, F});
2688 };
2689
2690 auto *CalleeFunc = dyn_cast<Function>(CurCallee);
2691 if (!CalleeFunc) {
2692 auto *Alias = dyn_cast<GlobalAlias>(CurCallee);
2693 assert(Alias);
2694 CalleeFunc = dyn_cast<Function>(Alias->getAliasee());
2695 assert(CalleeFunc);
2696 }
2697
2698 // Look for tail calls in this function, and check if they either call the
2699 // profiled callee directly, or indirectly (via a recursive search).
2700 // Only succeed if there is a single unique tail call chain found between the
2701 // profiled caller and callee, otherwise we could perform incorrect cloning.
2702 bool FoundSingleCalleeChain = false;
2703 for (auto &BB : *CalleeFunc) {
2704 for (auto &I : BB) {
2705 auto *CB = dyn_cast<CallBase>(&I);
2706 if (!CB || !CB->isTailCall())
2707 continue;
2708 auto *CalledValue = CB->getCalledOperand();
2709 auto *CalledFunction = CB->getCalledFunction();
2710 if (CalledValue && !CalledFunction) {
2711 CalledValue = CalledValue->stripPointerCasts();
2712 // Stripping pointer casts can reveal a called function.
2713 CalledFunction = dyn_cast<Function>(CalledValue);
2714 }
2715 // Check if this is an alias to a function. If so, get the
2716 // called aliasee for the checks below.
2717 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
2718 assert(!CalledFunction &&
2719 "Expected null called function in callsite for alias");
2720 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
2721 }
2722 if (!CalledFunction)
2723 continue;
2724 if (CalledFunction == ProfiledCallee) {
2725 if (FoundSingleCalleeChain) {
2726 FoundMultipleCalleeChains = true;
2727 return false;
2728 }
2729 FoundSingleCalleeChain = true;
2730 FoundProfiledCalleeCount++;
2731 FoundProfiledCalleeDepth += Depth;
2732 if (Depth > FoundProfiledCalleeMaxDepth)
2733 FoundProfiledCalleeMaxDepth = Depth;
2734 SaveCallsiteInfo(&I, CalleeFunc);
2735 } else if (findProfiledCalleeThroughTailCalls(
2736 ProfiledCallee, CalledFunction, Depth + 1,
2737 FoundCalleeChain, FoundMultipleCalleeChains)) {
2738 // findProfiledCalleeThroughTailCalls should not have returned
2739 // true if FoundMultipleCalleeChains.
2740 assert(!FoundMultipleCalleeChains);
2741 if (FoundSingleCalleeChain) {
2742 FoundMultipleCalleeChains = true;
2743 return false;
2744 }
2745 FoundSingleCalleeChain = true;
2746 SaveCallsiteInfo(&I, CalleeFunc);
2747 } else if (FoundMultipleCalleeChains)
2748 return false;
2749 }
2750 }
2751
2752 return FoundSingleCalleeChain;
2753}
2754
2755const Function *ModuleCallsiteContextGraph::getCalleeFunc(Instruction *Call) {
2756 auto *CB = dyn_cast<CallBase>(Call);
2757 if (!CB->getCalledOperand() || CB->isIndirectCall())
2758 return nullptr;
2759 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2760 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2761 if (Alias)
2762 return dyn_cast<Function>(Alias->getAliasee());
2763 return dyn_cast<Function>(CalleeVal);
2764}
2765
2766bool ModuleCallsiteContextGraph::calleeMatchesFunc(
2767 Instruction *Call, const Function *Func, const Function *CallerFunc,
2768 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
2769 auto *CB = dyn_cast<CallBase>(Call);
2770 if (!CB->getCalledOperand() || CB->isIndirectCall())
2771 return false;
2772 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2773 auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
2774 if (CalleeFunc == Func)
2775 return true;
2776 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2777 if (Alias && Alias->getAliasee() == Func)
2778 return true;
2779
2780 // Recursively search for the profiled callee through tail calls starting with
2781 // the actual Callee. The discovered tail call chain is saved in
2782 // FoundCalleeChain, and we will fixup the graph to include these callsites
2783 // after returning.
2784 // FIXME: We will currently redo the same recursive walk if we find the same
2785 // mismatched callee from another callsite. We can improve this with more
2786 // bookkeeping of the created chain of new nodes for each mismatch.
2787 unsigned Depth = 1;
2788 bool FoundMultipleCalleeChains = false;
2789 if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,
2790 FoundCalleeChain,
2791 FoundMultipleCalleeChains)) {
2792 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "
2793 << Func->getName() << " from " << CallerFunc->getName()
2794 << " that actually called " << CalleeVal->getName()
2795 << (FoundMultipleCalleeChains
2796 ? " (found multiple possible chains)"
2797 : "")
2798 << "\n");
2799 if (FoundMultipleCalleeChains)
2800 FoundProfiledCalleeNonUniquelyCount++;
2801 return false;
2802 }
2803
2804 return true;
2805}
2806
2807bool ModuleCallsiteContextGraph::sameCallee(Instruction *Call1,
2808 Instruction *Call2) {
2809 auto *CB1 = cast<CallBase>(Call1);
2810 if (!CB1->getCalledOperand() || CB1->isIndirectCall())
2811 return false;
2812 auto *CalleeVal1 = CB1->getCalledOperand()->stripPointerCasts();
2813 auto *CalleeFunc1 = dyn_cast<Function>(CalleeVal1);
2814 auto *CB2 = cast<CallBase>(Call2);
2815 if (!CB2->getCalledOperand() || CB2->isIndirectCall())
2816 return false;
2817 auto *CalleeVal2 = CB2->getCalledOperand()->stripPointerCasts();
2818 auto *CalleeFunc2 = dyn_cast<Function>(CalleeVal2);
2819 return CalleeFunc1 == CalleeFunc2;
2820}
2821
2822bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2823 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
2824 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
2825 bool &FoundMultipleCalleeChains) {
2826 // Stop recursive search if we have already explored the maximum specified
2827 // depth.
2828 if (Depth > TailCallSearchDepth)
2829 return false;
2830
2831 auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {
2832 // Make a CallsiteInfo for each discovered callee, if one hasn't already
2833 // been synthesized.
2834 if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||
2835 !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))
2836 // StackIds is empty (we don't have debug info available in the index for
2837 // these callsites)
2838 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
2839 std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());
2840 CallsiteInfo *NewCallsiteInfo =
2841 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
2842 FoundCalleeChain.push_back({NewCallsiteInfo, FS});
2843 };
2844
2845 // Look for tail calls in this function, and check if they either call the
2846 // profiled callee directly, or indirectly (via a recursive search).
2847 // Only succeed if there is a single unique tail call chain found between the
2848 // profiled caller and callee, otherwise we could perform incorrect cloning.
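 // Unlike the IR-based version above, we cannot scan instructions here;
 // instead the search walks the function summary's call edges and relies on
 // the hasTailCall() flag recorded on each call edge when the summary was
 // built.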
2849 bool FoundSingleCalleeChain = false;
2850 for (auto &S : CurCallee.getSummaryList()) {
2851 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2852 !isPrevailing(CurCallee.getGUID(), S.get()))
2853 continue;
2854 auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
2855 if (!FS)
2856 continue;
2857 auto FSVI = CurCallee;
2858 auto *AS = dyn_cast<AliasSummary>(S.get());
2859 if (AS)
2860 FSVI = AS->getAliaseeVI();
2861 for (auto &CallEdge : FS->calls()) {
2862 if (!CallEdge.second.hasTailCall())
2863 continue;
2864 if (CallEdge.first == ProfiledCallee) {
2865 if (FoundSingleCalleeChain) {
2866 FoundMultipleCalleeChains = true;
2867 return false;
2868 }
2869 FoundSingleCalleeChain = true;
2870 FoundProfiledCalleeCount++;
2871 FoundProfiledCalleeDepth += Depth;
2872 if (Depth > FoundProfiledCalleeMaxDepth)
2873 FoundProfiledCalleeMaxDepth = Depth;
2874 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2875 // Add FS to FSToVIMap in case it isn't already there.
2876 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2877 FSToVIMap[FS] = FSVI;
2878 } else if (findProfiledCalleeThroughTailCalls(
2879 ProfiledCallee, CallEdge.first, Depth + 1,
2880 FoundCalleeChain, FoundMultipleCalleeChains)) {
2881 // findProfiledCalleeThroughTailCalls should not have returned
2882 // true if FoundMultipleCalleeChains.
2883 assert(!FoundMultipleCalleeChains);
2884 if (FoundSingleCalleeChain) {
2885 FoundMultipleCalleeChains = true;
2886 return false;
2887 }
2888 FoundSingleCalleeChain = true;
2889 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2890 // Add FS to FSToVIMap in case it isn't already there.
2891 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2892 FSToVIMap[FS] = FSVI;
2893 } else if (FoundMultipleCalleeChains)
2894 return false;
2895 }
2896 }
2897
2898 return FoundSingleCalleeChain;
2899}
2900
2901const FunctionSummary *
2902IndexCallsiteContextGraph::getCalleeFunc(IndexCall &Call) {
2903 ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
2904 if (Callee.getSummaryList().empty())
2905 return nullptr;
2906 return dyn_cast<FunctionSummary>(Callee.getSummaryList()[0]->getBaseObject());
2907}
2908
2909bool IndexCallsiteContextGraph::calleeMatchesFunc(
2910 IndexCall &Call, const FunctionSummary *Func,
2911 const FunctionSummary *CallerFunc,
2912 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {
2913 ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
2914 // If there is no summary list then this is a call to an externally defined
2915 // symbol.
2916 AliasSummary *Alias =
2917 Callee.getSummaryList().empty()
2918 ? nullptr
2919 : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
2920 assert(FSToVIMap.count(Func));
2921 auto FuncVI = FSToVIMap[Func];
2922 if (Callee == FuncVI ||
2923 // If callee is an alias, check the aliasee, since only function
2924 // summary base objects will contain the stack node summaries and thus
2925 // get a context node.
2926 (Alias && Alias->getAliaseeVI() == FuncVI))
2927 return true;
2928
2929 // Recursively search for the profiled callee through tail calls starting with
2930 // the actual Callee. The discovered tail call chain is saved in
2931 // FoundCalleeChain, and we will fixup the graph to include these callsites
2932 // after returning.
2933 // FIXME: We will currently redo the same recursive walk if we find the same
2934 // mismatched callee from another callsite. We can improve this with more
2935 // bookkeeping of the created chain of new nodes for each mismatch.
2936 unsigned Depth = 1;
2937 bool FoundMultipleCalleeChains = false;
2938 if (!findProfiledCalleeThroughTailCalls(
2939 FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
2940 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI
2941 << " from " << FSToVIMap[CallerFunc]
2942 << " that actually called " << Callee
2943 << (FoundMultipleCalleeChains
2944 ? " (found multiple possible chains)"
2945 : "")
2946 << "\n");
2947 if (FoundMultipleCalleeChains)
2948 FoundProfiledCalleeNonUniquelyCount++;
2949 return false;
2950 }
2951
2952 return true;
2953}
2954
2955bool IndexCallsiteContextGraph::sameCallee(IndexCall &Call1, IndexCall &Call2) {
2956 ValueInfo Callee1 = dyn_cast_if_present<CallsiteInfo *>(Call1)->Callee;
2957 ValueInfo Callee2 = dyn_cast_if_present<CallsiteInfo *>(Call2)->Callee;
2958 return Callee1 == Callee2;
2959}
2960
2961template <typename DerivedCCG, typename FuncTy, typename CallTy>
2962void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
2963 const {
2964 print(dbgs());
2965 dbgs() << "\n";
2966}
2967
2968template <typename DerivedCCG, typename FuncTy, typename CallTy>
2969void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
2970 raw_ostream &OS) const {
2971 OS << "Node " << this << "\n";
2972 OS << "\t";
2973 printCall(OS);
2974 if (Recursive)
2975 OS << " (recursive)";
2976 OS << "\n";
2977 if (!MatchingCalls.empty()) {
2978 OS << "\tMatchingCalls:\n";
2979 for (auto &MatchingCall : MatchingCalls) {
2980 OS << "\t";
2981 MatchingCall.print(OS);
2982 OS << "\n";
2983 }
2984 }
2985 OS << "\tNodeId: " << NodeId << "\n";
2986 OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
2987 OS << "\tContextIds:";
2988 // Make a copy of the computed context ids that we can sort for stability.
2989 auto ContextIds = getContextIds();
2990 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2991 std::sort(SortedIds.begin(), SortedIds.end());
2992 for (auto Id : SortedIds)
2993 OS << " " << Id;
2994 OS << "\n";
2995 OS << "\tCalleeEdges:\n";
2996 for (auto &Edge : CalleeEdges)
2997 OS << "\t\t" << *Edge << " (Callee NodeId: " << Edge->Callee->NodeId
2998 << ")\n";
2999 OS << "\tCallerEdges:\n";
3000 for (auto &Edge : CallerEdges)
3001 OS << "\t\t" << *Edge << " (Caller NodeId: " << Edge->Caller->NodeId
3002 << ")\n";
3003 if (!Clones.empty()) {
3004 OS << "\tClones: ";
3005 bool First = true;
3006 for (auto *C : Clones) {
3007 if (!First)
3008 OS << ", ";
3009 First = false;
3010 OS << C << " NodeId: " << C->NodeId;
3011 }
3012 OS << "\n";
3013 } else if (CloneOf) {
3014 OS << "\tClone of " << CloneOf << " NodeId: " << CloneOf->NodeId << "\n";
3015 }
3016}
3017
3018template <typename DerivedCCG, typename FuncTy, typename CallTy>
3019void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
3020 const {
3021 print(dbgs());
3022 dbgs() << "\n";
3023}
3024
3025template <typename DerivedCCG, typename FuncTy, typename CallTy>
3026void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
3027 raw_ostream &OS) const {
3028 OS << "Edge from Callee " << Callee << " to Caller: " << Caller
3029 << (IsBackedge ? " (BE)" : "")
3030 << " AllocTypes: " << getAllocTypeString(AllocTypes);
3031 OS << " ContextIds:";
3032 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3033 std::sort(SortedIds.begin(), SortedIds.end());
3034 for (auto Id : SortedIds)
3035 OS << " " << Id;
3036}
3037
3038template <typename DerivedCCG, typename FuncTy, typename CallTy>
3039void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
3040 print(dbgs());
3041}
3042
3043template <typename DerivedCCG, typename FuncTy, typename CallTy>
3044void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
3045 raw_ostream &OS) const {
3046 OS << "Callsite Context Graph:\n";
3047 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3048 for (const auto Node : nodes<GraphType>(this)) {
3049 if (Node->isRemoved())
3050 continue;
3051 Node->print(OS);
3052 OS << "\n";
3053 }
3054}
3055
3056template <typename DerivedCCG, typename FuncTy, typename CallTy>
3057void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
3058 raw_ostream &OS) const {
3059 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3060 for (const auto Node : nodes<GraphType>(this)) {
3061 if (Node->isRemoved())
3062 continue;
3063 if (!Node->IsAllocation)
3064 continue;
3065 DenseSet<uint32_t> ContextIds = Node->getContextIds();
3066 auto AllocTypeFromCall = getAllocationCallType(Node->Call);
3067 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3068 std::sort(SortedIds.begin(), SortedIds.end());
3069 for (auto Id : SortedIds) {
3070 auto TypeI = ContextIdToAllocationType.find(Id);
3071 assert(TypeI != ContextIdToAllocationType.end());
3072 auto CSI = ContextIdToContextSizeInfos.find(Id);
3073 if (CSI != ContextIdToContextSizeInfos.end()) {
3074 for (auto &Info : CSI->second) {
3075 OS << "MemProf hinting: "
3076 << getAllocTypeString((uint8_t)TypeI->second)
3077 << " full allocation context " << Info.FullStackId
3078 << " with total size " << Info.TotalSize << " is "
3079 << getAllocTypeString(Node->AllocTypes) << " after cloning";
3080 if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall)
3081 OS << " marked " << getAllocTypeString((uint8_t)AllocTypeFromCall)
3082 << " due to cold byte percent";
3083 // Print the internal context id to aid debugging and visualization.
3084 OS << " (context id " << Id << ")";
3085 OS << "\n";
3086 }
3087 }
3088 }
3089 }
3090}
3091
3092template <typename DerivedCCG, typename FuncTy, typename CallTy>
3093void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
3094 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3095 for (const auto Node : nodes<GraphType>(this)) {
3096 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3097 for (auto &Edge : Node->CallerEdges)
3098 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
3099 }
3100}
3101
3102template <typename DerivedCCG, typename FuncTy, typename CallTy>
3103struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
3104 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3105 using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;
3106
3107 using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
3108 static NodeRef getNode(const NodePtrTy &P) { return P.get(); }
3109
3110 using nodes_iterator =
3111 mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,
3112 decltype(&getNode)>;
3113
3114 static nodes_iterator nodes_begin(GraphType G) {
3115 return nodes_iterator(G->NodeOwner.begin(), &getNode);
3116 }
3117
3118 static nodes_iterator nodes_end(GraphType G) {
3119 return nodes_iterator(G->NodeOwner.end(), &getNode);
3120 }
3121
3122 static NodeRef getEntryNode(GraphType G) {
3123 return G->NodeOwner.begin()->get();
3124 }
3125
3126 using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
3127 static const ContextNode<DerivedCCG, FuncTy, CallTy> *
3128 GetCallee(const EdgePtrTy &P) {
3129 return P->Callee;
3130 }
3131
3132 using ChildIteratorType =
3133 mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
3134 DerivedCCG, FuncTy, CallTy>>>::const_iterator,
3135 decltype(&GetCallee)>;
3136
3137 static ChildIteratorType child_begin(NodeRef N) {
3138 return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
3139 }
3140
3141 static ChildIteratorType child_end(NodeRef N) {
3142 return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
3143 }
3144};
3145
3146template <typename DerivedCCG, typename FuncTy, typename CallTy>
3147struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
3148 : public DefaultDOTGraphTraits {
3149 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {
3150 // If the user requested the full graph to be exported, but provided an
3151 // allocation id, or if the user gave a context id and requested more than
3152 // just a specific context to be exported, note that highlighting is
3153 // enabled.
3154 DoHighlight =
3155 (AllocIdForDot.getNumOccurrences() && DotGraphScope == DotScope::All) ||
3156 (ContextIdForDot.getNumOccurrences() &&
3157 DotGraphScope != DotScope::Context);
3158 }
3159
3160 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3161 using GTraits = GraphTraits<GraphType>;
3162 using NodeRef = typename GTraits::NodeRef;
3163 using ChildIteratorType = typename GTraits::ChildIteratorType;
3164
3165 static std::string getNodeLabel(NodeRef Node, GraphType G) {
3166 std::string LabelString =
3167 (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
3168 Twine(Node->OrigStackOrAllocId) + " NodeId: " + Twine(Node->NodeId))
3169 .str();
3170 LabelString += "\n";
3171 if (Node->hasCall()) {
3172 auto Func = G->NodeToCallingFunc.find(Node);
3173 assert(Func != G->NodeToCallingFunc.end());
3174 LabelString +=
3175 G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
3176 } else {
3177 LabelString += "null call";
3178 if (Node->Recursive)
3179 LabelString += " (recursive)";
3180 else
3181 LabelString += " (external)";
3182 }
3183 return LabelString;
3184 }
3185
3186 static std::string getNodeAttributes(NodeRef Node, GraphType G) {
3187 auto ContextIds = Node->getContextIds();
3188 // If highlighting enabled, see if this node contains any of the context ids
3189 // of interest. If so, it will use a different color and a larger fontsize
3190 // (which makes the node larger as well).
3191 bool Highlight = false;
3192 if (DoHighlight) {
3193 assert(ContextIdForDot.getNumOccurrences() ||
3194 AllocIdForDot.getNumOccurrences());
3195 if (ContextIdForDot.getNumOccurrences())
3196 Highlight = ContextIds.contains(ContextIdForDot);
3197 else
3198 Highlight = set_intersects(ContextIds, G->DotAllocContextIds);
3199 }
3200 std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
3201 getContextIds(ContextIds) + "\"")
3202 .str();
3203 // Default fontsize is 14
3204 if (Highlight)
3205 AttributeString += ",fontsize=\"30\"";
3206 AttributeString +=
3207 (Twine(",fillcolor=\"") + getColor(Node->AllocTypes, Highlight) + "\"")
3208 .str();
3209 if (Node->CloneOf) {
3210 AttributeString += ",color=\"blue\"";
3211 AttributeString += ",style=\"filled,bold,dashed\"";
3212 } else
3213 AttributeString += ",style=\"filled\"";
3214 return AttributeString;
3215 }
3216
3217 static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
3218 GraphType G) {
3219 auto &Edge = *(ChildIter.getCurrent());
3220 // If highlighting enabled, see if this edge contains any of the context ids
3221 // of interest. If so, it will use a different color and a heavier arrow
3222 // size and weight (the larger weight makes the highlighted path
3223 // straighter).
3224 bool Highlight = false;
3225 if (DoHighlight) {
3226 assert(ContextIdForDot.getNumOccurrences() ||
3227 AllocIdForDot.getNumOccurrences());
3228 if (ContextIdForDot.getNumOccurrences())
3229 Highlight = Edge->ContextIds.contains(ContextIdForDot);
3230 else
3231 Highlight = set_intersects(Edge->ContextIds, G->DotAllocContextIds);
3232 }
3233 auto Color = getColor(Edge->AllocTypes, Highlight);
3234 std::string AttributeString =
3235 (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" +
3236 // fillcolor is the arrow head and color is the line
3237 Twine(",fillcolor=\"") + Color + "\"" + Twine(",color=\"") + Color +
3238 "\"")
3239 .str();
3240 if (Edge->IsBackedge)
3241 AttributeString += ",style=\"dotted\"";
3242 // Default penwidth and weight are both 1.
3243 if (Highlight)
3244 AttributeString += ",penwidth=\"2.0\",weight=\"2\"";
3245 return AttributeString;
3246 }
3247
3248 // Since the NodeOwners list includes nodes that are no longer connected to
3249 // the graph, skip them here.
3250 static bool isNodeHidden(NodeRef Node, GraphType G) {
3251 if (Node->isRemoved())
3252 return true;
3253 // If a scope smaller than the full graph was requested, see if this node
3254 // contains any of the context ids of interest.
3255 if (DotGraphScope == DotScope::Alloc)
3256 return !set_intersects(Node->getContextIds(), G->DotAllocContextIds);
3257 if (DotGraphScope == DotScope::Context)
3258 return !Node->getContextIds().contains(ContextIdForDot);
3259 return false;
3260 }
3261
3262private:
3263 static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
3264 std::string IdString = "ContextIds:";
3265 if (ContextIds.size() < 100) {
3266 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3267 std::sort(SortedIds.begin(), SortedIds.end());
3268 for (auto Id : SortedIds)
3269 IdString += (" " + Twine(Id)).str();
3270 } else {
3271 IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
3272 }
3273 return IdString;
3274 }
3275
3276 static std::string getColor(uint8_t AllocTypes, bool Highlight) {
3277 // If DoHighlight is not enabled, we want to use the highlight colors for
3278 // NotCold and Cold, and the non-highlight color for NotCold+Cold. This is
3279 // both compatible with the color scheme before highlighting was supported,
3280 // and for the NotCold+Cold color the non-highlight color is a bit more
3281 // readable.
3282 if (AllocTypes == (uint8_t)AllocationType::NotCold)
3283 // Color "brown1" actually looks like a lighter red.
3284 return !DoHighlight || Highlight ? "brown1" : "lightpink";
3285 if (AllocTypes == (uint8_t)AllocationType::Cold)
3286 return !DoHighlight || Highlight ? "cyan" : "lightskyblue";
3287 if (AllocTypes ==
3288 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
3289 return Highlight ? "magenta" : "mediumorchid1";
3290 return "gray";
3291 }
3292
3293 static std::string getNodeId(NodeRef Node) {
3294 std::stringstream SStream;
3295 SStream << std::hex << "N0x" << (unsigned long long)Node;
3296 std::string Result = SStream.str();
3297 return Result;
3298 }
3299
3300 // True if we should highlight a specific context or allocation's contexts in
3301 // the emitted graph.
3302 static bool DoHighlight;
3303};
3304
3305template <typename DerivedCCG, typename FuncTy, typename CallTy>
3306bool DOTGraphTraits<
3307 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>::DoHighlight =
3308 false;
3309
3310template <typename DerivedCCG, typename FuncTy, typename CallTy>
3311void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
3312 std::string Label) const {
3313 WriteGraph(this, "", false, Label,
3314 DotFilePathPrefix + "ccg." + Label + ".dot");
3315}
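// The output file name is composed as <DotFilePathPrefix>ccg.<Label>.dot, so
// e.g. a Label such as "cloned" produces a file ending in ccg.cloned.dot.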
3316
3317template <typename DerivedCCG, typename FuncTy, typename CallTy>
3318typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
3319CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
3320 const std::shared_ptr<ContextEdge> &Edge,
3321 DenseSet<uint32_t> ContextIdsToMove) {
3322 ContextNode *Node = Edge->Callee;
3323 assert(NodeToCallingFunc.count(Node));
3324 ContextNode *Clone =
3325 createNewNode(Node->IsAllocation, NodeToCallingFunc[Node], Node->Call);
3326 Node->addClone(Clone);
3327 Clone->MatchingCalls = Node->MatchingCalls;
3328 moveEdgeToExistingCalleeClone(Edge, Clone, /*NewClone=*/true,
3329 ContextIdsToMove);
3330 return Clone;
3331}
3332
3333template <typename DerivedCCG, typename FuncTy, typename CallTy>
3334void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3335 moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
3336 ContextNode *NewCallee, bool NewClone,
3337 DenseSet<uint32_t> ContextIdsToMove) {
3338 // NewCallee and Edge's current callee must be clones of the same original
3339 // node (Edge's current callee may be the original node too).
3340 assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
3341
3342 bool EdgeIsRecursive = Edge->Callee == Edge->Caller;
3343
3344 ContextNode *OldCallee = Edge->Callee;
3345
3346 // We might already have an edge to the new callee from earlier cloning for a
3347 // different allocation. If one exists we will reuse it.
3348 auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller);
3349
3350 // Callers will pass an empty ContextIdsToMove set when they want to move the
3351 // edge. Copy in Edge's ids for simplicity.
3352 if (ContextIdsToMove.empty())
3353 ContextIdsToMove = Edge->getContextIds();
3354
3355 // If we are moving all of Edge's ids, then just move the whole Edge.
3356 // Otherwise only move the specified subset, to a new edge if needed.
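 // For example (hypothetical ids): if Edge carries context ids {1,2,3}, an
 // empty ContextIdsToMove (or exactly {1,2,3}) moves the whole edge to
 // NewCallee, while ContextIdsToMove = {2} splits id 2 onto a (possibly
 // pre-existing) edge to NewCallee and leaves {1,3} on the original Edge.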
3357 if (Edge->getContextIds().size() == ContextIdsToMove.size()) {
3358 // First, update the alloc types on New Callee from Edge.
3359 // Do this before we potentially clear Edge's fields below!
3360 NewCallee->AllocTypes |= Edge->AllocTypes;
3361 // Moving the whole Edge.
3362 if (ExistingEdgeToNewCallee) {
3363 // Since we already have an edge to NewCallee, simply move the ids
3364 // onto it, and remove the existing Edge.
3365 ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
3366 ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes;
3367 assert(Edge->ContextIds == ContextIdsToMove);
3368 removeEdgeFromGraph(Edge.get());
3369 } else {
3370 // Otherwise just reconnect Edge to NewCallee.
3371 Edge->Callee = NewCallee;
3372 NewCallee->CallerEdges.push_back(Edge);
3373 // Remove it from callee where it was previously connected.
3374 OldCallee->eraseCallerEdge(Edge.get());
3375 // Don't need to update Edge's context ids since we are simply
3376 // reconnecting it.
3377 }
3378 } else {
3379 // Only moving a subset of Edge's ids.
3380 // Compute the alloc type of the subset of ids being moved.
3381 auto CallerEdgeAllocType = computeAllocType(ContextIdsToMove);
3382 if (ExistingEdgeToNewCallee) {
3383 // Since we already have an edge to NewCallee, simply move the ids
3384 // onto it.
3385 ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
3386 ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType;
3387 } else {
3388 // Otherwise, create a new edge to NewCallee for the ids being moved.
3389 auto NewEdge = std::make_shared<ContextEdge>(
3390 NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove);
3391 Edge->Caller->CalleeEdges.push_back(NewEdge);
3392 NewCallee->CallerEdges.push_back(NewEdge);
3393 }
3394 // In either case, need to update the alloc types on NewCallee, and remove
3395 // those ids and update the alloc type on the original Edge.
3396 NewCallee->AllocTypes |= CallerEdgeAllocType;
3397 set_subtract(Edge->ContextIds, ContextIdsToMove);
3398 Edge->AllocTypes = computeAllocType(Edge->ContextIds);
3399 }
3400 // Now walk the old callee node's callee edges and move Edge's context ids
3401 // over to the corresponding edge into the clone (which is created here if
3402 // this is a newly created clone).
3403 for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
3404 ContextNode *CalleeToUse = OldCalleeEdge->Callee;
3405 // If this is a direct recursion edge, use NewCallee (the clone) as the
3406 // callee as well, so that any edge updated/created here is also direct
3407 // recursive.
3408 if (CalleeToUse == OldCallee) {
3409 // If this is a recursive edge, see if we already moved a recursive edge
3410 // (which would have to have been this one) - if we were only moving a
3411 // subset of context ids it would still be on OldCallee.
3412 if (EdgeIsRecursive) {
3413 assert(OldCalleeEdge == Edge);
3414 continue;
3415 }
3416 CalleeToUse = NewCallee;
3417 }
3418 // The context ids moving to the new callee are the subset of this edge's
3419 // context ids and the context ids on the caller edge being moved.
3420 DenseSet<uint32_t> EdgeContextIdsToMove =
3421 set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove);
3422 set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
3423 OldCalleeEdge->AllocTypes =
3424 computeAllocType(OldCalleeEdge->getContextIds());
3425 if (!NewClone) {
3426 // Update context ids / alloc type on corresponding edge to NewCallee.
3427 // There is a chance this may not exist if we are reusing an existing
3428 // clone, specifically during function assignment, where we would have
3429 // removed none type edges after creating the clone. If we can't find
3430 // a corresponding edge there, fall through to the cloning below.
3431 if (auto *NewCalleeEdge = NewCallee->findEdgeFromCallee(CalleeToUse)) {
3432 NewCalleeEdge->getContextIds().insert_range(EdgeContextIdsToMove);
3433 NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
3434 continue;
3435 }
3436 }
3437 auto NewEdge = std::make_shared<ContextEdge>(
3438 CalleeToUse, NewCallee, computeAllocType(EdgeContextIdsToMove),
3439 EdgeContextIdsToMove);
3440 NewCallee->CalleeEdges.push_back(NewEdge);
3441 NewEdge->Callee->CallerEdges.push_back(NewEdge);
3442 }
3443 // Recompute the node alloc type now that its callee edges have been
3444 // updated (since we will compute from those edges).
3445 OldCallee->AllocTypes = OldCallee->computeAllocType();
3446 // OldCallee alloc type should be None iff its context id set is now empty.
3447 assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
3448 OldCallee->emptyContextIds());
3449 if (VerifyCCG) {
3450 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
3451 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
3452 for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
3453 checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
3454 /*CheckEdges=*/false);
3455 for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
3456 checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
3457 /*CheckEdges=*/false);
3458 }
3459}
3460
3461template <typename DerivedCCG, typename FuncTy, typename CallTy>
3462void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3463 moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
3464 ContextNode *NewCaller) {
3465 auto *OldCallee = Edge->Callee;
3466 auto *NewCallee = OldCallee;
3467 // If this edge was direct recursive, make any new/updated edge also direct
3468 // recursive to NewCaller.
3469 bool Recursive = Edge->Caller == Edge->Callee;
3470 if (Recursive)
3471 NewCallee = NewCaller;
3472
3473 ContextNode *OldCaller = Edge->Caller;
3474 OldCaller->eraseCalleeEdge(Edge.get());
3475
3476 // We might already have an edge to the new caller. If one exists we will
3477 // reuse it.
3478 auto ExistingEdgeToNewCaller = NewCaller->findEdgeFromCallee(NewCallee);
3479
3480 if (ExistingEdgeToNewCaller) {
3481 // Since we already have an edge to NewCaller, simply move the ids
3482 // onto it, and remove the existing Edge.
3483 ExistingEdgeToNewCaller->getContextIds().insert_range(
3484 Edge->getContextIds());
3485 ExistingEdgeToNewCaller->AllocTypes |= Edge->AllocTypes;
3486 Edge->ContextIds.clear();
3487 Edge->AllocTypes = (uint8_t)AllocationType::None;
3488 OldCallee->eraseCallerEdge(Edge.get());
3489 } else {
3490 // Otherwise just reconnect Edge to NewCaller.
3491 Edge->Caller = NewCaller;
3492 NewCaller->CalleeEdges.push_back(Edge);
3493 if (Recursive) {
3494 assert(NewCallee == NewCaller);
3495 // In the case of (direct) recursive edges, we update the callee as well
3496 // so that it becomes recursive on the new caller.
3497 Edge->Callee = NewCallee;
3498 NewCallee->CallerEdges.push_back(Edge);
3499 OldCallee->eraseCallerEdge(Edge.get());
3500 }
3501 // Don't need to update Edge's context ids since we are simply
3502 // reconnecting it.
3503 }
3504 // In either case, need to update the alloc types on New Caller.
3505 NewCaller->AllocTypes |= Edge->AllocTypes;
3506
3507 // Now walk the old caller node's caller edges and move Edge's context ids
3508 // over to the corresponding edge into the node (which is created here if
3509 // this is a newly created node). We can tell whether this is a newly created
3510 // node by seeing if it has any caller edges yet.
3511#ifndef NDEBUG
3512 bool IsNewNode = NewCaller->CallerEdges.empty();
3513#endif
3514 // If we just moved a direct recursive edge, presumably its context ids should
3515 // also flow out of OldCaller via some other non-recursive callee edge. We
3516 // don't want to remove the recursive context ids from other caller edges yet,
3517 // otherwise the context ids get into an inconsistent state on OldCaller.
3518 // We will update these context ids on the non-recursive caller edge when and
3519 // if they are updated on the non-recursive callee.
3520 if (!Recursive) {
3521 for (auto &OldCallerEdge : OldCaller->CallerEdges) {
3522 auto OldCallerCaller = OldCallerEdge->Caller;
3523 // The context ids moving to the new caller are the subset of this edge's
3524 // context ids and the context ids on the callee edge being moved.
3525 DenseSet<uint32_t> EdgeContextIdsToMove = set_intersection(
3526 OldCallerEdge->getContextIds(), Edge->getContextIds());
3527 if (OldCaller == OldCallerCaller) {
3528 OldCallerCaller = NewCaller;
3529 // Don't actually move this one. The caller will move it directly via a
3530 // call to this function with this as the Edge if it is appropriate to
3531 // move to a diff node that has a matching callee (itself).
3532 continue;
3533 }
3534 set_subtract(OldCallerEdge->getContextIds(), EdgeContextIdsToMove);
3535 OldCallerEdge->AllocTypes =
3536 computeAllocType(OldCallerEdge->getContextIds());
3537 // In this function we expect that any pre-existing node already has edges
3538 // from the same callers as the old node. That should be true in the
3539 // current use case, where we will remove None-type edges after copying
3540 // over all caller edges from the callee.
3541 auto *ExistingCallerEdge = NewCaller->findEdgeFromCaller(OldCallerCaller);
3542 // Since we would have skipped caller edges when moving a direct recursive
3543 // edge, this may not hold true when recursive handling enabled.
3544 assert(IsNewNode || ExistingCallerEdge || AllowRecursiveCallsites);
3545 if (ExistingCallerEdge) {
3546 ExistingCallerEdge->getContextIds().insert_range(EdgeContextIdsToMove);
3547 ExistingCallerEdge->AllocTypes |=
3548 computeAllocType(EdgeContextIdsToMove);
3549 continue;
3550 }
3551 auto NewEdge = std::make_shared<ContextEdge>(
3552 NewCaller, OldCallerCaller, computeAllocType(EdgeContextIdsToMove),
3553 EdgeContextIdsToMove);
3554 NewCaller->CallerEdges.push_back(NewEdge);
3555 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
3556 }
3557 }
3558 // Recompute the node alloc type now that its caller edges have been
3559 // updated (since we will compute from those edges).
3560 OldCaller->AllocTypes = OldCaller->computeAllocType();
3561 // OldCaller alloc type should be None iff its context id set is now empty.
3562 assert((OldCaller->AllocTypes == (uint8_t)AllocationType::None) ==
3563 OldCaller->emptyContextIds());
3564 if (VerifyCCG) {
3565 checkNode<DerivedCCG, FuncTy, CallTy>(OldCaller, /*CheckEdges=*/false);
3566 checkNode<DerivedCCG, FuncTy, CallTy>(NewCaller, /*CheckEdges=*/false);
3567 for (const auto &OldCallerEdge : OldCaller->CallerEdges)
3568 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallerEdge->Caller,
3569 /*CheckEdges=*/false);
3570 for (const auto &NewCallerEdge : NewCaller->CallerEdges)
3571 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallerEdge->Caller,
3572 /*CheckEdges=*/false);
3573 }
3574}
3575
3576template <typename DerivedCCG, typename FuncTy, typename CallTy>
3577void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3578 recursivelyRemoveNoneTypeCalleeEdges(
3579 ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
3580 auto Inserted = Visited.insert(Node);
3581 if (!Inserted.second)
3582 return;
3583
3584 removeNoneTypeCalleeEdges(Node);
3585
3586 for (auto *Clone : Node->Clones)
3587 recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited);
3588
3589 // The recursive call may remove some of this Node's caller edges.
3590 // Iterate over a copy and skip any that were removed.
3591 auto CallerEdges = Node->CallerEdges;
3592 for (auto &Edge : CallerEdges) {
3593 // Skip any that have been removed by an earlier recursive call.
3594 if (Edge->isRemoved()) {
3595 assert(!is_contained(Node->CallerEdges, Edge));
3596 continue;
3597 }
3598 recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited);
3599 }
3600}
3601
3602// This is the standard DFS based backedge discovery algorithm.
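// An edge is marked IsBackedge when its callee is already on the current DFS
// stack, i.e. the edge closes a cycle formed by recursive calls; such edges
// get deferred handling during cloning (see identifyClones).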
3603template <typename DerivedCCG, typename FuncTy, typename CallTy>
3604void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges() {
3605 // If we are cloning recursive contexts, find and mark backedges from all root
3606 // callers, using the typical DFS based backedge analysis.
3607 if (!CloneRecursiveContexts)
3608 return;
3609 DenseSet<const ContextNode *> Visited;
3610 DenseSet<const ContextNode *> CurrentStack;
3611 for (auto &Entry : NonAllocationCallToContextNodeMap) {
3612 auto *Node = Entry.second;
3613 if (Node->isRemoved())
3614 continue;
3615 // It is a root if it doesn't have callers.
3616 if (!Node->CallerEdges.empty())
3617 continue;
3618 markBackedges(Node, Visited, CurrentStack);
3619 assert(CurrentStack.empty());
3620 }
3621}
3622
3623// Recursive helper for above markBackedges method.
3624template <typename DerivedCCG, typename FuncTy, typename CallTy>
3625void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges(
3626 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3627 DenseSet<const ContextNode *> &CurrentStack) {
3628 auto I = Visited.insert(Node);
3629 // We should only call this for unvisited nodes.
3630 assert(I.second);
3631 (void)I;
3632 for (auto &CalleeEdge : Node->CalleeEdges) {
3633 auto *Callee = CalleeEdge->Callee;
3634 if (Visited.count(Callee)) {
3635 // Since this was already visited we need to check if it is currently on
3636 // the recursive stack in which case it is a backedge.
3637 if (CurrentStack.count(Callee))
3638 CalleeEdge->IsBackedge = true;
3639 continue;
3640 }
3641 CurrentStack.insert(Callee);
3642 markBackedges(Callee, Visited, CurrentStack);
3643 CurrentStack.erase(Callee);
3644 }
3645}
3646
3647template <typename DerivedCCG, typename FuncTy, typename CallTy>
3648void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
3649 DenseSet<const ContextNode *> Visited;
3650 for (auto &Entry : AllocationCallToContextNodeMap) {
3651 Visited.clear();
3652 identifyClones(Entry.second, Visited, Entry.second->getContextIds());
3653 }
3654 Visited.clear();
3655 for (auto &Entry : AllocationCallToContextNodeMap)
3656 recursivelyRemoveNoneTypeCalleeEdges(Entry.second, Visited);
3657 if (VerifyCCG)
3658 check();
3659}
3660
3661// helper function to check an AllocType is cold or notcold or both.
3662bool checkColdOrNotCold(uint8_t AllocType) {
3663 return (AllocType == (uint8_t)AllocationType::Cold) ||
3664 (AllocType == (uint8_t)AllocationType::NotCold) ||
3665 (AllocType ==
3666 ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
3667}
3668
3669template <typename DerivedCCG, typename FuncTy, typename CallTy>
3670void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
3671 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3672 const DenseSet<uint32_t> &AllocContextIds) {
3673 if (VerifyNodes)
3674 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3675 assert(!Node->CloneOf);
3676
3677 // If Node has a null call, then either it wasn't found in the module (regular
3678 // LTO) or summary index (ThinLTO), or there were other conditions blocking
3679 // cloning (e.g. recursion, calls multiple targets, etc).
3680 // Do this here so that we don't try to recursively clone callers below, which
3681 // isn't useful at least for this node.
3682 if (!Node->hasCall())
3683 return;
3684
3685 // No need to look at any callers if allocation type already unambiguous.
3686 if (hasSingleAllocType(Node->AllocTypes))
3687 return;
3688
3689#ifndef NDEBUG
3690 auto Insert =
3691#endif
3692 Visited.insert(Node);
3693 // We should not have visited this node yet.
3694 assert(Insert.second);
3695 // The recursive call to identifyClones may delete the current edge from the
3696 // CallerEdges vector. Make a copy and iterate on that, simpler than passing
3697 // in an iterator and having recursive call erase from it. Other edges may
3698 // also get removed during the recursion, which will have null Callee and
3699 // Caller pointers (and are deleted later), so we skip those below.
3700 {
3701 auto CallerEdges = Node->CallerEdges;
3702 for (auto &Edge : CallerEdges) {
3703 // Skip any that have been removed by an earlier recursive call.
3704 if (Edge->isRemoved()) {
3705 assert(!is_contained(Node->CallerEdges, Edge));
3706 continue;
3707 }
3708 // Defer backedges. See comments further below where these edges are
3709 // handled during the cloning of this Node.
3710 if (Edge->IsBackedge) {
3711 // We should only mark these if cloning recursive contexts, where we
3712 // need to do this deferral.
3713 assert(CloneRecursiveContexts);
3714 continue;
3715 }
3716 // Ignore any caller we previously visited via another edge.
3717 if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
3718 identifyClones(Edge->Caller, Visited, AllocContextIds);
3719 }
3720 }
3721 }
3722
3723 // Check if we reached an unambiguous call or have only a single caller.
3724 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
3725 return;
3726
3727 // We need to clone.
3728
3729 // Try to keep the original version as alloc type NotCold. This will make
3730 // cases with indirect calls or any other situation with an unknown call to
3731 // the original function get the default behavior. We do this by sorting the
3732 // CallerEdges of the Node we will clone by alloc type.
3733 //
3734 // Give NotCold edge the lowest sort priority so those edges are at the end of
3735 // the caller edges vector, and stay on the original version (since the below
3736 // code clones greedily until it finds all remaining edges have the same type
3737 // and leaves the remaining ones on the original Node).
3738 //
3739 // We shouldn't actually have any None type edges, so the sorting priority for
3740 // that is arbitrary, and we assert in that case below.
3741 const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
3742 /*Cold*/ 1,
3743 /*NotColdCold*/ 2};
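 // With these priorities the stable_sort below places Cold-only caller edges
 // first, then NotCold+Cold, then NotCold-only, so the greedy cloning peels
 // cold callers off onto clones while NotCold callers tend to remain on the
 // original node.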
3744 llvm::stable_sort(Node->CallerEdges,
3745 [&](const std::shared_ptr<ContextEdge> &A,
3746 const std::shared_ptr<ContextEdge> &B) {
3747 // Nodes with non-empty context ids should be sorted
3748 // before those with empty context ids.
3749 if (A->ContextIds.empty())
3750 // Either B ContextIds are non-empty (in which case we
3751 // should return false because B < A), or B ContextIds
3752 // are empty, in which case they are equal, and we
3753 // should maintain the original relative ordering.
3754 return false;
3755 if (B->ContextIds.empty())
3756 return true;
3757
3758 if (A->AllocTypes == B->AllocTypes)
3759 // Use the first context id for each edge as a
3760 // tie-breaker.
3761 return *A->ContextIds.begin() < *B->ContextIds.begin();
3762 return AllocTypeCloningPriority[A->AllocTypes] <
3763 AllocTypeCloningPriority[B->AllocTypes];
3764 });
3765
3766 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3767
3768 DenseSet<uint32_t> RecursiveContextIds;
3770 // If we are allowing recursive callsites, but have also disabled recursive
3771 // contexts, look for context ids that show up in multiple caller edges.
3772 if (AllowRecursiveCallsites && !CloneRecursiveContexts) {
3773 DenseSet<uint32_t> AllCallerContextIds;
3774 for (auto &CE : Node->CallerEdges) {
3775 // Resize to the largest set of caller context ids, since we know the
3776 // final set will be at least that large.
3777 AllCallerContextIds.reserve(CE->getContextIds().size());
3778 for (auto Id : CE->getContextIds())
3779 if (!AllCallerContextIds.insert(Id).second)
3780 RecursiveContextIds.insert(Id);
3781 }
3782 }
3783
3784 // Iterate until we find no more opportunities for disambiguating the alloc
3785 // types via cloning. In most cases this loop will terminate once the Node
3786 // has a single allocation type, in which case no more cloning is needed.
3787 // Iterate over a copy of Node's caller edges, since we may need to remove
3788 // edges in the moveEdgeTo* methods, and this simplifies the handling and
3789 // makes it less error-prone.
3790 auto CallerEdges = Node->CallerEdges;
3791 for (auto &CallerEdge : CallerEdges) {
3792 // Skip any that have been removed by an earlier recursive call.
3793 if (CallerEdge->isRemoved()) {
3794 assert(!is_contained(Node->CallerEdges, CallerEdge));
3795 continue;
3796 }
3797 assert(CallerEdge->Callee == Node);
3798
3799 // See if cloning the prior caller edge left this node with a single alloc
3800 // type or a single caller. In that case no more cloning of Node is needed.
3801 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
3802 break;
3803
3804 // If the caller was not successfully matched to a call in the IR/summary,
3805 // there is no point in trying to clone for it as we can't update that call.
3806 if (!CallerEdge->Caller->hasCall())
3807 continue;
3808
3809 // Only need to process the ids along this edge pertaining to the given
3810 // allocation.
3811 auto CallerEdgeContextsForAlloc =
3812 set_intersection(CallerEdge->getContextIds(), AllocContextIds);
3813 if (!RecursiveContextIds.empty())
3814 CallerEdgeContextsForAlloc =
3815 set_difference(CallerEdgeContextsForAlloc, RecursiveContextIds);
3816 if (CallerEdgeContextsForAlloc.empty())
3817 continue;
3818
3819 auto CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
3820
3821 // Compute the node callee edge alloc types corresponding to the context ids
3822 // for this caller edge.
3823 std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
3824 CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
3825 for (auto &CalleeEdge : Node->CalleeEdges)
3826 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
3827 CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
3828
3829 // Don't clone if doing so will not disambiguate any alloc types amongst
3830 // caller edges (including the callee edges that would be cloned).
3831 // Otherwise we will simply move all edges to the clone.
3832 //
3833 // First check if by cloning we will disambiguate the caller allocation
3834 // type from node's allocation type. Query allocTypeToUse so that we don't
3835 // bother cloning to distinguish NotCold+Cold from NotCold. Note that
3836 // neither of these should be None type.
3837 //
3838 // Then check if by cloning node at least one of the callee edges will be
3839 // disambiguated by splitting out different context ids.
3840 //
3841 // However, always do the cloning if this is a backedge, in which case we
3842 // have not yet cloned along this caller edge.
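 // For example (hypothetical types): if Node is NotCold+Cold and this caller
 // edge's contexts for the allocation are purely Cold, cloning separates the
 // cold contexts onto a clone. If instead the caller edge is also NotCold+Cold
 // and the clone's callee edges would look the same as Node's, cloning gains
 // nothing and is skipped (unless this is a deferred backedge).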
3843 assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
3844 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3845 if (!CallerEdge->IsBackedge &&
3846 allocTypeToUse(CallerAllocTypeForAlloc) ==
3847 allocTypeToUse(Node->AllocTypes) &&
3848 allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
3849 CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
3850 continue;
3851 }
3852
3853 if (CallerEdge->IsBackedge) {
3854 // We should only mark these if cloning recursive contexts, where we
3855 // need to do this deferral.
3856 assert(CloneRecursiveContexts);
3857 DeferredBackedges++;
3858 }
3859
3860 // If this is a backedge, we now do recursive cloning starting from its
3861 // caller since we may have moved unambiguous caller contexts to a clone
3862 // of this Node in a previous iteration of the current loop, giving more
3863 // opportunity for cloning through the backedge. Because we sorted the
3864 // caller edges earlier so that cold caller edges are first, we would have
3865 // visited and cloned this node for any unambiguously cold non-recursive
3866 // callers before any ambiguous backedge callers. Note that we don't do this
3867 // if the caller is already cloned or visited during cloning (e.g. via a
3868 // different context path from the allocation).
3869 // TODO: Can we do better in the case where the caller was already visited?
3870 if (CallerEdge->IsBackedge && !CallerEdge->Caller->CloneOf &&
3871 !Visited.count(CallerEdge->Caller)) {
3872 const auto OrigIdCount = CallerEdge->getContextIds().size();
3873 // Now do the recursive cloning of this backedge's caller, which was
3874 // deferred earlier.
3875 identifyClones(CallerEdge->Caller, Visited, CallerEdgeContextsForAlloc);
3876 removeNoneTypeCalleeEdges(CallerEdge->Caller);
3877 // See if the recursive call to identifyClones moved the context ids to a
3878 // new edge from this node to a clone of caller, and switch to looking at
3879 // that new edge so that we clone Node for the new caller clone.
3880 bool UpdatedEdge = false;
3881 if (OrigIdCount > CallerEdge->getContextIds().size()) {
3882 for (auto E : Node->CallerEdges) {
3883 // Only interested in clones of the current edge's caller.
3884 if (E->Caller->CloneOf != CallerEdge->Caller)
3885 continue;
3886 // See if this edge contains any of the context ids originally on the
3887 // current caller edge.
3888 auto CallerEdgeContextsForAllocNew =
3889 set_intersection(CallerEdgeContextsForAlloc, E->getContextIds());
3890 if (CallerEdgeContextsForAllocNew.empty())
3891 continue;
3892 // Make sure we don't pick a previously existing caller edge of this
3893 // Node, which would be processed on a different iteration of the
3894 // outer loop over the saved CallerEdges.
3895 if (llvm::is_contained(CallerEdges, E))
3896 continue;
3897 // The CallerAllocTypeForAlloc and CalleeEdgeAllocTypesForCallerEdge
3898 // are updated further below for all cases where we just invoked
3899 // identifyClones recursively.
3900 CallerEdgeContextsForAlloc.swap(CallerEdgeContextsForAllocNew);
3901 CallerEdge = E;
3902 UpdatedEdge = true;
3903 break;
3904 }
3905 }
3906 // If cloning removed this edge (and we didn't update it to a new edge
3907 // above), we're done with this edge. It's possible we moved all of the
3908 // context ids to an existing clone, in which case there's no need to do
3909 // further processing for them.
3910 if (CallerEdge->isRemoved())
3911 continue;
3912
3913 // Now we need to update the information used for the cloning decisions
3914 // further below, as we may have modified edges and their context ids.
3915
3916 // Note if we changed the CallerEdge above we would have already updated
3917 // the context ids.
3918 if (!UpdatedEdge) {
3919 CallerEdgeContextsForAlloc = set_intersection(
3920 CallerEdgeContextsForAlloc, CallerEdge->getContextIds());
3921 if (CallerEdgeContextsForAlloc.empty())
3922 continue;
3923 }
3924 // Update the other information that depends on the edges and on the now
3925 // updated CallerEdgeContextsForAlloc.
3926 CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
3927 CalleeEdgeAllocTypesForCallerEdge.clear();
3928 for (auto &CalleeEdge : Node->CalleeEdges) {
3929 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
3930 CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
3931 }
3932 }
3933
3934 // First see if we can use an existing clone. Check each clone and its
3935 // callee edges for matching alloc types.
3936 ContextNode *Clone = nullptr;
3937 for (auto *CurClone : Node->Clones) {
3938 if (allocTypeToUse(CurClone->AllocTypes) !=
3939 allocTypeToUse(CallerAllocTypeForAlloc))
3940 continue;
3941
3942 bool BothSingleAlloc = hasSingleAllocType(CurClone->AllocTypes) &&
3943 hasSingleAllocType(CallerAllocTypeForAlloc);
3944 // The above check should mean that if both have single alloc types that
3945 // they should be equal.
3946 assert(!BothSingleAlloc ||
3947 CurClone->AllocTypes == CallerAllocTypeForAlloc);
3948
3949 // If either both have a single alloc type (which are the same), or if the
3950 // clone's callee edges have the same alloc types as those for the current
3951 // allocation on Node's callee edges (CalleeEdgeAllocTypesForCallerEdge),
3952 // then we can reuse this clone.
3953 if (BothSingleAlloc || allocTypesMatchClone<DerivedCCG, FuncTy, CallTy>(
3954 CalleeEdgeAllocTypesForCallerEdge, CurClone)) {
3955 Clone = CurClone;
3956 break;
3957 }
3958 }
3959
3960 // The edge iterator is adjusted when we move the CallerEdge to the clone.
3961 if (Clone)
3962 moveEdgeToExistingCalleeClone(CallerEdge, Clone, /*NewClone=*/false,
3963 CallerEdgeContextsForAlloc);
3964 else
3965 Clone = moveEdgeToNewCalleeClone(CallerEdge, CallerEdgeContextsForAlloc);
3966
3967 // Sanity check that no alloc types on clone or its edges are None.
3968 assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
3969 }
3970
3971 // We should still have some context ids on the original Node.
3972 assert(!Node->emptyContextIds());
3973
3974 // Sanity check that no alloc types on node or edges are None.
3975 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3976
3977 if (VerifyNodes)
3978 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3979}
3980
3981void ModuleCallsiteContextGraph::updateAllocationCall(
3982 CallInfo &Call, AllocationType AllocType) {
3983 std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
3985 auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
3986 "memprof", AllocTypeString);
3987 cast<CallBase>(Call.call())->addFnAttr(A);
3988 OREGetter(Call.call()->getFunction())
3989 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
3990 << ore::NV("AllocationCall", Call.call()) << " in clone "
3991 << ore::NV("Caller", Call.call()->getFunction())
3992 << " marked with memprof allocation attribute "
3993 << ore::NV("Attribute", AllocTypeString));
3994}
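// Note: the net effect on the IR is a call-site string attribute, e.g. an
// allocation call ending up with "memprof"="cold" (or "memprof"="notcold") in
// its attribute set.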
3995
3996void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
3997 AllocationType AllocType) {
3998 auto *AI = cast<AllocInfo *>(Call.call());
3999 assert(AI);
4000 assert(AI->Versions.size() > Call.cloneNo());
4001 AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
4002}
4003
4004AllocationType
4005ModuleCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
4006 const auto *CB = cast<CallBase>(Call.call());
4007 if (!CB->getAttributes().hasFnAttr("memprof"))
4008 return AllocationType::None;
4009 return CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
4010 ? AllocationType::Cold
4011 : AllocationType::NotCold;
4012}
4013
4014AllocationType
4015IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
4016 const auto *AI = cast<AllocInfo *>(Call.call());
4017 assert(AI->Versions.size() > Call.cloneNo());
4018 return (AllocationType)AI->Versions[Call.cloneNo()];
4019}
4020
4021void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
4022 FuncInfo CalleeFunc) {
4023 auto *CurF = getCalleeFunc(CallerCall.call());
4024 auto NewCalleeCloneNo = CalleeFunc.cloneNo();
4025 if (isMemProfClone(*CurF)) {
4026 // If we already assigned this callsite to call a specific non-default
4027 // clone (i.e. not the original function which is clone 0), ensure that we
4028 // aren't trying to now update it to call a different clone, which is
4029 // indicative of a bug in the graph or function assignment.
4030 auto CurCalleeCloneNo = getMemProfCloneNum(*CurF);
4031 if (CurCalleeCloneNo != NewCalleeCloneNo) {
4032 LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
4033 << CurCalleeCloneNo << " now " << NewCalleeCloneNo
4034 << "\n");
4035 MismatchedCloneAssignments++;
4036 }
4037 }
4038 if (NewCalleeCloneNo > 0)
4039 cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
4040 OREGetter(CallerCall.call()->getFunction())
4041 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
4042 << ore::NV("Call", CallerCall.call()) << " in clone "
4043 << ore::NV("Caller", CallerCall.call()->getFunction())
4044 << " assigned to call function clone "
4045 << ore::NV("Callee", CalleeFunc.func()));
4046}
4047
4048void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
4049 FuncInfo CalleeFunc) {
4050 auto *CI = cast<CallsiteInfo *>(CallerCall.call());
4051 assert(CI &&
4052 "Caller cannot be an allocation which should not have profiled calls");
4053 assert(CI->Clones.size() > CallerCall.cloneNo());
4054 auto NewCalleeCloneNo = CalleeFunc.cloneNo();
4055 auto &CurCalleeCloneNo = CI->Clones[CallerCall.cloneNo()];
4056 // If we already assigned this callsite to call a specific non-default
4057 // clone (i.e. not the original function which is clone 0), ensure that we
4058 // aren't trying to now update it to call a different clone, which is
4059 // indicative of a bug in the graph or function assignment.
4060 if (CurCalleeCloneNo != 0 && CurCalleeCloneNo != NewCalleeCloneNo) {
4061 LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
4062 << CurCalleeCloneNo << " now " << NewCalleeCloneNo
4063 << "\n");
4064 MismatchedCloneAssignments++;
4065 }
4066 CurCalleeCloneNo = NewCalleeCloneNo;
4067}
4068
4069// Update the debug information attached to NewFunc to use the clone Name. Note
4070// this needs to be done for both any existing DISubprogram for the definition,
4071// as well as any separate declaration DISubprogram.
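// For example, a clone created as clone number 1 of foo() is typically named
// foo.memprof.1 (see getMemProfFuncName), and both the definition's
// DISubprogram and any declaration DISubprogram get that name as their
// linkage name.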
4072static void updateSubprogramLinkageName(Function *NewFunc, StringRef Name) {
4073 assert(Name == NewFunc->getName());
4074 auto *SP = NewFunc->getSubprogram();
4075 if (!SP)
4076 return;
4077 auto *MDName = MDString::get(NewFunc->getParent()->getContext(), Name);
4078 SP->replaceLinkageName(MDName);
4079 DISubprogram *Decl = SP->getDeclaration();
4080 if (!Decl)
4081 return;
4082 TempDISubprogram NewDecl = Decl->clone();
4083 NewDecl->replaceLinkageName(MDName);
4084 SP->replaceDeclaration(MDNode::replaceWithUniqued(std::move(NewDecl)));
4085}
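// Illustrative example (editorial, not from the source): when "_Z3foov" is
// cloned as "_Z3foov.memprof.1", the clone's DISubprogram (and any separate
// declaration DISubprogram) gets its linkageName replaced with
// "_Z3foov.memprof.1", so debug info and symbolized profiles refer to the
// clone rather than to the original function.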
4086
4087CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
4088 Instruction *>::FuncInfo
4089ModuleCallsiteContextGraph::cloneFunctionForCallsite(
4090 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
4091 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
4092 // Use existing LLVM facilities for cloning and obtaining Call in clone
4093 ValueToValueMapTy VMap;
4094 auto *NewFunc = CloneFunction(Func.func(), VMap);
4095 std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
4096 assert(!Func.func()->getParent()->getFunction(Name));
4097 NewFunc->setName(Name);
4098 updateSubprogramLinkageName(NewFunc, Name);
4099 for (auto &Inst : CallsWithMetadataInFunc) {
4100 // This map always has the initial version in it.
4101 assert(Inst.cloneNo() == 0);
4102 CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
4103 }
4104 OREGetter(Func.func())
4105 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
4106 << "created clone " << ore::NV("NewFunction", NewFunc));
4107 return {NewFunc, CloneNo};
4108}
4109
4110CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
4111 IndexCall>::FuncInfo
4112IndexCallsiteContextGraph::cloneFunctionForCallsite(
4113 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
4114 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
4115 // Check how many clones we have of Call (and therefore function).
4116 // The next clone number is the current size of versions array.
4117 // Confirm this matches the CloneNo provided by the caller, which is based on
4118 // the number of function clones we have.
4119 assert(CloneNo == (isa<AllocInfo *>(Call.call())
4120 ? cast<AllocInfo *>(Call.call())->Versions.size()
4121 : cast<CallsiteInfo *>(Call.call())->Clones.size()));
4122 // Walk all the instructions in this function. Create a new version for
4123 // each (by adding an entry to the Versions/Clones summary array), and copy
4124 // over the version being called for the function clone being cloned here.
4125 // Additionally, add an entry to the CallMap for the new function clone,
4126 // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
4127 // to the new call clone.
4128 for (auto &Inst : CallsWithMetadataInFunc) {
4129 // This map always has the initial version in it.
4130 assert(Inst.cloneNo() == 0);
4131 if (auto *AI = dyn_cast<AllocInfo *>(Inst.call())) {
4132 assert(AI->Versions.size() == CloneNo);
4133 // We assign the allocation type later (in updateAllocationCall), just add
4134 // an entry for it here.
4135 AI->Versions.push_back(0);
4136 } else {
4137 auto *CI = cast<CallsiteInfo *>(Inst.call());
4138 assert(CI && CI->Clones.size() == CloneNo);
4139 // We assign the clone number later (in updateCall), just add an entry for
4140 // it here.
4141 CI->Clones.push_back(0);
4142 }
4143 CallMap[Inst] = {Inst.call(), CloneNo};
4144 }
4145 return {Func.func(), CloneNo};
4146}
4147
4148// We perform cloning for each allocation node separately. However, this
4149// sometimes results in a situation where the same node calls multiple
4150// clones of the same callee, created for different allocations. This
4151// causes issues when assigning functions to these clones, as each node can
4152// in reality only call a single callee clone.
4153//
4154// To address this, before assigning functions, merge callee clone nodes as
4155// needed using a post order traversal from the allocations. We attempt to
4156// use existing clones as the merge node when legal, and to share them
4157// among callers with the same properties (callers calling the same set of
4158// callee clone nodes for the same allocations).
4159//
4160// Without this fix, in some cases incorrect function assignment will lead
4161// to calling the wrong allocation clone.
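// Illustrative example (editorial, not from the source): if callsite node C
// ended up with callee edges to two clones of the same callee B, created for
// different allocations:
//   C -> B'  -> ... -> Alloc1 (cold)
//   C -> B'' -> ... -> Alloc2 (not cold)
// the single call recorded on C can only be redirected to one clone of B, so
// B' and B'' are merged (into one of them, or into a new merge node) before
// function assignment.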
4162template <typename DerivedCCG, typename FuncTy, typename CallTy>
4163void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones() {
4164 if (!MergeClones)
4165 return;
4166
4167 // Generate a map from context id to the associated allocation node for use
4168 // when merging clones.
4169 DenseMap<uint32_t, ContextNode *> ContextIdToAllocationNode;
4170 for (auto &Entry : AllocationCallToContextNodeMap) {
4171 auto *Node = Entry.second;
4172 for (auto Id : Node->getContextIds())
4173 ContextIdToAllocationNode[Id] = Node->getOrigNode();
4174 for (auto *Clone : Node->Clones) {
4175 for (auto Id : Clone->getContextIds())
4176 ContextIdToAllocationNode[Id] = Clone->getOrigNode();
4177 }
4178 }
4179
4180 // Post order traversal starting from allocations to ensure each callsite
4181 // calls a single clone of its callee. Callee nodes that are clones of each
4182 // other are merged (via new merge nodes if needed) to achieve this.
4183 DenseSet<const ContextNode *> Visited;
4184 for (auto &Entry : AllocationCallToContextNodeMap) {
4185 auto *Node = Entry.second;
4186
4187 mergeClones(Node, Visited, ContextIdToAllocationNode);
4188
4189 // Make a copy so the recursive post order traversal that may create new
4190 // clones doesn't mess up iteration. Note that the recursive traversal
4191 // itself does not call mergeClones on any of these nodes, which are all
4192 // (clones of) allocations.
4193 auto Clones = Node->Clones;
4194 for (auto *Clone : Clones)
4195 mergeClones(Clone, Visited, ContextIdToAllocationNode);
4196 }
4197
4198 if (DumpCCG) {
4199 dbgs() << "CCG after merging:\n";
4200 dbgs() << *this;
4201 }
4202 if (ExportToDot)
4203 exportToDot("aftermerge");
4204
4205 if (VerifyCCG) {
4206 check();
4207 }
4208}
4209
4210// Recursive helper for above mergeClones method.
4211template <typename DerivedCCG, typename FuncTy, typename CallTy>
4212void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones(
4213 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
4214 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
4215 auto Inserted = Visited.insert(Node);
4216 if (!Inserted.second)
4217 return;
4218
4219 // Iteratively perform merging on this node to handle new caller nodes created
4220 // during the recursive traversal. We could do something more elegant such as
4221 // maintain a worklist, but this is a simple approach that doesn't cause a
4222 // measurable compile time effect, as most nodes don't have many caller
4223 // edges to check.
4224 bool FoundUnvisited = true;
4225 unsigned Iters = 0;
4226 while (FoundUnvisited) {
4227 Iters++;
4228 FoundUnvisited = false;
4229 // Make a copy since the recursive call may move a caller edge to a new
4230 // callee, messing up the iterator.
4231 auto CallerEdges = Node->CallerEdges;
4232 for (auto CallerEdge : CallerEdges) {
4233 // Skip any caller edge moved onto a different callee during recursion.
4234 if (CallerEdge->Callee != Node)
4235 continue;
4236 // If we found an unvisited caller, note that we should check the caller
4237 // edges again as mergeClones may add or change caller nodes.
4238 if (DoMergeIteration && !Visited.contains(CallerEdge->Caller))
4239 FoundUnvisited = true;
4240 mergeClones(CallerEdge->Caller, Visited, ContextIdToAllocationNode);
4241 }
4242 }
4243
4244 TotalMergeInvokes++;
4245 TotalMergeIters += Iters;
4246 if (Iters > MaxMergeIters)
4247 MaxMergeIters = Iters;
4248
4249 // Merge for this node after we handle its callers.
4250 mergeNodeCalleeClones(Node, Visited, ContextIdToAllocationNode);
4251}
4252
4253template <typename DerivedCCG, typename FuncTy, typename CallTy>
4254void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeNodeCalleeClones(
4255 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
4256 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
4257 // Ignore Node if we moved all of its contexts to clones.
4258 if (Node->emptyContextIds())
4259 return;
4260
4261 // First identify groups of clones among Node's callee edges, by building
4262 // a map from each callee base node to the associated callee edges from Node.
4263 MapVector<ContextNode *, std::vector<std::shared_ptr<ContextEdge>>>
4264 OrigNodeToCloneEdges;
4265 for (const auto &E : Node->CalleeEdges) {
4266 auto *Callee = E->Callee;
4267 if (!Callee->CloneOf && Callee->Clones.empty())
4268 continue;
4269 ContextNode *Base = Callee->getOrigNode();
4270 OrigNodeToCloneEdges[Base].push_back(E);
4271 }
4272
4273 // Helper for callee edge sorting below. Return true if A's callee has fewer
4274 // caller edges than B, or if A is a clone and B is not, or if A's first
4275 // context id is smaller than B's.
4276 auto CalleeCallerEdgeLessThan = [](const std::shared_ptr<ContextEdge> &A,
4277 const std::shared_ptr<ContextEdge> &B) {
4278 if (A->Callee->CallerEdges.size() != B->Callee->CallerEdges.size())
4279 return A->Callee->CallerEdges.size() < B->Callee->CallerEdges.size();
4280 if (A->Callee->CloneOf && !B->Callee->CloneOf)
4281 return true;
4282 else if (!A->Callee->CloneOf && B->Callee->CloneOf)
4283 return false;
4284 // Use the first context id for each edge as a
4285 // tie-breaker.
4286 return *A->ContextIds.begin() < *B->ContextIds.begin();
4287 };
4288
4289 // Process each set of callee clones called by Node, performing the needed
4290 // merging.
4291 for (auto Entry : OrigNodeToCloneEdges) {
4292 // CalleeEdges is the set of edges from Node reaching callees that are
4293 // mutual clones of each other.
4294 auto &CalleeEdges = Entry.second;
4295 auto NumCalleeClones = CalleeEdges.size();
4296 // A single edge means there is no merging needed.
4297 if (NumCalleeClones == 1)
4298 continue;
4299 // Sort the CalleeEdges calling this group of clones in ascending order of
4300 // their caller edge counts, putting the original non-clone node first in
4301 // cases of a tie. This simplifies finding an existing node to use as the
4302 // merge node.
4303 llvm::stable_sort(CalleeEdges, CalleeCallerEdgeLessThan);
4304
4305 /// Find other callers of the given set of callee edges that can
4306 /// share the same callee merge node. See the comments at this method
4307 /// definition for details.
4308 DenseSet<ContextNode *> OtherCallersToShareMerge;
4309 findOtherCallersToShareMerge(Node, CalleeEdges, ContextIdToAllocationNode,
4310 OtherCallersToShareMerge);
4311
4312 // Now do the actual merging. Identify existing or create a new MergeNode
4313 // during the first iteration. Move each callee over, along with edges from
4314 // other callers we've determined above can share the same merge node.
4315 ContextNode *MergeNode = nullptr;
4316 DenseMap<ContextNode *, unsigned> CallerToMoveCount;
4317 for (auto CalleeEdge : CalleeEdges) {
4318 auto *OrigCallee = CalleeEdge->Callee;
4319 // If we don't have a MergeNode yet (only happens on the first iteration,
4320 // as a new one will be created when we go to move the first callee edge
4321 // over as needed), see if we can use this callee.
4322 if (!MergeNode) {
4323 // If there are no other callers, simply use this callee.
4324 if (CalleeEdge->Callee->CallerEdges.size() == 1) {
4325 MergeNode = OrigCallee;
4326 NonNewMergedNodes++;
4327 continue;
4328 }
4329 // Otherwise, if we have identified other caller nodes that can share
4330 // the merge node with Node, see if all of OrigCallee's callers are
4331 // going to share the same merge node. In that case we can use callee
4332 // (since all of its callers would move to the new merge node).
4333 if (!OtherCallersToShareMerge.empty()) {
4334 bool MoveAllCallerEdges = true;
4335 for (auto CalleeCallerE : OrigCallee->CallerEdges) {
4336 if (CalleeCallerE == CalleeEdge)
4337 continue;
4338 if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller)) {
4339 MoveAllCallerEdges = false;
4340 break;
4341 }
4342 }
4343 // If we are going to move all callers over, we can use this callee as
4344 // the MergeNode.
4345 if (MoveAllCallerEdges) {
4346 MergeNode = OrigCallee;
4347 NonNewMergedNodes++;
4348 continue;
4349 }
4350 }
4351 }
4352 // Move this callee edge, creating a new merge node if necessary.
4353 if (MergeNode) {
4354 assert(MergeNode != OrigCallee);
4355 moveEdgeToExistingCalleeClone(CalleeEdge, MergeNode,
4356 /*NewClone*/ false);
4357 } else {
4358 MergeNode = moveEdgeToNewCalleeClone(CalleeEdge);
4359 NewMergedNodes++;
4360 }
4361 // Now move all identified edges from other callers over to the merge node
4362 // as well.
4363 if (!OtherCallersToShareMerge.empty()) {
4364 // Make and iterate over a copy of OrigCallee's caller edges because
4365 // some of these will be moved off of the OrigCallee and that would mess
4366 // up the iteration from OrigCallee.
4367 auto OrigCalleeCallerEdges = OrigCallee->CallerEdges;
4368 for (auto &CalleeCallerE : OrigCalleeCallerEdges) {
4369 if (CalleeCallerE == CalleeEdge)
4370 continue;
4371 if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller))
4372 continue;
4373 CallerToMoveCount[CalleeCallerE->Caller]++;
4374 moveEdgeToExistingCalleeClone(CalleeCallerE, MergeNode,
4375 /*NewClone*/ false);
4376 }
4377 }
4378 removeNoneTypeCalleeEdges(OrigCallee);
4379 removeNoneTypeCalleeEdges(MergeNode);
4380 }
4381 }
4382}
4383
4384// Look for other nodes that have edges to the same set of callee
4385// clones as the current Node. Those can share the eventual merge node
4386// (reducing cloning and binary size overhead) iff:
4387// - they have edges to the same set of callee clones
4388// - each callee edge reaches a subset of the same allocations as Node's
4389// corresponding edge to the same callee clone.
4390// The second requirement is to ensure that we don't undo any of the
4391// necessary cloning to distinguish contexts with different allocation
4392// behavior.
4393// FIXME: This is somewhat conservative, as we really just need to ensure
4394// that they don't reach the same allocations as contexts on edges from Node
4395// going to any of the *other* callee clones being merged. However, that
4396// requires more tracking and checking to get right.
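// Illustrative example (editorial, not from the source): if another caller D
// has edges to the same callee clones B' and B'' as Node, and each of D's
// edges reaches only (a subset of) the allocations reached by Node's edge to
// the same clone, then D's edges can be redirected to the merge node created
// for Node rather than creating a second merge clone for D.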
4397template <typename DerivedCCG, typename FuncTy, typename CallTy>
4398void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
4399 findOtherCallersToShareMerge(
4400 ContextNode *Node,
4401 std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges,
4402 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode,
4403 DenseSet<ContextNode *> &OtherCallersToShareMerge) {
4404 auto NumCalleeClones = CalleeEdges.size();
4405 // This map counts how many edges to the same callee clone exist for other
4406 // caller nodes of each callee clone.
4407 DenseMap<ContextNode *, unsigned> OtherCallersToSharedCalleeEdgeCount;
4408 // Counts the number of other caller nodes that have edges to all callee
4409 // clones that don't violate the allocation context checking.
4410 unsigned PossibleOtherCallerNodes = 0;
4411
4412 // We only need to look at other Caller nodes if the first callee edge has
4413 // multiple callers (recall they are sorted in ascending order above).
4414 if (CalleeEdges[0]->Callee->CallerEdges.size() < 2)
4415 return;
4416
4417 // For each callee edge:
4418 // - Collect the count of other caller nodes calling the same callees.
4419 // - Collect the alloc nodes reached by contexts on each callee edge.
4420 DenseMap<ContextEdge *, DenseSet<ContextNode *>> CalleeEdgeToAllocNodes;
4421 for (auto CalleeEdge : CalleeEdges) {
4422 assert(CalleeEdge->Callee->CallerEdges.size() > 1);
4423 // For each other caller of the same callee, increment the count of
4424 // edges reaching the same callee clone.
4425 for (auto CalleeCallerEdges : CalleeEdge->Callee->CallerEdges) {
4426 if (CalleeCallerEdges->Caller == Node) {
4427 assert(CalleeCallerEdges == CalleeEdge);
4428 continue;
4429 }
4430 OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller]++;
4431 // If this caller edge now reaches all of the same callee clones,
4432 // increment the count of candidate other caller nodes.
4433 if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller] ==
4434 NumCalleeClones)
4435 PossibleOtherCallerNodes++;
4436 }
4437 // Collect the alloc nodes reached by contexts on each callee edge, for
4438 // later analysis.
4439 for (auto Id : CalleeEdge->getContextIds()) {
4440 auto *Alloc = ContextIdToAllocationNode.lookup(Id);
4441 if (!Alloc) {
4442 // FIXME: unclear why this happens occasionally, presumably
4443 // imperfect graph updates possibly with recursion.
4444 MissingAllocForContextId++;
4445 continue;
4446 }
4447 CalleeEdgeToAllocNodes[CalleeEdge.get()].insert(Alloc);
4448 }
4449 }
4450
4451 // Now walk the callee edges again, and make sure that for each candidate
4452 // caller node all of its edges to the callees reach the same allocs (or
4453 // a subset) as those along the corresponding callee edge from Node.
4454 for (auto CalleeEdge : CalleeEdges) {
4455 assert(CalleeEdge->Callee->CallerEdges.size() > 1);
4456 // Stop if we do not have any (more) candidate other caller nodes.
4457 if (!PossibleOtherCallerNodes)
4458 break;
4459 auto &CurCalleeAllocNodes = CalleeEdgeToAllocNodes[CalleeEdge.get()];
4460 // Check each other caller of this callee clone.
4461 for (auto &CalleeCallerE : CalleeEdge->Callee->CallerEdges) {
4462 // Not interested in the callee edge from Node itself.
4463 if (CalleeCallerE == CalleeEdge)
4464 continue;
4465 // Skip any callers that didn't have callee edges to all the same
4466 // callee clones.
4467 if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] !=
4468 NumCalleeClones)
4469 continue;
4470 // Make sure that each context along edge from candidate caller node
4471 // reaches an allocation also reached by this callee edge from Node.
4472 for (auto Id : CalleeCallerE->getContextIds()) {
4473 auto *Alloc = ContextIdToAllocationNode.lookup(Id);
4474 if (!Alloc)
4475 continue;
4476 // If not, simply reset the map entry to 0 so caller is ignored, and
4477 // reduce the count of candidate other caller nodes.
4478 if (!CurCalleeAllocNodes.contains(Alloc)) {
4479 OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] = 0;
4480 PossibleOtherCallerNodes--;
4481 break;
4482 }
4483 }
4484 }
4485 }
4486
4487 if (!PossibleOtherCallerNodes)
4488 return;
4489
4490 // Build the set of other caller nodes that can use the same callee merge
4491 // node.
4492 for (auto &[OtherCaller, Count] : OtherCallersToSharedCalleeEdgeCount) {
4493 if (Count != NumCalleeClones)
4494 continue;
4495 OtherCallersToShareMerge.insert(OtherCaller);
4496 }
4497}
4498
4499// This method assigns cloned callsites to functions, cloning the functions as
4500// needed. The assignment is greedy and proceeds roughly as follows:
4501//
4502// For each function Func:
4503// For each call with graph Node having clones:
4504// Initialize ClonesWorklist to Node and its clones
4505// Initialize NodeCloneCount to 0
4506// While ClonesWorklist is not empty:
4507// Clone = pop front ClonesWorklist
4508// NodeCloneCount++
4509// If Func has been cloned less than NodeCloneCount times:
4510// If NodeCloneCount is 1:
4511// Assign Clone to original Func
4512// Continue
4513// Create a new function clone
4514// If other callers not assigned to call a function clone yet:
4515// Assign them to call new function clone
4516// Continue
4517// Assign any other caller calling the cloned version to new clone
4518//
4519// For each caller of Clone:
4520// If caller is assigned to call a specific function clone:
4521// If we cannot assign Clone to that function clone:
4522// Create new callsite Clone NewClone
4523// Add NewClone to ClonesWorklist
4524// Continue
4525// Assign Clone to existing caller's called function clone
4526// Else:
4527// If Clone not already assigned to a function clone:
4528// Assign to first function clone without assignment
4529// Assign caller to selected function clone
4530// For each call with graph Node having clones:
4531// If number func clones > number call's callsite Node clones:
4532// Record func CallInfo clones without Node clone in UnassignedCallClones
4533// For callsite Nodes in DFS order from allocations:
4534// If IsAllocation:
4535// Update allocation with alloc type
4536// Else:
4537//     For Call, all MatchingCalls, and associated UnassignedCallClones:
4538// Update call to call recorded callee clone
4539//
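// Illustrative example (editorial, not from the source): if function foo
// contains a callsite whose graph node has two clones, the first is assigned
// to the original foo and the second to a newly created foo.memprof.1; callers
// whose contexts led to the second callsite clone are then updated to call
// foo.memprof.1.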
4540template <typename DerivedCCG, typename FuncTy, typename CallTy>
4541bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
4542 bool Changed = false;
4543
4544 mergeClones();
4545
4546 // Keep track of the assignment of nodes (callsites) to function clones they
4547 // call.
4548 DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
4549
4550 // Update caller node to call function version CalleeFunc, by recording the
4551 // assignment in CallsiteToCalleeFuncCloneMap.
4552 auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
4553 const FuncInfo &CalleeFunc) {
4554 assert(Caller->hasCall());
4555 CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
4556 };
4557
4558 // Information for a single clone of this Func.
4559 struct FuncCloneInfo {
4560 // The function clone.
4561 FuncInfo FuncClone;
4562 // Remappings of each call of interest (from original uncloned call to the
4563 // corresponding cloned call in this function clone).
4564 DenseMap<CallInfo, CallInfo> CallMap;
4565 };
4566
4567 // Map to keep track of information needed to update calls in function clones
4568 // when their corresponding callsite node was not itself cloned for that
4569 // function clone. Because of call context pruning (i.e. we only keep as much
4570 // caller information as needed to distinguish hot vs cold), we may not have
4571 // caller edges coming to each callsite node from all possible function
4572 // callers. A function clone may get created for other callsites in the
4573 // function for which there are caller edges that were not pruned. Any other
4574 // callsites in that function clone, which were not themselves cloned for
4575 // that function clone, should get updated the same way as the corresponding
4576 // callsite in the original function (which may call a clone of its callee).
4577 //
4578 // We build this map after completing function cloning for each function, so
4579 // that we can record the information from its call maps before they are
4580 // destructed. The map will be used as we update calls to update any still
4581 // unassigned call clones. Note that we may create new node clones as we clone
4582 // other functions, so later on we check which node clones were still not
4583 // created. To this end, the inner map is a map from function clone number to
4584 // the list of calls cloned for that function (can be more than one due to the
4585 // Node's MatchingCalls array).
4586 //
4587 // The alternative is creating new callsite clone nodes below as we clone the
4588 // function, but that is trickier to get right and likely more overhead.
4589 //
4590 // Inner map is a std::map so sorted by key (clone number), in order to get
4591 // ordered remarks in the full LTO case.
4592 DenseMap<const ContextNode *, std::map<unsigned, SmallVector<CallInfo, 0>>>
4593 UnassignedCallClones;
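// Illustrative example (editorial, not from the source): if foo is cloned
// three times but callsite node N in foo only needed two node clones, the
// copies of N's call in the function clone that has no corresponding node
// clone are recorded here, and UpdateCalls below redirects them to the same
// callee clone recorded for the original callsite node.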
4594
4595 // Walk all functions for which we saw calls with memprof metadata, and handle
4596 // cloning for each of its calls.
4597 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
4598 FuncInfo OrigFunc(Func);
4599 // Map from each clone number of OrigFunc to information about that function
4600 // clone (the function clone FuncInfo and call remappings). The index into
4601 // the vector is the clone number, as function clones are created and
4602 // numbered sequentially.
4603 std::vector<FuncCloneInfo> FuncCloneInfos;
4604 for (auto &Call : CallsWithMetadata) {
4605 ContextNode *Node = getNodeForInst(Call);
4606 // Skip call if we do not have a node for it (all uses of its stack ids
4607 // were either on inlined chains or pruned from the MIBs), or if we did
4608 // not create any clones for it.
4609 if (!Node || Node->Clones.empty())
4610 continue;
4611 assert(Node->hasCall() &&
4612 "Not having a call should have prevented cloning");
4613
4614 // Track the assignment of function clones to clones of the current
4615 // callsite Node being handled.
4616 std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
4617
4618 // Assign callsite version CallsiteClone to function version FuncClone,
4619 // and also assign (possibly cloned) Call to CallsiteClone.
4620 auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
4621 CallInfo &Call,
4622 ContextNode *CallsiteClone,
4623 bool IsAlloc) {
4624 // Record the clone of callsite node assigned to this function clone.
4625 FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
4626
4627 assert(FuncCloneInfos.size() > FuncClone.cloneNo());
4628 DenseMap<CallInfo, CallInfo> &CallMap =
4629 FuncCloneInfos[FuncClone.cloneNo()].CallMap;
4630 CallInfo CallClone(Call);
4631 if (auto It = CallMap.find(Call); It != CallMap.end())
4632 CallClone = It->second;
4633 CallsiteClone->setCall(CallClone);
4634 // Need to do the same for all matching calls.
4635 for (auto &MatchingCall : Node->MatchingCalls) {
4636 CallInfo CallClone(MatchingCall);
4637 if (auto It = CallMap.find(MatchingCall); It != CallMap.end())
4638 CallClone = It->second;
4639 // Updates the call in the list.
4640 MatchingCall = CallClone;
4641 }
4642 };
4643
4644 // Invokes moveEdgeToNewCalleeClone which creates a new clone, and then
4645 // performs the necessary fixups (removing none type edges, and
4646 // importantly, propagating any function call assignment of the original
4647 // node to the new clone).
4648 auto MoveEdgeToNewCalleeCloneAndSetUp =
4649 [&](const std::shared_ptr<ContextEdge> &Edge) {
4650 ContextNode *OrigCallee = Edge->Callee;
4651 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge);
4652 removeNoneTypeCalleeEdges(NewClone);
4653 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
4654 // If the original Callee was already assigned to call a specific
4655 // function version, make sure its new clone is assigned to call
4656 // that same function clone.
4657 if (CallsiteToCalleeFuncCloneMap.count(OrigCallee))
4658 RecordCalleeFuncOfCallsite(
4659 NewClone, CallsiteToCalleeFuncCloneMap[OrigCallee]);
4660 return NewClone;
4661 };
4662
4663 // Keep track of the clones of callsite Node that need to be assigned to
4664 // function clones. This list may be expanded in the loop body below if we
4665 // find additional cloning is required.
4666 std::deque<ContextNode *> ClonesWorklist;
4667 // Ignore original Node if we moved all of its contexts to clones.
4668 if (!Node->emptyContextIds())
4669 ClonesWorklist.push_back(Node);
4670 llvm::append_range(ClonesWorklist, Node->Clones);
4671
4672 // Now walk through all of the clones of this callsite Node that we need,
4673 // and determine the assignment to a corresponding clone of the current
4674 // function (creating new function clones as needed).
4675 unsigned NodeCloneCount = 0;
4676 while (!ClonesWorklist.empty()) {
4677 ContextNode *Clone = ClonesWorklist.front();
4678 ClonesWorklist.pop_front();
4679 NodeCloneCount++;
4680 if (VerifyNodes)
4681 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
4682
4683 // Need to create a new function clone if we have more callsite clones
4684 // than existing function clones, which would have been assigned to an
4685 // earlier clone in the list (we assign callsite clones to function
4686 // clones greedily).
4687 if (FuncCloneInfos.size() < NodeCloneCount) {
4688 // If this is the first callsite copy, assign to original function.
4689 if (NodeCloneCount == 1) {
4690 // Since FuncCloneInfos is empty in this case, no clones have
4691 // been created for this function yet, and no callers should have
4692 // been assigned a function clone for this callee node yet.
4693 assert(llvm::none_of(
4694 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
4695 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
4696 }));
4697 // Initialize with empty call map, assign Clone to original function
4698 // and its callers, and skip to the next clone.
4699 FuncCloneInfos.push_back(
4700 {OrigFunc, DenseMap<CallInfo, CallInfo>()});
4701 AssignCallsiteCloneToFuncClone(
4702 OrigFunc, Call, Clone,
4703 AllocationCallToContextNodeMap.count(Call));
4704 for (auto &CE : Clone->CallerEdges) {
4705 // Ignore any caller that does not have a recorded callsite Call.
4706 if (!CE->Caller->hasCall())
4707 continue;
4708 RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
4709 }
4710 continue;
4711 }
4712
4713 // First locate which copy of OrigFunc to clone again. If a caller
4714 // of this callsite clone was already assigned to call a particular
4715 // function clone, we need to redirect all of those callers to the
4716 // new function clone, and update their other callees within this
4717 // function.
4718 FuncInfo PreviousAssignedFuncClone;
4719 auto EI = llvm::find_if(
4720 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
4721 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
4722 });
4723 bool CallerAssignedToCloneOfFunc = false;
4724 if (EI != Clone->CallerEdges.end()) {
4725 const std::shared_ptr<ContextEdge> &Edge = *EI;
4726 PreviousAssignedFuncClone =
4727 CallsiteToCalleeFuncCloneMap[Edge->Caller];
4728 CallerAssignedToCloneOfFunc = true;
4729 }
4730
4731 // Clone function and save it along with the CallInfo map created
4732 // during cloning in the FuncCloneInfos.
4733 DenseMap<CallInfo, CallInfo> NewCallMap;
4734 unsigned CloneNo = FuncCloneInfos.size();
4735 assert(CloneNo > 0 && "Clone 0 is the original function, which "
4736 "should already exist in the map");
4737 FuncInfo NewFuncClone = cloneFunctionForCallsite(
4738 OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
4739 FuncCloneInfos.push_back({NewFuncClone, std::move(NewCallMap)});
4740 FunctionClonesAnalysis++;
4741 Changed = true;
4742
4743 // If no caller callsites were already assigned to a clone of this
4744 // function, we can simply assign this clone to the new func clone
4745 // and update all callers to it, then skip to the next clone.
4746 if (!CallerAssignedToCloneOfFunc) {
4747 AssignCallsiteCloneToFuncClone(
4748 NewFuncClone, Call, Clone,
4749 AllocationCallToContextNodeMap.count(Call));
4750 for (auto &CE : Clone->CallerEdges) {
4751 // Ignore any caller that does not have a recorded callsite Call.
4752 if (!CE->Caller->hasCall())
4753 continue;
4754 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
4755 }
4756 continue;
4757 }
4758
4759 // We may need to do additional node cloning in this case.
4760 // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
4761 // that were previously assigned to call PreviousAssignedFuncClone,
4762 // to record that they now call NewFuncClone.
4763 // The none type edge removal may remove some of this Clone's caller
4764 // edges, if it is reached via another of its caller's callees.
4765 // Iterate over a copy and skip any that were removed.
4766 auto CallerEdges = Clone->CallerEdges;
4767 for (auto CE : CallerEdges) {
4768 // Skip any that have been removed on an earlier iteration.
4769 if (CE->isRemoved()) {
4770 assert(!is_contained(Clone->CallerEdges, CE));
4771 continue;
4772 }
4773 assert(CE);
4774 // Ignore any caller that does not have a recorded callsite Call.
4775 if (!CE->Caller->hasCall())
4776 continue;
4777
4778 if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
4779 // We subsequently fall through to later handling that
4780 // will perform any additional cloning required for
4781 // callers that were calling other function clones.
4782 CallsiteToCalleeFuncCloneMap[CE->Caller] !=
4783 PreviousAssignedFuncClone)
4784 continue;
4785
4786 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
4787
4788 // If we are cloning a function that was already assigned to some
4789 // callers, then essentially we are creating new callsite clones
4790 // of the other callsites in that function that are reached by those
4791 // callers. Clone the other callees of the current callsite's caller
4792 // that were already assigned to PreviousAssignedFuncClone
4793 // accordingly. This is important since we subsequently update the
4794 // calls from the nodes in the graph and their assignments to callee
4795 // functions recorded in CallsiteToCalleeFuncCloneMap.
4796 // The none type edge removal may remove some of this caller's
4797 // callee edges, if it is reached via another of its callees.
4798 // Iterate over a copy and skip any that were removed.
4799 auto CalleeEdges = CE->Caller->CalleeEdges;
4800 for (auto CalleeEdge : CalleeEdges) {
4801 // Skip any that have been removed on an earlier iteration when
4802 // cleaning up newly None type callee edges.
4803 if (CalleeEdge->isRemoved()) {
4804 assert(!is_contained(CE->Caller->CalleeEdges, CalleeEdge));
4805 continue;
4806 }
4807 assert(CalleeEdge);
4808 ContextNode *Callee = CalleeEdge->Callee;
4809 // Skip the current callsite, we are looking for other
4810 // callsites Caller calls, as well as any that does not have a
4811 // recorded callsite Call.
4812 if (Callee == Clone || !Callee->hasCall())
4813 continue;
4814 // Skip direct recursive calls. We don't need/want to clone the
4815 // caller node again, and this loop will not behave as expected if
4816 // we tried.
4817 if (Callee == CalleeEdge->Caller)
4818 continue;
4819 ContextNode *NewClone =
4820 MoveEdgeToNewCalleeCloneAndSetUp(CalleeEdge);
4821 // Moving the edge may have resulted in some none type
4822 // callee edges on the original Callee.
4823 removeNoneTypeCalleeEdges(Callee);
4824 // Update NewClone with the new Call clone of this callsite's Call
4825 // created for the new function clone created earlier.
4826 // Recall that we have already ensured when building the graph
4827 // that each caller can only call callsites within the same
4828 // function, so we are guaranteed that Callee Call is in the
4829 // current OrigFunc.
4830 // CallMap is set up as indexed by original Call at clone 0.
4831 CallInfo OrigCall(Callee->getOrigNode()->Call);
4832 OrigCall.setCloneNo(0);
4833 DenseMap<CallInfo, CallInfo> &CallMap =
4834 FuncCloneInfos[NewFuncClone.cloneNo()].CallMap;
4835 assert(CallMap.count(OrigCall));
4836 CallInfo NewCall(CallMap[OrigCall]);
4837 assert(NewCall);
4838 NewClone->setCall(NewCall);
4839 // Need to do the same for all matching calls.
4840 for (auto &MatchingCall : NewClone->MatchingCalls) {
4841 CallInfo OrigMatchingCall(MatchingCall);
4842 OrigMatchingCall.setCloneNo(0);
4843 assert(CallMap.count(OrigMatchingCall));
4844 CallInfo NewCall(CallMap[OrigMatchingCall]);
4845 assert(NewCall);
4846 // Updates the call in the list.
4847 MatchingCall = NewCall;
4848 }
4849 }
4850 }
4851 // Fall through to handling below to perform the recording of the
4852 // function for this callsite clone. This enables handling of cases
4853 // where the callers were assigned to different clones of a function.
4854 }
4855
4856 auto FindFirstAvailFuncClone = [&]() {
4857 // Find first function in FuncCloneInfos without an assigned
4858 // clone of this callsite Node. We should always have one
4859 // available at this point due to the earlier cloning when the
4860 // FuncCloneInfos size was smaller than the clone number.
4861 for (auto &CF : FuncCloneInfos) {
4862 if (!FuncCloneToCurNodeCloneMap.count(CF.FuncClone))
4863 return CF.FuncClone;
4864 }
4865 llvm_unreachable(
4866 "Expected an available func clone for this callsite clone");
4867 };
4868
4869 // See if we can use an existing function clone. Walk through
4870 // all caller edges to see if any have already been assigned to
4871 // a clone of this callsite's function. If we can use it, do so. If not,
4872 // because that function clone is already assigned to a different clone
4873 // of this callsite, then we need to clone again.
4874 // Basically, this checking is needed to handle the case where different
4875 // caller functions/callsites may need versions of this function
4876 // containing different mixes of callsite clones across the different
4877 // callsites within the function. If that happens, we need to create
4878 // additional function clones to handle the various combinations.
4879 //
4880 // Keep track of any new clones of this callsite created by the
4881 // following loop, as well as any existing clone that we decided to
4882 // assign this clone to.
4883 std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
4884 FuncInfo FuncCloneAssignedToCurCallsiteClone;
4885 // Iterate over a copy of Clone's caller edges, since we may need to
4886 // remove edges in the moveEdgeTo* methods, and this simplifies the
4887 // handling and makes it less error-prone.
4888 auto CloneCallerEdges = Clone->CallerEdges;
4889 for (auto &Edge : CloneCallerEdges) {
4890 // Skip removed edges (due to direct recursive edges updated when
4891 // updating callee edges when moving an edge and subsequently
4892 // removed by call to removeNoneTypeCalleeEdges on the Clone).
4893 if (Edge->isRemoved())
4894 continue;
4895 // Ignore any caller that does not have a recorded callsite Call.
4896 if (!Edge->Caller->hasCall())
4897 continue;
4898 // If this caller is already assigned to call a version of OrigFunc, we
4899 // need to ensure we can assign this callsite clone to that function clone.
4900 if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
4901 FuncInfo FuncCloneCalledByCaller =
4902 CallsiteToCalleeFuncCloneMap[Edge->Caller];
4903 // First we need to confirm that this function clone is available
4904 // for use by this callsite node clone.
4905 //
4906 // While FuncCloneToCurNodeCloneMap is built only for this Node and
4907 // its callsite clones, one of those callsite clones X could have
4908 // been assigned to the same function clone called by Edge's caller
4909 // - if Edge's caller calls another callsite within Node's original
4910 // function, and that callsite has another caller reaching clone X.
4911 // We need to clone Node again in this case.
4912 if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
4913 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
4914 Clone) ||
4915 // Detect when we have multiple callers of this callsite that
4916 // have already been assigned to specific, and different, clones
4917 // of OrigFunc (due to other unrelated callsites in Func they
4918 // reach via call contexts). Is this Clone of callsite Node
4919 // assigned to a different clone of OrigFunc? If so, clone Node
4920 // again.
4921 (FuncCloneAssignedToCurCallsiteClone &&
4922 FuncCloneAssignedToCurCallsiteClone !=
4923 FuncCloneCalledByCaller)) {
4924 // We need to use a different newly created callsite clone, in
4925 // order to assign it to another new function clone on a
4926 // subsequent iteration over the Clones array (adjusted below).
4927 // Note we specifically do not reset the
4928 // CallsiteToCalleeFuncCloneMap entry for this caller, so that
4929 // when this new clone is processed later we know which version of
4930 // the function to copy (so that other callsite clones we have
4931 // assigned to that function clone are properly cloned over). See
4932 // comments in the function cloning handling earlier.
4933
4934 // Check if we already have cloned this callsite again while
4935 // walking through caller edges, for a caller calling the same
4936 // function clone. If so, we can move this edge to that new clone
4937 // rather than creating yet another new clone.
4938 if (FuncCloneToNewCallsiteCloneMap.count(
4939 FuncCloneCalledByCaller)) {
4940 ContextNode *NewClone =
4941 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
4942 moveEdgeToExistingCalleeClone(Edge, NewClone);
4943 // Cleanup any none type edges cloned over.
4944 removeNoneTypeCalleeEdges(NewClone);
4945 } else {
4946 // Create a new callsite clone.
4947 ContextNode *NewClone = MoveEdgeToNewCalleeCloneAndSetUp(Edge);
4948 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
4949 NewClone;
4950 // Add to list of clones and process later.
4951 ClonesWorklist.push_back(NewClone);
4952 }
4953 // Moving the caller edge may have resulted in some none type
4954 // callee edges.
4955 removeNoneTypeCalleeEdges(Clone);
4956 // We will handle the newly created callsite clone in a subsequent
4957 // iteration over this Node's Clones.
4958 continue;
4959 }
4960
4961 // Otherwise, we can use the function clone already assigned to this
4962 // caller.
4963 if (!FuncCloneAssignedToCurCallsiteClone) {
4964 FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
4965 // Assign Clone to FuncCloneCalledByCaller
4966 AssignCallsiteCloneToFuncClone(
4967 FuncCloneCalledByCaller, Call, Clone,
4968 AllocationCallToContextNodeMap.count(Call));
4969 } else
4970 // Don't need to do anything - callsite is already calling this
4971 // function clone.
4972 assert(FuncCloneAssignedToCurCallsiteClone ==
4973 FuncCloneCalledByCaller);
4974
4975 } else {
4976 // We have not already assigned this caller to a version of
4977 // OrigFunc. Do the assignment now.
4978
4979 // First check if we have already assigned this callsite clone to a
4980 // clone of OrigFunc for another caller during this iteration over
4981 // its caller edges.
4982 if (!FuncCloneAssignedToCurCallsiteClone) {
4983 FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
4984 assert(FuncCloneAssignedToCurCallsiteClone);
4985 // Assign Clone to FuncCloneAssignedToCurCallsiteClone
4986 AssignCallsiteCloneToFuncClone(
4987 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
4988 AllocationCallToContextNodeMap.count(Call));
4989 } else
4990 assert(FuncCloneToCurNodeCloneMap
4991 [FuncCloneAssignedToCurCallsiteClone] == Clone);
4992 // Update callers to record function version called.
4993 RecordCalleeFuncOfCallsite(Edge->Caller,
4994 FuncCloneAssignedToCurCallsiteClone);
4995 }
4996 }
4997 // If we didn't assign a function clone to this callsite clone yet, e.g.
4998 // none of its callers has a non-null call, do the assignment here.
4999 // We want to ensure that every callsite clone is assigned to some
5000 // function clone, so that the call updates below work as expected.
5001 // In particular if this is the original callsite, we want to ensure it
5002 // is assigned to the original function, otherwise the original function
5003 // will appear available for assignment to other callsite clones,
5004 // leading to unintended effects. For one, the unknown and not updated
5005 // callers will call into cloned paths leading to the wrong hints,
5006 // because they still call the original function (clone 0). Also,
5007 // because all callsites start out as being clone 0 by default, we can't
5008 // easily distinguish between callsites explicitly assigned to clone 0
5009 // vs those never assigned, which can lead to multiple updates of the
5010 // calls when invoking updateCall below, with mismatched clone values.
5011 // TODO: Add a flag to the callsite nodes or some other mechanism to
5012 // better distinguish and identify callsite clones that are not getting
5013 // assigned to function clones as expected.
5014 if (!FuncCloneAssignedToCurCallsiteClone) {
5015 FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
5016 assert(FuncCloneAssignedToCurCallsiteClone &&
5017 "No available func clone for this callsite clone");
5018 AssignCallsiteCloneToFuncClone(
5019 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
5020 /*IsAlloc=*/AllocationCallToContextNodeMap.contains(Call));
5021 }
5022 }
5023 if (VerifyCCG) {
5024 checkNode<DerivedCCG, FuncTy, CallTy>(Node);
5025 for (const auto &PE : Node->CalleeEdges)
5026 checkEdge<DerivedCCG, FuncTy, CallTy>(PE);
5027 for (const auto &CE : Node->CallerEdges)
5028 checkEdge<DerivedCCG, FuncTy, CallTy>(CE);
5029 for (auto *Clone : Node->Clones) {
5030 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
5031 for (const auto &PE : Clone->CalleeEdges)
5032 checkEdge<DerivedCCG, FuncTy, CallTy>(PE);
5033 for (const auto &CE : Clone->CallerEdges)
5034 checkEdge<DerivedCCG, FuncTy, CallTy>(CE);
5035 }
5036 }
5037 }
5038
5039 if (FuncCloneInfos.size() < 2)
5040 continue;
5041
5042 // In this case there is more than just the original function copy.
5043 // Record call clones of any callsite nodes in the function that did not
5044 // themselves get cloned for all of the function clones.
5045 for (auto &Call : CallsWithMetadata) {
5046 ContextNode *Node = getNodeForInst(Call);
5047 if (!Node || !Node->hasCall() || Node->emptyContextIds())
5048 continue;
5049 // If Node has enough clones already to cover all function clones, we can
5050 // skip it. Need to add one for the original copy.
5051 // Use >= in case there were clones that were skipped due to having empty
5052 // context ids.
5053 if (Node->Clones.size() + 1 >= FuncCloneInfos.size())
5054 continue;
5055 // First collect all function clones we cloned this callsite node for.
5056 // They may not be sequential, e.g. due to clones with empty context ids.
5057 DenseSet<unsigned> NodeCallClones;
5058 for (auto *C : Node->Clones)
5059 NodeCallClones.insert(C->Call.cloneNo());
5060 unsigned I = 0;
5061 // Now check all the function clones.
5062 for (auto &FC : FuncCloneInfos) {
5063 // Function clones should be sequential.
5064 assert(FC.FuncClone.cloneNo() == I);
5065 // Skip the first clone which got the original call.
5066 // Also skip any other clones created for this Node.
5067 if (++I == 1 || NodeCallClones.contains(I)) {
5068 continue;
5069 }
5070 // Record the call clones created for this callsite in this function
5071 // clone.
5072 auto &CallVector = UnassignedCallClones[Node][I];
5073 DenseMap<CallInfo, CallInfo> &CallMap = FC.CallMap;
5074 if (auto It = CallMap.find(Call); It != CallMap.end()) {
5075 CallInfo CallClone = It->second;
5076 CallVector.push_back(CallClone);
5077 } else {
5078 // All but the original clone (skipped earlier) should have an entry
5079 // for all calls.
5080 assert(false && "Expected to find call in CallMap");
5081 }
5082 // Need to do the same for all matching calls.
5083 for (auto &MatchingCall : Node->MatchingCalls) {
5084 if (auto It = CallMap.find(MatchingCall); It != CallMap.end()) {
5085 CallInfo CallClone = It->second;
5086 CallVector.push_back(CallClone);
5087 } else {
5088 // All but the original clone (skipped earlier) should have an entry
5089 // for all calls.
5090 assert(false && "Expected to find call in CallMap");
5091 }
5092 }
5093 }
5094 }
5095 }
5096
5097 uint8_t BothTypes =
5098 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
5099
5100 auto UpdateCalls = [&](ContextNode *Node,
5101 DenseSet<const ContextNode *> &Visited,
5102 auto &&UpdateCalls) {
5103 auto Inserted = Visited.insert(Node);
5104 if (!Inserted.second)
5105 return;
5106
5107 for (auto *Clone : Node->Clones)
5108 UpdateCalls(Clone, Visited, UpdateCalls);
5109
5110 for (auto &Edge : Node->CallerEdges)
5111 UpdateCalls(Edge->Caller, Visited, UpdateCalls);
5112
5113 // Skip if either no call to update, or if we ended up with no context ids
5114 // (we moved all edges onto other clones).
5115 if (!Node->hasCall() || Node->emptyContextIds())
5116 return;
5117
5118 if (Node->IsAllocation) {
5119 auto AT = allocTypeToUse(Node->AllocTypes);
5120 // If the allocation type is ambiguous, and more aggressive hinting
5121 // has been enabled via the MinClonedColdBytePercent flag, see if this
5122 // allocation should be hinted cold anyway because its fraction cold bytes
5123 // allocated is at least the given threshold.
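// (Illustrative example, editorial: with MinClonedColdBytePercent == 80, an
// ambiguous allocation whose contexts account for 1000 bytes in total, 850 of
// them cold, satisfies 850 * 100 >= 1000 * 80 and is therefore hinted cold.)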
5124 if (Node->AllocTypes == BothTypes && MinClonedColdBytePercent < 100 &&
5125 !ContextIdToContextSizeInfos.empty()) {
5126 uint64_t TotalCold = 0;
5127 uint64_t Total = 0;
5128 for (auto Id : Node->getContextIds()) {
5129 auto TypeI = ContextIdToAllocationType.find(Id);
5130 assert(TypeI != ContextIdToAllocationType.end());
5131 auto CSI = ContextIdToContextSizeInfos.find(Id);
5132 if (CSI != ContextIdToContextSizeInfos.end()) {
5133 for (auto &Info : CSI->second) {
5134 Total += Info.TotalSize;
5135 if (TypeI->second == AllocationType::Cold)
5136 TotalCold += Info.TotalSize;
5137 }
5138 }
5139 }
5140 if (TotalCold * 100 >= Total * MinClonedColdBytePercent)
5141 AT = AllocationType::Cold;
5142 }
5143 updateAllocationCall(Node->Call, AT);
5144 assert(Node->MatchingCalls.empty());
5145 return;
5146 }
5147
5148 if (!CallsiteToCalleeFuncCloneMap.count(Node))
5149 return;
5150
5151 auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
5152 updateCall(Node->Call, CalleeFunc);
5153 // Update all the matching calls as well.
5154 for (auto &Call : Node->MatchingCalls)
5155 updateCall(Call, CalleeFunc);
5156
5157 // Now update all calls recorded earlier that are still in function clones
5158 // which don't have a clone of this callsite node.
5159 if (!UnassignedCallClones.contains(Node))
5160 return;
5161 DenseSet<unsigned> NodeCallClones;
5162 for (auto *C : Node->Clones)
5163 NodeCallClones.insert(C->Call.cloneNo());
5164 // Note that we already confirmed Node is in this map a few lines above.
5165 auto &ClonedCalls = UnassignedCallClones[Node];
5166 for (auto &[CloneNo, CallVector] : ClonedCalls) {
5167 // Should start at 1 as we never create an entry for the original node.
5168 assert(CloneNo > 0);
5169 // If we subsequently created a clone, skip this one.
5170 if (NodeCallClones.contains(CloneNo))
5171 continue;
5172 // Use the original Node's CalleeFunc.
5173 for (auto &Call : CallVector)
5174 updateCall(Call, CalleeFunc);
5175 }
5176 };
5177
5178 // Performs DFS traversal starting from allocation nodes to update calls to
5179 // reflect cloning decisions recorded earlier. For regular LTO this will
5180 // update the actual calls in the IR to call the appropriate function clone
5181 // (and add attributes to allocation calls), whereas for ThinLTO the decisions
5182 // are recorded in the summary entries.
5183 DenseSet<const ContextNode *> Visited;
5184 for (auto &Entry : AllocationCallToContextNodeMap)
5185 UpdateCalls(Entry.second, Visited, UpdateCalls);
5186
5187 return Changed;
5188}
5189
5190static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
5191 Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
5192 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
5193 &FuncToAliasMap) {
5194 // The first "clone" is the original copy, we should only call this if we
5195 // needed to create new clones.
5196 assert(NumClones > 1);
5197 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
5198 VMaps.reserve(NumClones - 1);
5199 FunctionsClonedThinBackend++;
5200 for (unsigned I = 1; I < NumClones; I++) {
5201 VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
5202 auto *NewF = CloneFunction(&F, *VMaps.back());
5203 FunctionClonesThinBackend++;
5204 // Strip memprof and callsite metadata from clone as they are no longer
5205 // needed.
5206 for (auto &BB : *NewF) {
5207 for (auto &Inst : BB) {
5208 Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
5209 Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
5210 }
5211 }
5212 std::string Name = getMemProfFuncName(F.getName(), I);
5213 auto *PrevF = M.getFunction(Name);
5214 if (PrevF) {
5215 // We might have created this when adjusting a callsite in another
5216 // function. It should be a declaration.
5217 assert(PrevF->isDeclaration());
5218 NewF->takeName(PrevF);
5219 PrevF->replaceAllUsesWith(NewF);
5220 PrevF->eraseFromParent();
5221 } else
5222 NewF->setName(Name);
5223 updateSubprogramLinkageName(NewF, Name);
5224 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
5225 << "created clone " << ore::NV("NewFunction", NewF));
5226
5227 // Now handle aliases to this function, and clone those as well.
5228 if (!FuncToAliasMap.count(&F))
5229 continue;
5230 for (auto *A : FuncToAliasMap[&F]) {
5231 std::string Name = getMemProfFuncName(A->getName(), I);
5232 auto *PrevA = M.getNamedAlias(Name);
5233 auto *NewA = GlobalAlias::create(A->getValueType(),
5234 A->getType()->getPointerAddressSpace(),
5235 A->getLinkage(), Name, NewF);
5236 NewA->copyAttributesFrom(A);
5237 if (PrevA) {
5238 // We might have created this when adjusting a callsite in another
5239 // function. It should be a declaration.
5240 assert(PrevA->isDeclaration());
5241 NewA->takeName(PrevA);
5242 PrevA->replaceAllUsesWith(NewA);
5243 PrevA->eraseFromParent();
5244 }
5245 }
5246 }
5247 return VMaps;
5248}
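// Illustrative example (editorial, not from the source): cloning foo as
// foo.memprof.1 above also clones an alias bar (whose aliasee is foo) as
// bar.memprof.1 pointing at foo.memprof.1, so callsites later redirected to
// the cloned alias name resolve to the correct function clone.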
5249
5250// Locate the summary for F. This is complicated by the fact that it might
5251// have been internalized or promoted.
5252static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
5253 const ModuleSummaryIndex *ImportSummary,
5254 const Function *CallingFunc = nullptr) {
5255 // FIXME: Ideally we would retain the original GUID in some fashion on the
5256 // function (e.g. as metadata), but for now do our best to locate the
5257 // summary without that information.
5258 ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
5259 if (!TheFnVI)
5260 // See if theFn was internalized, by checking index directly with
5261 // original name (this avoids the name adjustment done by getGUID() for
5262 // internal symbols).
5263 TheFnVI = ImportSummary->getValueInfo(
5265 if (TheFnVI)
5266 return TheFnVI;
5267 // Now query with the original name before any promotion was performed.
5268 StringRef OrigName =
5269 ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
5270 // When this pass is enabled, we always add thinlto_src_file provenance
5271 // metadata to imported function definitions, which allows us to recreate the
5272 // original internal symbol's GUID.
5273 auto SrcFileMD = F.getMetadata("thinlto_src_file");
5274 // If this is a call to an imported/promoted local for which we didn't import
5275 // the definition, the metadata will not exist on the declaration. However,
5276 // since we are doing this early, before any inlining in the LTO backend, we
5277 // can simply look at the metadata on the calling function which must have
5278 // been from the same module if F was an internal symbol originally.
5279 if (!SrcFileMD && F.isDeclaration()) {
5280 // We would only call this for a declaration for a direct callsite, in which
5281 // case the caller would have provided the calling function pointer.
5282 assert(CallingFunc);
5283 SrcFileMD = CallingFunc->getMetadata("thinlto_src_file");
5284 // If this is a promoted local (OrigName != F.getName()), since this is a
5285 // declaration, it must be imported from a different module and therefore we
5286 // should always find the metadata on its calling function. Any call to a
5287 // promoted local that came from this module should still be a definition.
5288 assert(SrcFileMD || OrigName == F.getName());
5289 }
5290 StringRef SrcFile = M.getSourceFileName();
5291 if (SrcFileMD)
5292 SrcFile = dyn_cast<MDString>(SrcFileMD->getOperand(0))->getString();
5293 std::string OrigId = GlobalValue::getGlobalIdentifier(
5294 OrigName, GlobalValue::InternalLinkage, SrcFile);
5295 TheFnVI = ImportSummary->getValueInfo(
5297 // Internal func in original module may have gotten a numbered suffix if we
5298 // imported an external function with the same name. This happens
5299 // automatically during IR linking for naming conflicts. It would have to
5300 // still be internal in that case (otherwise it would have been renamed on
5301 // promotion in which case we wouldn't have a naming conflict).
5302 if (!TheFnVI && OrigName == F.getName() && F.hasLocalLinkage() &&
5303 F.getName().contains('.')) {
5304 OrigName = F.getName().rsplit('.').first;
5305 OrigId = GlobalValue::getGlobalIdentifier(
5306 OrigName, GlobalValue::InternalLinkage, SrcFile);
5307 TheFnVI = ImportSummary->getValueInfo(
5309 }
5310 // The only way we may not have a VI is if this is a declaration created for
5311 // an imported reference. For distributed ThinLTO we may not have a VI for
5312 // such declarations in the distributed summary.
5313 assert(TheFnVI || F.isDeclaration());
5314 return TheFnVI;
5315}
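// Illustrative example (editorial, not from the source): a local function
// "bar" originally defined in a.cpp may be seen here as an imported, promoted
// definition (or declaration) with a different name; its summary is keyed by
// the GUID of the internal-linkage global identifier built from "bar" and the
// original source file name recorded in thinlto_src_file metadata, which is
// what the getGlobalIdentifier-based lookups above reconstruct.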
5316
5317bool MemProfContextDisambiguation::initializeIndirectCallPromotionInfo(
5318 Module &M) {
5319 ICallAnalysis = std::make_unique<ICallPromotionAnalysis>();
5320 Symtab = std::make_unique<InstrProfSymtab>();
5321 // Don't add canonical names, to avoid adding multiple functions to the
5322 // symtab when they have the same root name with "." suffixes stripped.
5323 // If we pick the wrong one then this could lead to incorrect ICP and calling
5324 // a memprof clone that we don't actually create (resulting in linker unsats).
5325 // What this means is that the GUID of the function (or its PGOFuncName
5326 // metadata) *must* match that in the VP metadata to allow promotion.
5327 // In practice this should not be a limitation, since local functions should
5328 // have PGOFuncName metadata and global function names shouldn't need any
5329 // special handling (they should not get the ".llvm.*" suffix that the
5330 // canonicalization handling is attempting to strip).
5331 if (Error E = Symtab->create(M, /*InLTO=*/true, /*AddCanonical=*/false)) {
5332 std::string SymtabFailure = toString(std::move(E));
5333 M.getContext().emitError("Failed to create symtab: " + SymtabFailure);
5334 return false;
5335 }
5336 return true;
5337}
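// Illustrative example (editorial, not from the source): a local function
// "bar" appears in value-profile metadata under the GUID of its PGOFuncName
// (derived from its source file name and "bar"); because canonical
// suffix-stripped names are not added to the symtab above, only an exact match
// between that GUID and a symtab entry permits promotion to a memprof clone.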
5338
5339#ifndef NDEBUG
5340// Sanity check that the MIB stack ids match between the summary and
5341// instruction metadata.
5342 static void checkAllocContextIds(
5343 const AllocInfo &AllocNode, const MDNode *MemProfMD,
5344 const CallStack<MDNode, MDNode::op_iterator> &CallsiteContext,
5345 const ModuleSummaryIndex *ImportSummary) {
5346 auto MIBIter = AllocNode.MIBs.begin();
5347 for (auto &MDOp : MemProfMD->operands()) {
5348 assert(MIBIter != AllocNode.MIBs.end());
5349 auto StackIdIndexIter = MIBIter->StackIdIndices.begin();
5350 auto *MIBMD = cast<const MDNode>(MDOp);
5351 MDNode *StackMDNode = getMIBStackNode(MIBMD);
5352 assert(StackMDNode);
5353 CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
5354 auto ContextIterBegin =
5355 StackContext.beginAfterSharedPrefix(CallsiteContext);
5356 // Skip the checking on the first iteration.
5357 uint64_t LastStackContextId =
5358 (ContextIterBegin != StackContext.end() && *ContextIterBegin == 0) ? 1
5359 : 0;
5360 for (auto ContextIter = ContextIterBegin; ContextIter != StackContext.end();
5361 ++ContextIter) {
5362 // If this is a direct recursion, simply skip the duplicate
5363 // entries, to be consistent with how the summary ids were
5364 // generated during ModuleSummaryAnalysis.
5365 if (LastStackContextId == *ContextIter)
5366 continue;
5367 LastStackContextId = *ContextIter;
5368 assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
5369 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
5370 *ContextIter);
5371 StackIdIndexIter++;
5372 }
5373 MIBIter++;
5374 }
5375}
5376#endif
5377
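// Apply the cloning decisions recorded in the ThinLTO import summary to the
// IR: create the requested function clones, redirect calls to the appropriate
// callee clones, attach memprof allocation attributes, perform any indirect
// call promotion needed to reach cloned targets, and finally strip the
// memprof/callsite metadata.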
5378bool MemProfContextDisambiguation::applyImport(Module &M) {
5379 assert(ImportSummary);
5380 bool Changed = false;
5381
5382 // We also need to clone any aliases that reference cloned functions, because
5383 // the modified callsites may invoke via the alias. Keep track of the aliases
5384 // for each function.
5385 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
5386 FuncToAliasMap;
5387 for (auto &A : M.aliases()) {
5388 auto *Aliasee = A.getAliaseeObject();
5389 if (auto *F = dyn_cast<Function>(Aliasee))
5390 FuncToAliasMap[F].insert(&A);
5391 }
5392
5393 if (!initializeIndirectCallPromotionInfo(M))
5394 return false;
5395
5396 for (auto &F : M) {
5397 if (F.isDeclaration() || isMemProfClone(F))
5398 continue;
5399
5400 OptimizationRemarkEmitter ORE(&F);
5401
5402 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
5403 bool ClonesCreated = false;
5404 unsigned NumClonesCreated = 0;
5405 auto CloneFuncIfNeeded = [&](unsigned NumClones) {
5406 // We should at least have version 0 which is the original copy.
5407 assert(NumClones > 0);
5408 // If only one copy is needed, use the original.
5409 if (NumClones == 1)
5410 return;
5411 // If we already performed cloning of this function, confirm that the
5412 // requested number of clones matches (the thin link should ensure the
5413 // number of clones for each constituent callsite is consistent within
5414 // each function), before returning.
5415 if (ClonesCreated) {
5416 assert(NumClonesCreated == NumClones);
5417 return;
5418 }
5419 VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap);
5420 // The first "clone" is the original copy, which doesn't have a VMap.
5421 assert(VMaps.size() == NumClones - 1);
5422 Changed = true;
5423 ClonesCreated = true;
5424 NumClonesCreated = NumClones;
5425 };
5426
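// Helper to update a single profiled callsite: ensure this function has been
// cloned the number of times requested by the summary, then redirect the call
// in each clone to the callee clone recorded in StackNode.Clones.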
5427 auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
5428 Function *CalledFunction) {
5429 // Perform cloning if not yet done.
5430 CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());
5431
5432 assert(!isMemProfClone(*CalledFunction));
5433
5434 // Because we update the cloned calls by calling setCalledOperand (see
5435 // comment below), out of an abundance of caution make sure the called
5436 // function was actually the called operand (or its aliasee). We also
5437 // strip pointer casts when looking for calls (to match behavior during
5438 // summary generation), however, with opaque pointers in theory this
5439 // should not be an issue. Note we still clone the current function
5440 // (containing this call) above, as that could be needed for its callers.
5441 auto *GA = dyn_cast_or_null<GlobalAlias>(CB->getCalledOperand());
5442 if (CalledFunction != CB->getCalledOperand() &&
5443 (!GA || CalledFunction != GA->getAliaseeObject())) {
5444 SkippedCallsCloning++;
5445 return;
5446 }
5447 // Update the calls per the summary info.
5448 // Save orig name since it gets updated in the first iteration
5449 // below.
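// Illustrative example: if StackNode.Clones is {0, 2}, the original copy
// keeps calling the original callee, while the call in clone 1 of this
// function is redirected to callee clone 2 (named via getMemProfFuncName,
// e.g. "<callee>.memprof.2").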
5450 auto CalleeOrigName = CalledFunction->getName();
5451 for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
5452 // Do nothing if this version calls the original version of its
5453 // callee.
5454 if (!StackNode.Clones[J])
5455 continue;
5456 auto NewF = M.getOrInsertFunction(
5457 getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
5458 CalledFunction->getFunctionType());
5459 CallBase *CBClone;
5460 // Copy 0 is the original function.
5461 if (!J)
5462 CBClone = CB;
5463 else
5464 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
5465 // Set the called operand directly instead of calling setCalledFunction,
5466 // as the latter mutates the function type on the call. In rare cases
5467 // we may have a slightly different type on a callee function
5468 // declaration due to it being imported from a different module with
5469 // incomplete types. We really just want to change the name of the
5470 // function to the clone, and not make any type changes.
5471 CBClone->setCalledOperand(NewF.getCallee());
5472 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
5473 << ore::NV("Call", CBClone) << " in clone "
5474 << ore::NV("Caller", CBClone->getFunction())
5475 << " assigned to call function clone "
5476 << ore::NV("Callee", NewF.getCallee()));
5477 }
5478 };
5479
5480 // Locate the summary for F.
5481 ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
5482 // If not found, this could be an imported local (see comment in
5483 // findValueInfoForFunc). Skip for now as it will be cloned in its original
5484 // module (where it would have been promoted to global scope so should
5485 // satisfy any reference in this module).
5486 if (!TheFnVI)
5487 continue;
5488
5489 auto *GVSummary =
5490 ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
5491 if (!GVSummary) {
5492 // Must have been imported, use the summary which matches the definition
5493 // (might be multiple if this was a linkonce_odr).
5494 auto SrcModuleMD = F.getMetadata("thinlto_src_module");
5495 assert(SrcModuleMD &&
5496 "enable-import-metadata is needed to emit thinlto_src_module");
5497 StringRef SrcModule =
5498 dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString();
5499 for (auto &GVS : TheFnVI.getSummaryList()) {
5500 if (GVS->modulePath() == SrcModule) {
5501 GVSummary = GVS.get();
5502 break;
5503 }
5504 }
5505 assert(GVSummary && GVSummary->modulePath() == SrcModule);
5506 }
5507
5508 // If this was an imported alias skip it as we won't have the function
5509 // summary, and it should be cloned in the original module.
5510 if (isa<AliasSummary>(GVSummary))
5511 continue;
5512
5513 auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
5514
5515 if (FS->allocs().empty() && FS->callsites().empty())
5516 continue;
5517
5518 auto SI = FS->callsites().begin();
5519 auto AI = FS->allocs().begin();
5520
5521 // To handle callsite infos synthesized for tail calls which have missing
5522 // frames in the profiled context, map callee VI to the synthesized callsite
5523 // info.
5524 DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite;
5525 // Iterate the callsites for this function in reverse, since we place all
5526 // those synthesized for tail calls at the end.
5527 for (auto CallsiteIt = FS->callsites().rbegin();
5528 CallsiteIt != FS->callsites().rend(); CallsiteIt++) {
5529 auto &Callsite = *CallsiteIt;
5530 // Stop as soon as we see a non-synthesized callsite info (see comment
5531 // above loop). All the entries added for discovered tail calls have empty
5532 // stack ids.
5533 if (!Callsite.StackIdIndices.empty())
5534 break;
5535 MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
5536 }
5537
5538 // Keeps track of needed ICP for the function.
5539 SmallVector<ICallAnalysisData> ICallAnalysisInfo;
5540
5541 // Assume for now that the instructions are in the exact same order
5542 // as when the summary was created, but confirm this is correct by
5543 // matching the stack ids.
5544 for (auto &BB : F) {
5545 for (auto &I : BB) {
5546 auto *CB = dyn_cast<CallBase>(&I);
5547 // Same handling as when creating module summary.
5548 if (!mayHaveMemprofSummary(CB))
5549 continue;
5550
5551 auto *CalledValue = CB->getCalledOperand();
5552 auto *CalledFunction = CB->getCalledFunction();
5553 if (CalledValue && !CalledFunction) {
5554 CalledValue = CalledValue->stripPointerCasts();
5555 // Stripping pointer casts can reveal a called function.
5556 CalledFunction = dyn_cast<Function>(CalledValue);
5557 }
5558 // Check if this is an alias to a function. If so, get the
5559 // called aliasee for the checks below.
5560 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
5561 assert(!CalledFunction &&
5562 "Expected null called function in callsite for alias");
5563 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
5564 }
5565
5566 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
5567 I.getMetadata(LLVMContext::MD_callsite));
5568 auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
5569
5570 // Include allocs that were already assigned a memprof function
5571 // attribute in the statistics.
5572 if (CB->getAttributes().hasFnAttr("memprof")) {
5573 assert(!MemProfMD);
5574 CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
5575 ? AllocTypeColdThinBackend++
5576 : AllocTypeNotColdThinBackend++;
5577 OrigAllocsThinBackend++;
5578 AllocVersionsThinBackend++;
5579 if (!MaxAllocVersionsThinBackend)
5580 MaxAllocVersionsThinBackend = 1;
5581 continue;
5582 }
5583
5584 if (MemProfMD) {
5585 // Consult the next alloc node.
5586 assert(AI != FS->allocs().end());
5587 auto &AllocNode = *(AI++);
5588
5589#ifndef NDEBUG
5590 checkAllocContextIds(AllocNode, MemProfMD, CallsiteContext,
5591 ImportSummary);
5592#endif
5593
5594 // Perform cloning if not yet done.
5595 CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size());
5596
5597 OrigAllocsThinBackend++;
5598 AllocVersionsThinBackend += AllocNode.Versions.size();
5599 if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
5600 MaxAllocVersionsThinBackend = AllocNode.Versions.size();
5601
5602 // If there is only one version, that means we didn't end up considering
5603 // this function for cloning, and in that case the alloc will still be
5604 // None type or should have gotten the default NotCold. Skip it, but only
5605 // after calling the clone helper above, since that performs sanity checks
5606 // confirming we haven't yet decided that cloning is needed.
5607 // We might have a single version that is cold due to the
5608 // MinClonedColdBytePercent heuristic; make sure we don't skip in that
5609 // case.
5610 if (AllocNode.Versions.size() == 1 &&
5611 (AllocationType)AllocNode.Versions[0] != AllocationType::Cold) {
5612 assert((AllocationType)AllocNode.Versions[0] ==
5613 AllocationType::NotCold ||
5614 (AllocationType)AllocNode.Versions[0] ==
5615 AllocationType::None);
5616 UnclonableAllocsThinBackend++;
5617 continue;
5618 }
5619
5620 // All versions should have a singular allocation type.
5621 assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
5622 return Type == ((uint8_t)AllocationType::NotCold |
5623 (uint8_t)AllocationType::Cold);
5624 }));
5625
5626 // Update the allocation types per the summary info.
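// Illustrative example: Versions of {NotCold, Cold} leaves the original
// allocation call marked memprof="notcold" and marks the corresponding call
// in clone 1 with memprof="cold".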
5627 for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
5628 // Ignore any that didn't get an assigned allocation type.
5629 if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
5630 continue;
5631 AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
5632 AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
5633 : AllocTypeNotColdThinBackend++;
5634 std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
5635 auto A = llvm::Attribute::get(F.getContext(), "memprof",
5636 AllocTypeString);
5637 CallBase *CBClone;
5638 // Copy 0 is the original function.
5639 if (!J)
5640 CBClone = CB;
5641 else
5642 // Since VMaps are only created for new clones, we index with
5643 // clone J-1 (J==0 is the original clone and does not have a VMaps
5644 // entry).
5645 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
5646 removeAnyExistingAmbiguousAttribute(CBClone);
5647 CBClone->addFnAttr(A);
5648 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
5649 << ore::NV("AllocationCall", CBClone) << " in clone "
5650 << ore::NV("Caller", CBClone->getFunction())
5651 << " marked with memprof allocation attribute "
5652 << ore::NV("Attribute", AllocTypeString));
5653 }
5654 } else if (!CallsiteContext.empty()) {
5655 if (!CalledFunction) {
5656#ifndef NDEBUG
5657 // We should have skipped inline assembly calls.
5658 auto *CI = dyn_cast<CallInst>(CB);
5659 assert(!CI || !CI->isInlineAsm());
5660#endif
5661 // We should have skipped direct calls via a Constant.
5662 assert(CalledValue && !isa<Constant>(CalledValue));
5663
5664 // This is an indirect call, see if we have profile information and
5665 // whether any clones were recorded for the profiled targets (that
5666 // we synthesized CallsiteInfo summary records for when building the
5667 // index).
5668 auto NumClones =
5669 recordICPInfo(CB, FS->callsites(), SI, ICallAnalysisInfo);
5670
5671 // Perform cloning if not yet done. This is done here in case
5672 // we don't need to do ICP, but might need to clone this
5673 // function as it is the target of other cloned calls.
5674 if (NumClones)
5675 CloneFuncIfNeeded(NumClones);
5676 }
5677
5678 else {
5679 // Consult the next callsite node.
5680 assert(SI != FS->callsites().end());
5681 auto &StackNode = *(SI++);
5682
5683#ifndef NDEBUG
5684 // Sanity check that the stack ids match between the summary and
5685 // instruction metadata.
5686 auto StackIdIndexIter = StackNode.StackIdIndices.begin();
5687 for (auto StackId : CallsiteContext) {
5688 assert(StackIdIndexIter != StackNode.StackIdIndices.end());
5689 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
5690 StackId);
5691 StackIdIndexIter++;
5692 }
5693#endif
5694
5695 CloneCallsite(StackNode, CB, CalledFunction);
5696 }
5697 } else if (CB->isTailCall() && CalledFunction) {
5698 // Locate the synthesized callsite info for the callee VI, if any was
5699 // created, and use that for cloning.
5700 ValueInfo CalleeVI =
5701 findValueInfoForFunc(*CalledFunction, M, ImportSummary, &F);
5702 if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
5703 auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
5704 assert(Callsite != MapTailCallCalleeVIToCallsite.end());
5705 CloneCallsite(Callsite->second, CB, CalledFunction);
5706 }
5707 }
5708 }
5709 }
5710
5711 // Now do any promotion required for cloning.
5712 performICP(M, FS->callsites(), VMaps, ICallAnalysisInfo, ORE);
5713 }
5714
5715 // We skip some of the functions and instructions above, so remove all the
5716 // metadata in a single sweep here.
5717 for (auto &F : M) {
5718 // We can skip memprof clones because createFunctionClones already strips
5719 // the metadata from the newly created clones.
5720 if (F.isDeclaration() || isMemProfClone(F))
5721 continue;
5722 for (auto &BB : F) {
5723 for (auto &I : BB) {
5724 if (!isa<CallBase>(I))
5725 continue;
5726 I.setMetadata(LLVMContext::MD_memprof, nullptr);
5727 I.setMetadata(LLVMContext::MD_callsite, nullptr);
5728 }
5729 }
5730 }
5731
5732 return Changed;
5733}
5734
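// For an indirect call with value profile data, walk the candidate profiled
// targets alongside the CallsiteInfo records synthesized for them in the
// summary (advancing SI), record any promotion work needed to reach cloned
// targets in ICallAnalysisInfo, and return the number of clones of this
// function implied by those records.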
5735unsigned MemProfContextDisambiguation::recordICPInfo(
5736 CallBase *CB, ArrayRef<CallsiteInfo> AllCallsites,
5737 ArrayRef<CallsiteInfo>::iterator &SI,
5738 SmallVector<ICallAnalysisData> &ICallAnalysisInfo) {
5739 // First see if we have profile information for this indirect call.
5740 uint32_t NumCandidates;
5741 uint64_t TotalCount;
5742 auto CandidateProfileData =
5743 ICallAnalysis->getPromotionCandidatesForInstruction(CB, TotalCount,
5744 NumCandidates);
5745 if (CandidateProfileData.empty())
5746 return 0;
5747
5748 // Iterate through all of the candidate profiled targets along with the
5749 // CallsiteInfo summary records synthesized for them when building the index,
5750 // and see if any are cloned and/or refer to clones.
5751 bool ICPNeeded = false;
5752 unsigned NumClones = 0;
5753 size_t CallsiteInfoStartIndex = std::distance(AllCallsites.begin(), SI);
5754 for (const auto &Candidate : CandidateProfileData) {
5755#ifndef NDEBUG
5756 auto CalleeValueInfo =
5757#endif
5758 ImportSummary->getValueInfo(Candidate.Value);
5759 // We might not have a ValueInfo if this is a distributed
5760 // ThinLTO backend and decided not to import that function.
5761 assert(!CalleeValueInfo || SI->Callee == CalleeValueInfo);
5762 assert(SI != AllCallsites.end());
5763 auto &StackNode = *(SI++);
5764 // See if any of the clones of the indirect callsite for this
5765 // profiled target should call a cloned version of the profiled
5766 // target. We only need to do the ICP here if so.
5767 ICPNeeded |= llvm::any_of(StackNode.Clones,
5768 [](unsigned CloneNo) { return CloneNo != 0; });
5769 // Every callsite in the same function should have been cloned the same
5770 // number of times.
5771 assert(!NumClones || NumClones == StackNode.Clones.size());
5772 NumClones = StackNode.Clones.size();
5773 }
5774 if (!ICPNeeded)
5775 return NumClones;
5776 // Save information for ICP, which is performed later to avoid messing up the
5777 // current function traversal.
5778 ICallAnalysisInfo.push_back({CB, CandidateProfileData.vec(), NumCandidates,
5779 TotalCount, CallsiteInfoStartIndex});
5780 return NumClones;
5781}
5782
5783void MemProfContextDisambiguation::performICP(
5784 Module &M, ArrayRef<CallsiteInfo> AllCallsites,
5785 ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
5786 ArrayRef<ICallAnalysisData> ICallAnalysisInfo,
5787 OptimizationRemarkEmitter &ORE) {
5788 // Now do any promotion required for cloning. Specifically, for each
5789 // recorded ICP candidate (which was only recorded because one clone of that
5790 // candidate should call a cloned target), we perform ICP (speculative
5791 // devirtualization) for each clone of the callsite, and update its callee
5792 // to the appropriate clone. Note that the ICP compares against the original
5793 // version of the target, which is what is in the vtable.
5794 for (auto &Info : ICallAnalysisInfo) {
5795 auto *CB = Info.CB;
5796 auto CallsiteIndex = Info.CallsiteInfoStartIndex;
5797 auto TotalCount = Info.TotalCount;
5798 unsigned NumPromoted = 0;
5799 unsigned NumClones = 0;
5800
5801 for (auto &Candidate : Info.CandidateProfileData) {
5802 auto &StackNode = AllCallsites[CallsiteIndex++];
5803
5804 // All calls in the same function must have the same number of clones.
5805 assert(!NumClones || NumClones == StackNode.Clones.size());
5806 NumClones = StackNode.Clones.size();
5807
5808 // See if the target is in the module. If it wasn't imported, it is
5809 // possible that this profile could have been collected on a different
5810 // target (or version of the code), and we need to be conservative
5811 // (similar to what is done in the ICP pass).
5812 Function *TargetFunction = Symtab->getFunction(Candidate.Value);
5813 if (TargetFunction == nullptr ||
5814 // Any ThinLTO global dead symbol removal should have already
5815 // occurred, so it should be safe to promote when the target is a
5816 // declaration.
5817 // TODO: Remove internal option once more fully tested.
5818 (MemProfRequireDefinitionForPromotion &&
5819 TargetFunction->isDeclaration())) {
5820 ORE.emit([&]() {
5821 return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", CB)
5822 << "Memprof cannot promote indirect call: target with md5sum "
5823 << ore::NV("target md5sum", Candidate.Value) << " not found";
5824 });
5825 // FIXME: See if we can use the new declaration importing support to
5826 // at least get the declarations imported for this case. Hot indirect
5827 // targets should have been imported normally, however.
5828 continue;
5829 }
5830
5831 // Check if legal to promote
5832 const char *Reason = nullptr;
5833 if (!isLegalToPromote(*CB, TargetFunction, &Reason)) {
5834 ORE.emit([&]() {
5835 return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", CB)
5836 << "Memprof cannot promote indirect call to "
5837 << ore::NV("TargetFunction", TargetFunction)
5838 << " with count of " << ore::NV("TotalCount", TotalCount)
5839 << ": " << Reason;
5840 });
5841 continue;
5842 }
5843
5844 assert(!isMemProfClone(*TargetFunction));
5845
5846 // Handle each call clone, applying ICP so that each clone directly
5847 // calls the specified callee clone, guarded by the appropriate ICP
5848 // check.
5849 CallBase *CBClone = CB;
5850 for (unsigned J = 0; J < NumClones; J++) {
5851 // Copy 0 is the original function.
5852 if (J > 0)
5853 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
5854 // We do the promotion using the original name, so that the comparison
5855 // is against the name in the vtable. Then just below, change the new
5856 // direct call to call the cloned function.
5857 auto &DirectCall =
5858 pgo::promoteIndirectCall(*CBClone, TargetFunction, Candidate.Count,
5859 TotalCount, isSamplePGO, &ORE);
5860 auto *TargetToUse = TargetFunction;
5861 // Call original if this version calls the original version of its
5862 // callee.
5863 if (StackNode.Clones[J]) {
5864 TargetToUse =
5865 cast<Function>(M.getOrInsertFunction(
5866 getMemProfFuncName(TargetFunction->getName(),
5867 StackNode.Clones[J]),
5868 TargetFunction->getFunctionType())
5869 .getCallee());
5870 }
5871 DirectCall.setCalledFunction(TargetToUse);
5872 // During matching we generate synthetic VP metadata for indirect calls
5873 // not already having any, from the memprof profile's callee GUIDs. If
5874 // we subsequently promote and inline those callees, we currently lose
5875 // the ability to generate this synthetic VP metadata. Optionally apply
5876 // a noinline attribute to promoted direct calls, where the threshold is
5877 // set to capture synthetic VP metadata targets which get a count of 1.
5878 if (MemProfICPNoInlineThreshold &&
5879 Candidate.Count < MemProfICPNoInlineThreshold)
5880 DirectCall.setIsNoInline();
5881 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
5882 << ore::NV("Call", CBClone) << " in clone "
5883 << ore::NV("Caller", CBClone->getFunction())
5884 << " promoted and assigned to call function clone "
5885 << ore::NV("Callee", TargetToUse));
5886 }
5887
5888 // Update TotalCount (all clones should get same count above)
5889 TotalCount -= Candidate.Count;
5890 NumPromoted++;
5891 }
5892 // Adjust the MD.prof metadata for all clones, now that we have the new
5893 // TotalCount and the number promoted.
5894 CallBase *CBClone = CB;
5895 for (unsigned J = 0; J < NumClones; J++) {
5896 // Copy 0 is the original function.
5897 if (J > 0)
5898 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
5899 // First delete the old one.
5900 CBClone->setMetadata(LLVMContext::MD_prof, nullptr);
5901 // If all promoted, we don't need the MD.prof metadata.
5902 // Otherwise we need to update it with the remaining un-promoted records.
5903 if (TotalCount != 0)
5904 annotateValueSite(
5905 M, *CBClone, ArrayRef(Info.CandidateProfileData).slice(NumPromoted),
5906 TotalCount, IPVK_IndirectCallTarget, Info.NumCandidates);
5907 }
5908 }
5909}
5910
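// Driver for the whole-program phase (regular LTO IR or ThinLTO index):
// optionally dump/verify/export the graph, identify context clones, assign
// them to function clones, and report whether anything changed.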
5911template <typename DerivedCCG, typename FuncTy, typename CallTy>
5912bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
5913 if (DumpCCG) {
5914 dbgs() << "CCG before cloning:\n";
5915 dbgs() << *this;
5916 }
5917 if (ExportToDot)
5918 exportToDot("postbuild");
5919
5920 if (VerifyCCG) {
5921 check();
5922 }
5923
5924 identifyClones();
5925
5926 if (VerifyCCG) {
5927 check();
5928 }
5929
5930 if (DumpCCG) {
5931 dbgs() << "CCG after cloning:\n";
5932 dbgs() << *this;
5933 }
5934 if (ExportToDot)
5935 exportToDot("cloned");
5936
5937 bool Changed = assignFunctions();
5938
5939 if (DumpCCG) {
5940 dbgs() << "CCG after assigning function clones:\n";
5941 dbgs() << *this;
5942 }
5943 if (ExportToDot)
5944 exportToDot("clonefuncassign");
5945
5946 if (MemProfReportHintedSizes)
5947 printTotalSizes(errs());
5948
5949 return Changed;
5950}
5951
5952bool MemProfContextDisambiguation::processModule(
5953 Module &M,
5954 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
5955
5956 // If we have an import summary, then the cloning decisions were made during
5957 // the thin link on the index. Apply them and return.
5958 if (ImportSummary)
5959 return applyImport(M);
5960
5961 // TODO: If/when other types of memprof cloning are enabled beyond just for
5962 // hot and cold, we will need to change this to individually control the
5963 // AllocationType passed to addStackNodesForMIB during CCG construction.
5964 // Note that we specifically check this after applying imports above, so that
5965 // the option isn't needed to be passed to distributed ThinLTO backend
5966 // clang processes, which won't necessarily have visibility into the linker
5967 // dependences. Instead the information is communicated from the LTO link to
5968 // the backends via the combined summary index.
5969 if (!SupportsHotColdNew)
5970 return false;
5971
5972 ModuleCallsiteContextGraph CCG(M, OREGetter);
5973 return CCG.process();
5974}
5975
5976 MemProfContextDisambiguation::MemProfContextDisambiguation(
5977 const ModuleSummaryIndex *Summary, bool isSamplePGO)
5978 : ImportSummary(Summary), isSamplePGO(isSamplePGO) {
5979 // Check the dot graph printing options once here, to make sure we have valid
5980 // and expected combinations.
5981 if (DotGraphScope == DotScope::Alloc && !AllocIdForDot.getNumOccurrences())
5982 llvm::report_fatal_error(
5983 "-memprof-dot-scope=alloc requires -memprof-dot-alloc-id");
5984 if (DotGraphScope == DotScope::Context &&
5985 !ContextIdForDot.getNumOccurrences())
5986 llvm::report_fatal_error(
5987 "-memprof-dot-scope=context requires -memprof-dot-context-id");
5988 if (DotGraphScope == DotScope::All && AllocIdForDot.getNumOccurrences() &&
5989 ContextIdForDot.getNumOccurrences())
5990 llvm::report_fatal_error(
5991 "-memprof-dot-scope=all can't have both -memprof-dot-alloc-id and "
5992 "-memprof-dot-context-id");
5993 if (ImportSummary) {
5994 // The MemProfImportSummary should only be used for testing ThinLTO
5995 // distributed backend handling via opt, in which case we don't have a
5996 // summary from the pass pipeline.
5997 assert(MemProfImportSummary.empty());
5998 return;
5999 }
6000 if (MemProfImportSummary.empty())
6001 return;
6002
6003 auto ReadSummaryFile =
6004 errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
6005 if (!ReadSummaryFile) {
6006 logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
6007 "Error loading file '" + MemProfImportSummary +
6008 "': ");
6009 return;
6010 }
6011 auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
6012 if (!ImportSummaryForTestingOrErr) {
6013 logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
6014 "Error parsing file '" + MemProfImportSummary +
6015 "': ");
6016 return;
6017 }
6018 ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
6019 ImportSummary = ImportSummaryForTesting.get();
6020}
6021
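// New pass manager entry point for the IR-level pass; returns none() when the
// module was modified so downstream analyses are recomputed.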
6022 PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
6023 ModuleAnalysisManager &AM) {
6024 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
6025 auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
6026 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
6027 };
6028 if (!processModule(M, OREGetter))
6029 return PreservedAnalyses::all();
6030 return PreservedAnalyses::none();
6031}
6032
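// Thin link entry point: build the CallsiteContextGraph over the combined
// summary index and make cloning decisions there (no IR is modified here).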
6033 void MemProfContextDisambiguation::run(
6034 ModuleSummaryIndex &Index,
6035 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
6036 isPrevailing) {
6037 // TODO: If/when other types of memprof cloning are enabled beyond just for
6038 // hot and cold, we will need to change this to individually control the
6039 // AllocationType passed to addStackNodesForMIB during CCG construction.
6040 // The index was set from the option, so these should be in sync.
6041 assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
6042 if (!SupportsHotColdNew)
6043 return;
6044
6045 IndexCallsiteContextGraph CCG(Index, isPrevailing);
6046 CCG.process();
6047}