MemProfContextDisambiguation.cpp
1//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements support for context disambiguation of allocation
10// calls for profile guided heap optimization. Specifically, it uses Memprof
11// profiles which indicate context-specific allocation behavior (currently
12// distinguishing cold vs. not-cold memory allocations). Cloning is performed to
13// expose the cold allocation call contexts, and the allocation calls are
14// subsequently annotated with an attribute for later transformation.
15//
16// The transformations can be performed either directly on IR (regular LTO), or
17// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
18// Both types of LTO operate on the same base graph representation, which
19// uses CRTP to support either IR or Index formats.
20//
21//===----------------------------------------------------------------------===//
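//
// As a rough illustration of the transformation (the call graph and names
// here are hypothetical, not taken from a real profile), consider an
// allocation reached through two profiled contexts:
//
//   main -> A -> C -> new    (behaves cold)
//   main -> B -> C -> new    (behaves not cold)
//
// Because C is shared by both contexts, the single allocation call in C
// cannot be attributed one behavior. Cloning C (and, if needed, its callers)
// gives each context its own copy of the allocation call, so one copy can be
// annotated cold and the other not cold for the later transformation.
//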
22
24#include "llvm/ADT/DenseMap.h"
25#include "llvm/ADT/DenseSet.h"
26#include "llvm/ADT/MapVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
38#include "llvm/IR/Module.h"
40#include "llvm/Pass.h"
44#include "llvm/Support/SHA1.h"
46#include "llvm/Transforms/IPO.h"
50#include <deque>
51#include <sstream>
52#include <unordered_map>
53#include <vector>
54using namespace llvm;
55using namespace llvm::memprof;
56
57#define DEBUG_TYPE "memprof-context-disambiguation"
58
59STATISTIC(FunctionClonesAnalysis,
60 "Number of function clones created during whole program analysis");
61STATISTIC(FunctionClonesThinBackend,
62 "Number of function clones created during ThinLTO backend");
63STATISTIC(FunctionsClonedThinBackend,
64 "Number of functions that had clones created during ThinLTO backend");
65STATISTIC(
66 FunctionCloneDuplicatesThinBackend,
67 "Number of function clone duplicates detected during ThinLTO backend");
68STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
69 "cloned) during whole program analysis");
70STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
71 "during whole program analysis");
72STATISTIC(AllocTypeNotColdThinBackend,
73 "Number of not cold static allocations (possibly cloned) during "
74 "ThinLTO backend");
75STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
76 "(possibly cloned) during ThinLTO backend");
77STATISTIC(OrigAllocsThinBackend,
78 "Number of original (not cloned) allocations with memprof profiles "
79 "during ThinLTO backend");
80STATISTIC(
81 AllocVersionsThinBackend,
82 "Number of allocation versions (including clones) during ThinLTO backend");
83STATISTIC(MaxAllocVersionsThinBackend,
84 "Maximum number of allocation versions created for an original "
85 "allocation during ThinLTO backend");
86STATISTIC(UnclonableAllocsThinBackend,
87 "Number of unclonable ambigous allocations during ThinLTO backend");
88STATISTIC(RemovedEdgesWithMismatchedCallees,
89 "Number of edges removed due to mismatched callees (profiled vs IR)");
90STATISTIC(FoundProfiledCalleeCount,
91 "Number of profiled callees found via tail calls");
92STATISTIC(FoundProfiledCalleeDepth,
93 "Aggregate depth of profiled callees found via tail calls");
94STATISTIC(FoundProfiledCalleeMaxDepth,
95 "Maximum depth of profiled callees found via tail calls");
96STATISTIC(FoundProfiledCalleeNonUniquelyCount,
97 "Number of profiled callees found via multiple tail call chains");
98STATISTIC(DeferredBackedges, "Number of backedges with deferred cloning");
99STATISTIC(NewMergedNodes, "Number of new nodes created during merging");
100STATISTIC(NonNewMergedNodes, "Number of non new nodes used during merging");
101STATISTIC(MissingAllocForContextId,
102 "Number of missing alloc nodes for context ids");
103STATISTIC(SkippedCallsCloning,
104 "Number of calls skipped during cloning due to unexpected operand");
105STATISTIC(MismatchedCloneAssignments,
106 "Number of callsites assigned to call multiple non-matching clones");
107STATISTIC(TotalMergeInvokes, "Number of merge invocations for nodes");
108STATISTIC(TotalMergeIters, "Number of merge iterations for nodes");
109STATISTIC(MaxMergeIters, "Max merge iterations for nodes");
110STATISTIC(NumImportantContextIds, "Number of important context ids");
111STATISTIC(NumFixupEdgeIdsInserted, "Number of fixup edge ids inserted");
112STATISTIC(NumFixupEdgesAdded, "Number of fixup edges added");
113STATISTIC(NumFixedContexts, "Number of contexts with fixed edges");
114
116 "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
117 cl::value_desc("filename"),
118 cl::desc("Specify the path prefix of the MemProf dot files."));
119
120static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
122 cl::desc("Export graph to dot files."));
123
124// TODO: Remove this option once new handling is validated more widely.
126 "memprof-merge-iteration", cl::init(true), cl::Hidden,
127 cl::desc("Iteratively apply merging on a node to catch new callers"));
128
129// How much of the graph to export to dot.
130enum DotScope {
131 All, // The full CCG graph.
132 Alloc, // Only contexts for the specified allocation.
133 Context, // Only the specified context.
134};
135
137 "memprof-dot-scope", cl::desc("Scope of graph to export to dot"),
140 clEnumValN(DotScope::All, "all", "Export full callsite graph"),
142 "Export only nodes with contexts feeding given "
143 "-memprof-dot-alloc-id"),
144 clEnumValN(DotScope::Context, "context",
145 "Export only nodes with given -memprof-dot-context-id")));
146
147static cl::opt<unsigned>
148 AllocIdForDot("memprof-dot-alloc-id", cl::init(0), cl::Hidden,
149 cl::desc("Id of alloc to export if -memprof-dot-scope=alloc "
150 "or to highlight if -memprof-dot-scope=all"));
151
153 "memprof-dot-context-id", cl::init(0), cl::Hidden,
154 cl::desc("Id of context to export if -memprof-dot-scope=context or to "
155 "highlight otherwise"));
156
157static cl::opt<bool>
158 DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
159 cl::desc("Dump CallingContextGraph to stdout after each stage."));
160
161static cl::opt<bool>
162 VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
163 cl::desc("Perform verification checks on CallingContextGraph."));
164
165static cl::opt<bool>
166 VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
167 cl::desc("Perform frequent verification checks on nodes."));
168
170 "memprof-import-summary",
171 cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
172 cl::Hidden);
173
174static cl::opt<unsigned>
175 TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
176 cl::Hidden,
177 cl::desc("Max depth to recursively search for missing "
178 "frames through tail calls."));
179
180// Optionally enable cloning of callsites involved with recursive cycles
182 "memprof-allow-recursive-callsites", cl::init(true), cl::Hidden,
183 cl::desc("Allow cloning of callsites involved in recursive cycles"));
184
186 "memprof-clone-recursive-contexts", cl::init(true), cl::Hidden,
187 cl::desc("Allow cloning of contexts through recursive cycles"));
188
189// Generally this is needed for correct assignment of allocation clones to
190// function clones; however, allow it to be disabled for debugging while the
191// functionality is new and being tested more widely.
192static cl::opt<bool>
193 MergeClones("memprof-merge-clones", cl::init(true), cl::Hidden,
194 cl::desc("Merge clones before assigning functions"));
195
196// When disabled, try to detect and prevent cloning of recursive contexts.
197// This is only necessary until we support cloning through recursive cycles.
198// Leave on by default for now, as disabling requires a little bit of compile
199// time overhead and doesn't affect correctness; it will just inflate the cold
200// hinted bytes reporting a bit when -memprof-report-hinted-sizes is enabled.
202 "memprof-allow-recursive-contexts", cl::init(true), cl::Hidden,
203 cl::desc("Allow cloning of contexts having recursive cycles"));
204
205// Set the minimum absolute count threshold for allowing inlining of indirect
206// calls promoted during cloning.
208 "memprof-icp-noinline-threshold", cl::init(2), cl::Hidden,
209 cl::desc("Minimum absolute count for promoted target to be inlinable"));
210
211namespace llvm {
213 "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
214 cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));
215
216// Indicate we are linking with an allocator that supports hot/cold operator
217// new interfaces.
219 "supports-hot-cold-new", cl::init(false), cl::Hidden,
220 cl::desc("Linking with hot/cold operator new interfaces"));
221
223 "memprof-require-definition-for-promotion", cl::init(false), cl::Hidden,
224 cl::desc(
225 "Require target function definition when promoting indirect calls"));
226
229
231 "memprof-top-n-important", cl::init(10), cl::Hidden,
232 cl::desc("Number of largest cold contexts to consider important"));
233
235 "memprof-fixup-important", cl::init(true), cl::Hidden,
236 cl::desc("Enables edge fixup for important contexts"));
237
239
240} // namespace llvm
241
242namespace {
243
244/// CRTP base for graphs built from either IR or ThinLTO summary index.
245///
246/// The graph represents the call contexts in all memprof metadata on allocation
247/// calls, with nodes for the allocations themselves, as well as for the calls
248/// in each context. The graph is initially built from the allocation memprof
249/// metadata (or summary) MIBs. It is then updated to match calls with callsite
250/// metadata onto the nodes, updating it to reflect any inlining performed on
251/// those calls.
252///
253/// Each MIB (representing an allocation's call context with allocation
254/// behavior) is assigned a unique context id during the graph build. The edges
255/// and nodes in the graph are decorated with the context ids they carry. This
256/// is used to correctly update the graph when cloning is performed so that we
257/// can uniquify the context for a single (possibly cloned) allocation.
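///
/// The CRTP dispatch is virtual-call free: the base class forwards each
/// format-specific query to the derived graph type, e.g. (simplified sketch
/// of the pattern used by the members further below):
///
///   uint64_t getStackId(uint64_t IdOrIndex) const {
///     return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
///   }
///
/// so ModuleCallsiteContextGraph (IR) and IndexCallsiteContextGraph (summary
/// index) only provide the small set of hooks that differ between formats.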
258template <typename DerivedCCG, typename FuncTy, typename CallTy>
259class CallsiteContextGraph {
260public:
261 CallsiteContextGraph() = default;
262 CallsiteContextGraph(const CallsiteContextGraph &) = default;
263 CallsiteContextGraph(CallsiteContextGraph &&) = default;
264
265 /// Main entry point to perform analysis and transformations on graph.
266 bool process();
267
268 /// Perform cloning on the graph necessary to uniquely identify the allocation
269 /// behavior of an allocation based on its context.
270 void identifyClones();
271
272 /// Assign callsite clones to functions, cloning functions as needed to
273 /// accommodate the combinations of their callsite clones reached by callers.
274 /// For regular LTO this clones functions and callsites in the IR, but for
275 /// ThinLTO the cloning decisions are noted in the summaries and later applied
276 /// in applyImport.
277 bool assignFunctions();
278
279 void dump() const;
280 void print(raw_ostream &OS) const;
281 void printTotalSizes(raw_ostream &OS) const;
282
284 const CallsiteContextGraph &CCG) {
285 CCG.print(OS);
286 return OS;
287 }
288
289 friend struct GraphTraits<
290 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
291 friend struct DOTGraphTraits<
292 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
293
294 void exportToDot(std::string Label) const;
295
296 /// Represents a function clone via FuncTy pointer and clone number pair.
297 struct FuncInfo final
298 : public std::pair<FuncTy *, unsigned /*Clone number*/> {
299 using Base = std::pair<FuncTy *, unsigned>;
300 FuncInfo(const Base &B) : Base(B) {}
301 FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
302 explicit operator bool() const { return this->first != nullptr; }
303 FuncTy *func() const { return this->first; }
304 unsigned cloneNo() const { return this->second; }
305 };
306
307 /// Represents a callsite clone via CallTy and clone number pair.
308 struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
309 using Base = std::pair<CallTy, unsigned>;
310 CallInfo(const Base &B) : Base(B) {}
311 CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
312 : Base(Call, CloneNo) {}
313 explicit operator bool() const { return (bool)this->first; }
314 CallTy call() const { return this->first; }
315 unsigned cloneNo() const { return this->second; }
316 void setCloneNo(unsigned N) { this->second = N; }
317 void print(raw_ostream &OS) const {
318 if (!operator bool()) {
319 assert(!cloneNo());
320 OS << "null Call";
321 return;
322 }
323 call()->print(OS);
324 OS << "\t(clone " << cloneNo() << ")";
325 }
326 void dump() const {
327 print(dbgs());
328 dbgs() << "\n";
329 }
330 friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
331 Call.print(OS);
332 return OS;
333 }
334 };
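// Illustrative note (a sketch using the wrappers above): a callsite clone is
// just a (call, clone number) pair, e.g.
//
//   CallInfo CI(SomeCall, /*CloneNo=*/2);   // SomeCall is a hypothetical CallTy value
//   assert(CI.call() == SomeCall && CI.cloneNo() == 2);
//
// and a default-constructed CallInfo() is the null sentinel: operator bool is
// false and cloneNo() is 0, which print() asserts above.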
335
336 struct ContextEdge;
337
338 /// Node in the Callsite Context Graph
339 struct ContextNode {
340 // Assigned to nodes as they are created, useful for debugging.
341 unsigned NodeId = 0;
342
343 // Keep this for now since in the IR case where we have an Instruction* it
344 // is not as immediately discoverable. Used for printing richer information
345 // when dumping graph.
346 bool IsAllocation;
347
348 // Keeps track of when the Call was reset to null because there was
349 // recursion.
350 bool Recursive = false;
351
352 // This will be formed by ORing together the AllocationType enum values
353 // for contexts including this node.
354 uint8_t AllocTypes = 0;
355
356 // The corresponding allocation or interior call. This is the primary call
357 // for which we have created this node.
358 CallInfo Call;
359
360 // List of other calls that can be treated the same as the primary call
361 // through cloning. I.e. located in the same function and have the same
362 // (possibly pruned) stack ids. They will be updated the same way as the
363 // primary call when assigning to function clones.
364 SmallVector<CallInfo, 0> MatchingCalls;
365
366 // For alloc nodes this is a unique id assigned when constructed, and for
367 // callsite stack nodes it is the original stack id when the node is
368 // constructed from the memprof MIB metadata on the alloc nodes. Note that
369 // this is only used when matching callsite metadata onto the stack nodes
370 // created when processing the allocation memprof MIBs, and for labeling
371 // nodes in the dot graph. Therefore we don't bother to assign a value for
372 // clones.
373 uint64_t OrigStackOrAllocId = 0;
374
375 // Edges to all callees in the profiled call stacks.
376 // TODO: Should this be a map (from Callee node) for more efficient lookup?
377 std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;
378
379 // Edges to all callers in the profiled call stacks.
380 // TODO: Should this be a map (from Caller node) for more efficient lookup?
381 std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
382
383 // Returns true if we also need to look at the caller edges when determining
384 // the node context ids and allocation type.
385 bool useCallerEdgesForContextInfo() const {
386 // Typically if the callee edges are empty either the caller edges are
387 // also empty, or this is an allocation (leaf node). However, if we are
388 // allowing recursive callsites and contexts this will be violated for
389 // incompletely cloned recursive cycles.
390 assert(!CalleeEdges.empty() || CallerEdges.empty() || IsAllocation ||
392 // When cloning for a recursive context, we might be in the
393 // midst of cloning for a recurrence and have moved context ids off of a
394 // caller edge onto the clone but not yet off of the incoming caller
395 // (back) edge. If we don't look at those we miss the fact that this node
396 // still has context ids of interest.
397 return IsAllocation || CloneRecursiveContexts;
398 }
399
400 // Compute the context ids for this node from the union of its edge context
401 // ids.
402 DenseSet<uint32_t> getContextIds() const {
403 unsigned Count = 0;
404 // Compute the number of ids for reserve below. In general we only need to
405 // look at one set of edges, typically the callee edges, since other than
406 // allocations and in some cases during recursion cloning, all the context
407 // ids on the callers should also flow out via callee edges.
408 for (auto &Edge : CalleeEdges.empty() ? CallerEdges : CalleeEdges)
409 Count += Edge->getContextIds().size();
410 DenseSet<uint32_t> ContextIds;
411 ContextIds.reserve(Count);
412 auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
413 CalleeEdges, useCallerEdgesForContextInfo()
414 ? CallerEdges
415 : std::vector<std::shared_ptr<ContextEdge>>());
416 for (const auto &Edge : Edges)
417 ContextIds.insert_range(Edge->getContextIds());
418 return ContextIds;
419 }
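// Note on the llvm::concat idiom above (and in the two helpers below): it
// yields one range that walks CalleeEdges followed by the second range, and
// passing an empty temporary vector as that second range effectively limits
// the walk to the callee edges when the caller edges are not needed.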
420
421 // Compute the allocation type for this node from the OR of its edge
422 // allocation types.
423 uint8_t computeAllocType() const {
424 uint8_t BothTypes =
425 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
426 uint8_t AllocType = (uint8_t)AllocationType::None;
427 auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
428 CalleeEdges, useCallerEdgesForContextInfo()
429 ? CallerEdges
430 : std::vector<std::shared_ptr<ContextEdge>>());
431 for (const auto &Edge : Edges) {
432 AllocType |= Edge->AllocTypes;
433 // Bail early if alloc type reached both, no further refinement.
434 if (AllocType == BothTypes)
435 return AllocType;
436 }
437 return AllocType;
438 }
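// For example, a node reached by one edge whose contexts are all cold and
// another whose contexts are all not cold ends up with AllocTypes equal to
// (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold, the
// ambiguous state that cloning later tries to split apart.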
439
440 // The context ids set for this node is empty if its edge context ids are
441 // also all empty.
442 bool emptyContextIds() const {
443 auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
444 CalleeEdges, useCallerEdgesForContextInfo()
445 ? CallerEdges
446 : std::vector<std::shared_ptr<ContextEdge>>());
447 for (const auto &Edge : Edges) {
448 if (!Edge->getContextIds().empty())
449 return false;
450 }
451 return true;
452 }
453
454 // List of clones of this ContextNode, initially empty.
455 std::vector<ContextNode *> Clones;
456
457 // If a clone, points to the original uncloned node.
458 ContextNode *CloneOf = nullptr;
459
460 ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}
461
462 ContextNode(bool IsAllocation, CallInfo C)
463 : IsAllocation(IsAllocation), Call(C) {}
464
465 void addClone(ContextNode *Clone) {
466 if (CloneOf) {
467 CloneOf->Clones.push_back(Clone);
468 Clone->CloneOf = CloneOf;
469 } else {
470 Clones.push_back(Clone);
471 assert(!Clone->CloneOf);
472 Clone->CloneOf = this;
473 }
474 }
475
476 ContextNode *getOrigNode() {
477 if (!CloneOf)
478 return this;
479 return CloneOf;
480 }
481
482 void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
483 unsigned int ContextId);
484
485 ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
486 ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
487 void eraseCalleeEdge(const ContextEdge *Edge);
488 void eraseCallerEdge(const ContextEdge *Edge);
489
490 void setCall(CallInfo C) { Call = C; }
491
492 bool hasCall() const { return (bool)Call.call(); }
493
494 void printCall(raw_ostream &OS) const { Call.print(OS); }
495
496 // True if this node was effectively removed from the graph, in which case
497 // it should have an allocation type of None and empty context ids.
498 bool isRemoved() const {
499 // Typically if the callee edges are empty either the caller edges are
500 // also empty, or this is an allocation (leaf node). However, if we are
501 // allowing recursive callsites and contexts this will be violated for
502 // incompletely cloned recursive cycles.
504 (AllocTypes == (uint8_t)AllocationType::None) ==
505 emptyContextIds());
506 return AllocTypes == (uint8_t)AllocationType::None;
507 }
508
509 void dump() const;
510 void print(raw_ostream &OS) const;
511
512 friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
513 Node.print(OS);
514 return OS;
515 }
516 };
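// A small worked sketch of how ContextNode and ContextEdge compose (ids are
// arbitrary): an allocation node carrying context ids {1, 2} typically has
// one caller edge per distinct profiled caller frame, say an edge with ids
// {1} to the node for frame A and an edge with ids {2} to the node for frame
// B; those stack nodes in turn have caller edges propagating the same ids
// further up their respective contexts, which is what lets cloning split the
// allocation by context id.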
517
518 /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
519 /// callee.
520 struct ContextEdge {
521 ContextNode *Callee;
522 ContextNode *Caller;
523
524 // This will be formed by ORing together the AllocationType enum values
525 // for contexts including this edge.
526 uint8_t AllocTypes = 0;
527
528 // Set just before initiating cloning when cloning of recursive contexts is
529 // enabled. Used to defer cloning of backedges until we have done cloning of
530 // the callee node for non-backedge caller edges. This exposes cloning
531 // opportunities through the backedge of the cycle.
532 // TODO: Note that this is not updated during cloning, and it is unclear
533 // whether that would be needed.
534 bool IsBackedge = false;
535
536 // The set of IDs for contexts including this edge.
537 DenseSet<uint32_t> ContextIds;
538
539 ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
540 DenseSet<uint32_t> ContextIds)
541 : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
542 ContextIds(std::move(ContextIds)) {}
543
544 DenseSet<uint32_t> &getContextIds() { return ContextIds; }
545
546 // Helper to clear the fields of this edge when we are removing it from the
547 // graph.
548 inline void clear() {
549 ContextIds.clear();
550 AllocTypes = (uint8_t)AllocationType::None;
551 Caller = nullptr;
552 Callee = nullptr;
553 }
554
555 // Check if edge was removed from the graph. This is useful while iterating
556 // over a copy of edge lists when performing operations that mutate the
557 // graph in ways that might remove one of the edges.
558 inline bool isRemoved() const {
559 if (Callee || Caller)
560 return false;
561 // Any edges that have been removed from the graph but are still in a
562 // shared_ptr somewhere should have all fields null'ed out by clear()
563 // above.
564 assert(AllocTypes == (uint8_t)AllocationType::None);
565 assert(ContextIds.empty());
566 return true;
567 }
568
569 void dump() const;
570 void print(raw_ostream &OS) const;
571
572 friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
573 Edge.print(OS);
574 return OS;
575 }
576 };
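// Ownership/removal note: each edge is held by shared_ptr from both its
// Caller->CalleeEdges and Callee->CallerEdges lists, so removal clear()s the
// edge and then erases it from both lists. Code that iterates a snapshot of
// an edge list can therefore skip stale entries, e.g. (sketch):
//
//   auto CallerEdgesCopy = Node->CallerEdges;  // snapshot before mutation
//   for (auto &Edge : CallerEdgesCopy)
//     if (Edge->isRemoved())
//       continue;  // dropped from the graph by an earlier mutation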
577
578 /// Helpers to remove edges that have allocation type None (due to not
579 /// carrying any context ids) after transformations.
580 void removeNoneTypeCalleeEdges(ContextNode *Node);
581 void removeNoneTypeCallerEdges(ContextNode *Node);
582 void
583 recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
584 DenseSet<const ContextNode *> &Visited);
585
586protected:
587 /// Get a list of nodes corresponding to the stack ids in the given callsite
588 /// context.
589 template <class NodeT, class IteratorT>
590 std::vector<uint64_t>
591 getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);
592
593 /// Adds nodes for the given allocation and any stack ids on its memprof MIB
594 /// metadata (or summary).
595 ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);
596
597 /// Adds nodes for the given MIB stack ids.
598 template <class NodeT, class IteratorT>
599 void addStackNodesForMIB(
600 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
601 CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
602 ArrayRef<ContextTotalSize> ContextSizeInfo,
603 std::map<uint64_t, uint32_t> &TotalSizeToContextIdTopNCold);
604
605 /// Matches all callsite metadata (or summary) to the nodes created for
606 /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
607 /// inlining performed on those callsite instructions.
608 void updateStackNodes();
609
610 /// Optionally fixup edges for the N largest cold contexts to better enable
611 /// cloning. This is particularly helpful if the context includes recursion
612 /// as well as inlining, resulting in a single stack node for multiple stack
613 /// ids in the context. With recursion it is particularly difficult to get the
614 /// edge updates correct as in the general case we have lost the original
615 /// stack id ordering for the context. Do more expensive fixup for the largest
616 /// contexts, controlled by MemProfTopNImportant and MemProfFixupImportant.
617 void fixupImportantContexts();
618
619 /// Update graph to conservatively handle any callsite stack nodes that target
620 /// multiple different callee target functions.
621 void handleCallsitesWithMultipleTargets();
622
623 /// Mark backedges via the standard DFS based backedge algorithm.
624 void markBackedges();
625
626 /// Merge clones generated during cloning for different allocations but that
627 /// are called by the same caller node, to ensure proper function assignment.
628 void mergeClones();
629
630 // Try to partition calls on the given node (already placed into the AllCalls
631 // array) by callee function, creating new copies of Node as needed to hold
632 // calls with different callees, and moving the callee edges appropriately.
633 // Returns true if partitioning was successful.
634 bool partitionCallsByCallee(
635 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
636 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode);
637
638 /// Save lists of calls with MemProf metadata in each function, for faster
639 /// iteration.
640 MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
641
642 /// Map from callsite node to the enclosing caller function.
643 std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
644
645 // When exporting to dot, and an allocation id is specified, contains the
646 // context ids on that allocation.
647 DenseSet<uint32_t> DotAllocContextIds;
648
649private:
650 using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;
651
652 // Structure to keep track of information for each call as we are matching
653 // non-allocation callsites onto context nodes created from the allocation
654 // call metadata / summary contexts.
655 struct CallContextInfo {
656 // The callsite we're trying to match.
657 CallTy Call;
658 // The callsites stack ids that have a context node in the graph.
659 std::vector<uint64_t> StackIds;
660 // The function containing this callsite.
661 const FuncTy *Func;
662 // Initially empty, if needed this will be updated to contain the context
663 // ids for use in a new context node created for this callsite.
664 DenseSet<uint32_t> ContextIds;
665 };
666
667 /// Helper to remove edge from graph, updating edge iterator if it is provided
668 /// (in which case CalleeIter indicates which edge list is being iterated).
669 /// This will also perform the necessary clearing of the ContextEdge members
670 /// to enable later checking if the edge has been removed (since we may have
671 /// other copies of the shared_ptr in existence, and in fact rely on this to
672 /// enable removal while iterating over a copy of a node's edge list).
673 void removeEdgeFromGraph(ContextEdge *Edge, EdgeIter *EI = nullptr,
674 bool CalleeIter = true);
675
676 /// Assigns the given Node to calls at or inlined into the location with
677 /// the Node's stack id, after post order traversing and processing its
678 /// caller nodes. Uses the call information recorded in the given
679 /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
680 /// as needed. Called by updateStackNodes which sets up the given
681 /// StackIdToMatchingCalls map.
682 void assignStackNodesPostOrder(
683 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
684 DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls,
685 DenseMap<CallInfo, CallInfo> &CallToMatchingCall,
686 const DenseSet<uint32_t> &ImportantContextIds);
687
688 /// Duplicates the given set of context ids, updating the provided
689 /// map from each original id with the newly generated context ids,
690 /// and returning the new duplicated id set.
691 DenseSet<uint32_t> duplicateContextIds(
692 const DenseSet<uint32_t> &StackSequenceContextIds,
693 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
694
695 /// Propagates all duplicated context ids across the graph.
696 void propagateDuplicateContextIds(
697 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
698
699 /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
700 /// else to its callers. Also updates OrigNode's edges to remove any context
701 /// ids moved to the newly created edge.
702 void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
703 bool TowardsCallee,
704 DenseSet<uint32_t> RemainingContextIds);
705
706 /// Get the stack id corresponding to the given Id or Index (for IR this will
707 /// return itself, for a summary index this will return the id recorded in the
708 /// index for that stack id index value).
709 uint64_t getStackId(uint64_t IdOrIndex) const {
710 return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
711 }
712
713 /// Returns true if the given call targets the callee of the given edge, or if
714 /// we were able to identify the call chain through intermediate tail calls.
715 /// In the latter case new context nodes are added to the graph for the
716 /// identified tail calls, and their synthesized nodes are added to
717 /// TailCallToContextNodeMap. The EdgeIter is updated in the latter case for
718 /// the updated edges and to prepare it for an increment in the caller.
719 bool
720 calleesMatch(CallTy Call, EdgeIter &EI,
721 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);
722
723 // Return the callee function of the given call, or nullptr if it can't be
724 // determined
725 const FuncTy *getCalleeFunc(CallTy Call) {
726 return static_cast<DerivedCCG *>(this)->getCalleeFunc(Call);
727 }
728
729 /// Returns true if the given call targets the given function, or if we were
730 /// able to identify the call chain through intermediate tail calls (in which
731 /// case FoundCalleeChain will be populated).
732 bool calleeMatchesFunc(
733 CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
734 std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
735 return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
736 Call, Func, CallerFunc, FoundCalleeChain);
737 }
738
739 /// Returns true if both call instructions have the same callee.
740 bool sameCallee(CallTy Call1, CallTy Call2) {
741 return static_cast<DerivedCCG *>(this)->sameCallee(Call1, Call2);
742 }
743
744 /// Get a list of nodes corresponding to the stack ids in the given
745 /// callsite's context.
746 std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
747 return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
748 Call);
749 }
750
751 /// Get the last stack id in the context for callsite.
752 uint64_t getLastStackId(CallTy Call) {
753 return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
754 }
755
756 /// Update the allocation call to record type of allocated memory.
757 void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
758 AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
759 static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
760 }
761
762 /// Get the AllocationType assigned to the given allocation instruction clone.
763 AllocationType getAllocationCallType(const CallInfo &Call) const {
764 return static_cast<const DerivedCCG *>(this)->getAllocationCallType(Call);
765 }
766
767 /// Update non-allocation call to invoke (possibly cloned) function
768 /// CalleeFunc.
769 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
770 static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
771 }
772
773 /// Clone the given function for the given callsite, recording mapping of all
774 /// of the functions tracked calls to their new versions in the CallMap.
775 /// Assigns new clones to clone number CloneNo.
776 FuncInfo cloneFunctionForCallsite(
777 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
778 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
779 return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
780 Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
781 }
782
783 /// Gets a label to use in the dot graph for the given call clone in the given
784 /// function.
785 std::string getLabel(const FuncTy *Func, const CallTy Call,
786 unsigned CloneNo) const {
787 return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
788 }
789
790 // Create and return a new ContextNode.
791 ContextNode *createNewNode(bool IsAllocation, const FuncTy *F = nullptr,
792 CallInfo C = CallInfo()) {
793 NodeOwner.push_back(std::make_unique<ContextNode>(IsAllocation, C));
794 auto *NewNode = NodeOwner.back().get();
795 if (F)
796 NodeToCallingFunc[NewNode] = F;
797 NewNode->NodeId = NodeOwner.size();
798 return NewNode;
799 }
800
801 /// Helpers to find the node corresponding to the given call or stackid.
802 ContextNode *getNodeForInst(const CallInfo &C);
803 ContextNode *getNodeForAlloc(const CallInfo &C);
804 ContextNode *getNodeForStackId(uint64_t StackId);
805
806 /// Computes the alloc type corresponding to the given context ids, by
807 /// unioning their recorded alloc types.
808 uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds) const;
809
810 /// Returns the allocation type of the intersection of the contexts of two
811 /// nodes (based on their provided context id sets), optimized for the case
812 /// when Node1Ids is smaller than Node2Ids.
813 uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
814 const DenseSet<uint32_t> &Node2Ids) const;
815
816 /// Returns the allocation type of the intersection of the contexts of two
817 /// nodes (based on their provided context id sets).
818 uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
819 const DenseSet<uint32_t> &Node2Ids) const;
820
821 /// Create a clone of Edge's callee and move Edge to that new callee node,
822 /// performing the necessary context id and allocation type updates.
823 /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
824 /// moved to an edge to the new callee.
825 ContextNode *
826 moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
827 DenseSet<uint32_t> ContextIdsToMove = {});
828
829 /// Change the callee of Edge to existing callee clone NewCallee, performing
830 /// the necessary context id and allocation type updates.
831 /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
832 /// moved to an edge to the new callee.
833 void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
834 ContextNode *NewCallee,
835 bool NewClone = false,
836 DenseSet<uint32_t> ContextIdsToMove = {});
837
838 /// Change the caller of the edge at the given callee edge iterator to be
839 /// NewCaller, performing the necessary context id and allocation type
840 /// updates. This is similar to the above moveEdgeToExistingCalleeClone, but
841 /// a simplified version of it as we always move the given edge and all of its
842 /// context ids.
843 void moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
844 ContextNode *NewCaller);
845
846 /// Recursive helper for marking backedges via DFS.
847 void markBackedges(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
848 DenseSet<const ContextNode *> &CurrentStack);
849
850 /// Recursive helper for merging clones.
851 void
852 mergeClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
853 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode);
854 /// Main worker for merging callee clones for a given node.
855 void mergeNodeCalleeClones(
856 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
857 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode);
858 /// Helper to find other callers of the given set of callee edges that can
859 /// share the same callee merge node.
860 void findOtherCallersToShareMerge(
861 ContextNode *Node, std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges,
862 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode,
863 DenseSet<ContextNode *> &OtherCallersToShareMerge);
864
865 /// Recursively perform cloning on the graph for the given Node and its
866 /// callers, in order to uniquely identify the allocation behavior of an
867 /// allocation given its context. The context ids of the allocation being
868 /// processed are given in AllocContextIds.
869 void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
870 const DenseSet<uint32_t> &AllocContextIds);
871
872 /// Map from each context ID to the AllocationType assigned to that context.
873 DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;
874
875 /// Map from each contextID to the profiled full contexts and their total
876 /// sizes (there may be more than one due to context trimming),
877 /// optionally populated when requested (via MemProfReportHintedSizes or
878 /// MinClonedColdBytePercent).
879 DenseMap<uint32_t, std::vector<ContextTotalSize>> ContextIdToContextSizeInfos;
880
881 /// Identifies the context node created for a stack id when adding the MIB
882 /// contexts to the graph. This is used to locate the context nodes when
883 /// trying to assign the corresponding callsites with those stack ids to these
884 /// nodes.
885 DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;
886
887 /// Saves information for the contexts identified as important (the largest
888 /// cold contexts up to MemProfTopNImportant).
889 struct ImportantContextInfo {
890 // The original list of leaf first stack ids corresponding to this context.
891 std::vector<uint64_t> StackIds;
892 // Max length of stack ids corresponding to a single stack ContextNode for
893 // this context (i.e. the max length of a key in StackIdsToNode below).
894 unsigned MaxLength = 0;
895 // Mapping of slices of the stack ids to the corresponding ContextNode
896 // (there can be multiple stack ids due to inlining). Populated when
897 // updating stack nodes while matching them to the IR or summary.
898 std::map<std::vector<uint64_t>, ContextNode *> StackIdsToNode;
899 };
900
901 // Map of important full context ids to information about each.
902 DenseMap<uint32_t, ImportantContextInfo> ImportantContextIdInfo;
903
904 // For each important context id found in Node (if any), records the list of
905 // stack ids that corresponded to the given callsite Node. There can be more
906 // than one in the case of inlining.
907 void recordStackNode(std::vector<uint64_t> &StackIds, ContextNode *Node,
908 // We pass in the Node's context ids to avoid the
909 // overhead of computing them as the caller already has
910 // them in some cases.
911 const DenseSet<uint32_t> &NodeContextIds,
912 const DenseSet<uint32_t> &ImportantContextIds) {
914 assert(ImportantContextIds.empty());
915 return;
916 }
917 DenseSet<uint32_t> Ids =
918 set_intersection(NodeContextIds, ImportantContextIds);
919 if (Ids.empty())
920 return;
921 auto Size = StackIds.size();
922 for (auto Id : Ids) {
923 auto &Entry = ImportantContextIdInfo[Id];
924 Entry.StackIdsToNode[StackIds] = Node;
925 // Keep track of the max to simplify later analysis.
926 if (Size > Entry.MaxLength)
927 Entry.MaxLength = Size;
928 }
929 }
930
931 /// Maps to track the calls to their corresponding nodes in the graph.
932 MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
933 MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;
934
935 /// Owner of all ContextNode unique_ptrs.
936 std::vector<std::unique_ptr<ContextNode>> NodeOwner;
937
938 /// Perform sanity checks on graph when requested.
939 void check() const;
940
941 /// Keeps track of the last unique context id assigned.
942 unsigned int LastContextId = 0;
943};
944
945template <typename DerivedCCG, typename FuncTy, typename CallTy>
946using ContextNode =
947 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
948template <typename DerivedCCG, typename FuncTy, typename CallTy>
949using ContextEdge =
950 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
951template <typename DerivedCCG, typename FuncTy, typename CallTy>
952using FuncInfo =
953 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
954template <typename DerivedCCG, typename FuncTy, typename CallTy>
955using CallInfo =
956 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;
957
958/// CRTP derived class for graphs built from IR (regular LTO).
959class ModuleCallsiteContextGraph
960 : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
961 Instruction *> {
962public:
963 ModuleCallsiteContextGraph(
964 Module &M,
965 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
966
967private:
968 friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
969 Instruction *>;
970
971 uint64_t getStackId(uint64_t IdOrIndex) const;
972 const Function *getCalleeFunc(Instruction *Call);
973 bool calleeMatchesFunc(
974 Instruction *Call, const Function *Func, const Function *CallerFunc,
975 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
976 bool sameCallee(Instruction *Call1, Instruction *Call2);
977 bool findProfiledCalleeThroughTailCalls(
978 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
979 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
980 bool &FoundMultipleCalleeChains);
981 uint64_t getLastStackId(Instruction *Call);
982 std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
983 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
984 AllocationType getAllocationCallType(const CallInfo &Call) const;
985 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
986 CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
987 Instruction *>::FuncInfo
988 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
989 DenseMap<CallInfo, CallInfo> &CallMap,
990 std::vector<CallInfo> &CallsWithMetadataInFunc,
991 unsigned CloneNo);
992 std::string getLabel(const Function *Func, const Instruction *Call,
993 unsigned CloneNo) const;
994
995 const Module &Mod;
996 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
997};
998
999/// Represents a call in the summary index graph, which can either be an
1000/// allocation or an interior callsite node in an allocation's context.
1001/// Holds a pointer to the corresponding data structure in the index.
1002struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
1003 IndexCall() : PointerUnion() {}
1004 IndexCall(std::nullptr_t) : IndexCall() {}
1005 IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
1006 IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
1007 IndexCall(PointerUnion PT) : PointerUnion(PT) {}
1008
1009 IndexCall *operator->() { return this; }
1010
1011 void print(raw_ostream &OS) const {
1012 PointerUnion<CallsiteInfo *, AllocInfo *> Base = *this;
1013 if (auto *AI = dyn_cast_if_present<AllocInfo *>(Base)) {
1014 OS << *AI;
1015 } else {
1016 auto *CI = dyn_cast_if_present<CallsiteInfo *>(Base);
1017 assert(CI);
1018 OS << *CI;
1019 }
1020 }
1021};
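// Usage sketch (hypothetical values): because IndexCall is a PointerUnion,
// and given the simplify_type specializations below, the wrapped summary
// record can be recovered with the usual casts, e.g.
//
//   IndexCall Call(SomeAllocInfoPtr);  // SomeAllocInfoPtr is illustrative
//   if (auto *AI = dyn_cast_if_present<AllocInfo *>(Call))
//     ; // allocation summary record
//   else if (auto *CI = dyn_cast_if_present<CallsiteInfo *>(Call))
//     ; // interior callsite summary record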
1022} // namespace
1023
1024namespace llvm {
1025template <> struct simplify_type<IndexCall> {
1026 using SimpleType = PointerUnion<CallsiteInfo *, AllocInfo *>;
1027 static SimpleType getSimplifiedValue(IndexCall &Val) { return Val; }
1028};
1029template <> struct simplify_type<const IndexCall> {
1030 using SimpleType = const PointerUnion<CallsiteInfo *, AllocInfo *>;
1031 static SimpleType getSimplifiedValue(const IndexCall &Val) { return Val; }
1032};
1033} // namespace llvm
1034
1035namespace {
1036/// CRTP derived class for graphs built from summary index (ThinLTO).
1037class IndexCallsiteContextGraph
1038 : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
1039 IndexCall> {
1040public:
1041 IndexCallsiteContextGraph(
1042 ModuleSummaryIndex &Index,
1043 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
1044 isPrevailing);
1045
1046 ~IndexCallsiteContextGraph() {
1047 // Now that we are done with the graph it is safe to add the new
1048 // CallsiteInfo structs to the function summary vectors. The graph nodes
1049 // point into locations within these vectors, so we don't want to add them
1050 // any earlier.
1051 for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
1052 auto *FS = I.first;
1053 for (auto &Callsite : I.second)
1054 FS->addCallsite(*Callsite.second);
1055 }
1056 }
1057
1058private:
1059 friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
1060 IndexCall>;
1061
1062 uint64_t getStackId(uint64_t IdOrIndex) const;
1063 const FunctionSummary *getCalleeFunc(IndexCall &Call);
1064 bool calleeMatchesFunc(
1065 IndexCall &Call, const FunctionSummary *Func,
1066 const FunctionSummary *CallerFunc,
1067 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
1068 bool sameCallee(IndexCall &Call1, IndexCall &Call2);
1069 bool findProfiledCalleeThroughTailCalls(
1070 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
1071 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
1072 bool &FoundMultipleCalleeChains);
1073 uint64_t getLastStackId(IndexCall &Call);
1074 std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
1075 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
1076 AllocationType getAllocationCallType(const CallInfo &Call) const;
1077 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
1078 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
1079 IndexCall>::FuncInfo
1080 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
1081 DenseMap<CallInfo, CallInfo> &CallMap,
1082 std::vector<CallInfo> &CallsWithMetadataInFunc,
1083 unsigned CloneNo);
1084 std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
1085 unsigned CloneNo) const;
1086
1087 // Saves a mapping from each function summary containing memprof records back
1088 // to its VI, for use in checking and debugging.
1089 std::map<const FunctionSummary *, ValueInfo> FSToVIMap;
1090
1091 const ModuleSummaryIndex &Index;
1092 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
1093 isPrevailing;
1094
1095 // Saves/owns the callsite info structures synthesized for missing tail call
1096 // frames that we discover while building the graph.
1097 // It maps from the summary of the function making the tail call, to a map
1098 // of callee ValueInfo to corresponding synthesized callsite info.
1099 std::unordered_map<FunctionSummary *,
1100 std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
1101 FunctionCalleesToSynthesizedCallsiteInfos;
1102};
1103} // namespace
1104
1105template <>
1106struct llvm::DenseMapInfo<CallsiteContextGraph<
1107 ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
1108 : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
1109template <>
1110struct llvm::DenseMapInfo<CallsiteContextGraph<
1111 IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
1112 : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
1113template <>
1114struct llvm::DenseMapInfo<IndexCall>
1115 : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
1116
1117namespace {
1118
1119// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
1120// type we should actually use on the corresponding allocation.
1121// If we can't clone a node that has NotCold+Cold alloc type, we will fall
1122// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
1123// from NotCold.
1124AllocationType allocTypeToUse(uint8_t AllocTypes) {
1125 assert(AllocTypes != (uint8_t)AllocationType::None);
1126 if (AllocTypes ==
1127 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
1128 return AllocationType::NotCold;
1129 else
1130 return (AllocationType)AllocTypes;
1131}
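// E.g. allocTypeToUse((uint8_t)AllocationType::Cold) is AllocationType::Cold,
// while the ambiguous NotCold|Cold input maps to AllocationType::NotCold, the
// conservative choice when a node is not (or cannot be) cloned.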
1132
1133// Helper to check if the alloc types for all edges recorded in the
1134// InAllocTypes vector match the alloc types for all edges in the Edges
1135// vector.
1136template <typename DerivedCCG, typename FuncTy, typename CallTy>
1137bool allocTypesMatch(
1138 const std::vector<uint8_t> &InAllocTypes,
1139 const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
1140 &Edges) {
1141 // This should be called only when the InAllocTypes vector was computed for
1142 // this set of Edges. Make sure the sizes are the same.
1143 assert(InAllocTypes.size() == Edges.size());
1144 return std::equal(
1145 InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(), Edges.end(),
1146 [](const uint8_t &l,
1147 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
1148 // Can share if one of the edges is None type - don't
1149 // care about the type along that edge as it doesn't
1150 // exist for those context ids.
1151 if (l == (uint8_t)AllocationType::None ||
1152 r->AllocTypes == (uint8_t)AllocationType::None)
1153 return true;
1154 return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
1155 });
1156}
1157
1158// Helper to check if the alloc types for all edges recorded in the
1159// InAllocTypes vector match the alloc types for callee edges in the given
1160// clone. Because the InAllocTypes were computed from the original node's callee
1161// edges, and other cloning could have happened after this clone was created, we
1162// need to find the matching clone callee edge, which may or may not exist.
1163template <typename DerivedCCG, typename FuncTy, typename CallTy>
1164bool allocTypesMatchClone(
1165 const std::vector<uint8_t> &InAllocTypes,
1166 const ContextNode<DerivedCCG, FuncTy, CallTy> *Clone) {
1167 const ContextNode<DerivedCCG, FuncTy, CallTy> *Node = Clone->CloneOf;
1168 assert(Node);
1169 // InAllocTypes should have been computed for the original node's callee
1170 // edges.
1171 assert(InAllocTypes.size() == Node->CalleeEdges.size());
1172 // First create a map of the clone callee edge callees to the edge alloc type.
1173 DenseMap<const ContextNode<DerivedCCG, FuncTy, CallTy> *, uint8_t>
1174 EdgeCalleeMap;
1175 for (const auto &E : Clone->CalleeEdges) {
1176 assert(!EdgeCalleeMap.contains(E->Callee));
1177 EdgeCalleeMap[E->Callee] = E->AllocTypes;
1178 }
1179 // Next, walk the original node's callees, and look for the corresponding
1180 // clone edge to that callee.
1181 for (unsigned I = 0; I < Node->CalleeEdges.size(); I++) {
1182 auto Iter = EdgeCalleeMap.find(Node->CalleeEdges[I]->Callee);
1183 // Not found is ok, we will simply add an edge if we use this clone.
1184 if (Iter == EdgeCalleeMap.end())
1185 continue;
1186 // Can share if one of the edges is None type - don't
1187 // care about the type along that edge as it doesn't
1188 // exist for those context ids.
1189 if (InAllocTypes[I] == (uint8_t)AllocationType::None ||
1190 Iter->second == (uint8_t)AllocationType::None)
1191 continue;
1192 if (allocTypeToUse(Iter->second) != allocTypeToUse(InAllocTypes[I]))
1193 return false;
1194 }
1195 return true;
1196}
1197
1198} // end anonymous namespace
1199
1200template <typename DerivedCCG, typename FuncTy, typename CallTy>
1201typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1202CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
1203 const CallInfo &C) {
1204 ContextNode *Node = getNodeForAlloc(C);
1205 if (Node)
1206 return Node;
1207
1208 return NonAllocationCallToContextNodeMap.lookup(C);
1209}
1210
1211template <typename DerivedCCG, typename FuncTy, typename CallTy>
1212typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1213CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
1214 const CallInfo &C) {
1215 return AllocationCallToContextNodeMap.lookup(C);
1216}
1217
1218template <typename DerivedCCG, typename FuncTy, typename CallTy>
1219typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1220CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
1221 uint64_t StackId) {
1222 auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
1223 if (StackEntryNode != StackEntryIdToContextNodeMap.end())
1224 return StackEntryNode->second;
1225 return nullptr;
1226}
1227
1228template <typename DerivedCCG, typename FuncTy, typename CallTy>
1229void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1230 addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
1231 unsigned int ContextId) {
1232 for (auto &Edge : CallerEdges) {
1233 if (Edge->Caller == Caller) {
1234 Edge->AllocTypes |= (uint8_t)AllocType;
1235 Edge->getContextIds().insert(ContextId);
1236 return;
1237 }
1238 }
1239 std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
1240 this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
1241 CallerEdges.push_back(Edge);
1242 Caller->CalleeEdges.push_back(Edge);
1243}
1244
1245template <typename DerivedCCG, typename FuncTy, typename CallTy>
1246void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::removeEdgeFromGraph(
1247 ContextEdge *Edge, EdgeIter *EI, bool CalleeIter) {
1248 assert(!EI || (*EI)->get() == Edge);
1249 assert(!Edge->isRemoved());
1250 // Save the Caller and Callee pointers so we can erase Edge from their edge
1251 // lists after clearing Edge below. We do the clearing first in case it is
1252 // destructed after removing from the edge lists (if those were the last
1253 // shared_ptr references to Edge).
1254 auto *Callee = Edge->Callee;
1255 auto *Caller = Edge->Caller;
1256
1257 // Make sure the edge fields are cleared out so we can properly detect
1258 // removed edges if Edge is not destructed because there is still a shared_ptr
1259 // reference.
1260 Edge->clear();
1261
1262#ifndef NDEBUG
1263 auto CalleeCallerCount = Callee->CallerEdges.size();
1264 auto CallerCalleeCount = Caller->CalleeEdges.size();
1265#endif
1266 if (!EI) {
1267 Callee->eraseCallerEdge(Edge);
1268 Caller->eraseCalleeEdge(Edge);
1269 } else if (CalleeIter) {
1270 Callee->eraseCallerEdge(Edge);
1271 *EI = Caller->CalleeEdges.erase(*EI);
1272 } else {
1273 Caller->eraseCalleeEdge(Edge);
1274 *EI = Callee->CallerEdges.erase(*EI);
1275 }
1276 assert(Callee->CallerEdges.size() < CalleeCallerCount);
1277 assert(Caller->CalleeEdges.size() < CallerCalleeCount);
1278}
1279
1280template <typename DerivedCCG, typename FuncTy, typename CallTy>
1281void CallsiteContextGraph<
1282 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
1283 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
1284 auto Edge = *EI;
1285 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1286 assert(Edge->ContextIds.empty());
1287 removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
1288 } else
1289 ++EI;
1290 }
1291}
1292
1293template <typename DerivedCCG, typename FuncTy, typename CallTy>
1294void CallsiteContextGraph<
1295 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCallerEdges(ContextNode *Node) {
1296 for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
1297 auto Edge = *EI;
1298 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1299 assert(Edge->ContextIds.empty());
1300 Edge->Caller->eraseCalleeEdge(Edge.get());
1301 EI = Node->CallerEdges.erase(EI);
1302 } else
1303 ++EI;
1304 }
1305}
1306
1307template <typename DerivedCCG, typename FuncTy, typename CallTy>
1308typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1309CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1310 findEdgeFromCallee(const ContextNode *Callee) {
1311 for (const auto &Edge : CalleeEdges)
1312 if (Edge->Callee == Callee)
1313 return Edge.get();
1314 return nullptr;
1315}
1316
1317template <typename DerivedCCG, typename FuncTy, typename CallTy>
1318typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1319CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1320 findEdgeFromCaller(const ContextNode *Caller) {
1321 for (const auto &Edge : CallerEdges)
1322 if (Edge->Caller == Caller)
1323 return Edge.get();
1324 return nullptr;
1325}
1326
1327template <typename DerivedCCG, typename FuncTy, typename CallTy>
1328void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1329 eraseCalleeEdge(const ContextEdge *Edge) {
1330 auto EI = llvm::find_if(
1331 CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
1332 return CalleeEdge.get() == Edge;
1333 });
1334 assert(EI != CalleeEdges.end());
1335 CalleeEdges.erase(EI);
1336}
1337
1338template <typename DerivedCCG, typename FuncTy, typename CallTy>
1339void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1340 eraseCallerEdge(const ContextEdge *Edge) {
1341 auto EI = llvm::find_if(
1342 CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
1343 return CallerEdge.get() == Edge;
1344 });
1345 assert(EI != CallerEdges.end());
1346 CallerEdges.erase(EI);
1347}
1348
1349template <typename DerivedCCG, typename FuncTy, typename CallTy>
1350uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
1351 DenseSet<uint32_t> &ContextIds) const {
1352 uint8_t BothTypes =
1353 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1354 uint8_t AllocType = (uint8_t)AllocationType::None;
1355 for (auto Id : ContextIds) {
1356 AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
1357 // Bail early if alloc type reached both, no further refinement.
1358 if (AllocType == BothTypes)
1359 return AllocType;
1360 }
1361 return AllocType;
1362}
1363
1364template <typename DerivedCCG, typename FuncTy, typename CallTy>
1365uint8_t
1366CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
1367 const DenseSet<uint32_t> &Node1Ids,
1368 const DenseSet<uint32_t> &Node2Ids) const {
1369 uint8_t BothTypes =
1370 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1371 uint8_t AllocType = (uint8_t)AllocationType::None;
1372 for (auto Id : Node1Ids) {
1373 if (!Node2Ids.count(Id))
1374 continue;
1375 AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
1376 // Bail early if alloc type reached both, no further refinement.
1377 if (AllocType == BothTypes)
1378 return AllocType;
1379 }
1380 return AllocType;
1381}
1382
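// Wrapper that orders the arguments so the implementation above iterates over
// the smaller id set while probing the larger one, keeping the cost of the
// intersection proportional to the smaller set.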
1383template <typename DerivedCCG, typename FuncTy, typename CallTy>
1384uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
1385 const DenseSet<uint32_t> &Node1Ids,
1386 const DenseSet<uint32_t> &Node2Ids) const {
1387 if (Node1Ids.size() < Node2Ids.size())
1388 return intersectAllocTypesImpl(Node1Ids, Node2Ids);
1389 else
1390 return intersectAllocTypesImpl(Node2Ids, Node1Ids);
1391}
1392
1393template <typename DerivedCCG, typename FuncTy, typename CallTy>
1394typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1395CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
1396 CallInfo Call, const FuncTy *F) {
1397 assert(!getNodeForAlloc(Call));
1398 ContextNode *AllocNode = createNewNode(/*IsAllocation=*/true, F, Call);
1399 AllocationCallToContextNodeMap[Call] = AllocNode;
1400 // Use LastContextId as a unique id for MIB allocation nodes.
1401 AllocNode->OrigStackOrAllocId = LastContextId;
1402 // Alloc type should be updated as we add in the MIBs. We should assert
1403 // afterwards that it is not still None.
1404 AllocNode->AllocTypes = (uint8_t)AllocationType::None;
1405
1406 return AllocNode;
1407}
1408
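// Render an AllocTypes bitmask as a string for printing (e.g. in dumps and
// dot output). A mask with both bits set yields the concatenation
// "NotColdCold".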
1409static std::string getAllocTypeString(uint8_t AllocTypes) {
1410 if (!AllocTypes)
1411 return "None";
1412 std::string Str;
1413 if (AllocTypes & (uint8_t)AllocationType::NotCold)
1414 Str += "NotCold";
1415 if (AllocTypes & (uint8_t)AllocationType::Cold)
1416 Str += "Cold";
1417 return Str;
1418}
1419
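// Add the context for a single MIB to the graph: assign it a fresh context
// id, record its allocation type (Hot is currently folded into NotCold), and
// create or update the chain of stack ContextNodes from the allocation
// outwards, with edges carrying the new context id. When MemProfTopNImportant
// is non-zero, the largest cold contexts by total size are also tracked as
// "important" so their full stack id sequences can be repaired later by
// fixupImportantContexts. For example, with MemProfTopNImportant == 2 and
// cold contexts of total size 10, 30 and 20 arriving in that order, sizes 10
// and 30 are recorded first, then the context of size 20 evicts the one of
// size 10.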
1420template <typename DerivedCCG, typename FuncTy, typename CallTy>
1421template <class NodeT, class IteratorT>
1422void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
1423 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
1424 CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
1425 ArrayRef<ContextTotalSize> ContextSizeInfo,
1426 std::map<uint64_t, uint32_t> &TotalSizeToContextIdTopNCold) {
1427 // Treat the hot alloc type as NotCold until the disambiguation for "hot"
1428 // is done.
1429 if (AllocType == AllocationType::Hot)
1430 AllocType = AllocationType::NotCold;
1431
1432 ContextIdToAllocationType[++LastContextId] = AllocType;
1433
1434 bool IsImportant = false;
1435 if (!ContextSizeInfo.empty()) {
1436 auto &Entry = ContextIdToContextSizeInfos[LastContextId];
1437 // If this is a cold allocation, and we are collecting non-zero largest
1438 // contexts, see if this is a candidate.
1439 if (AllocType == AllocationType::Cold && MemProfTopNImportant > 0) {
1440 uint64_t TotalCold = 0;
1441 for (auto &CSI : ContextSizeInfo)
1442 TotalCold += CSI.TotalSize;
1443 // Record this context if either we haven't found the first top-n largest
1444 // yet, or if it is larger than the smallest already recorded.
1445 if (TotalSizeToContextIdTopNCold.size() < MemProfTopNImportant ||
1446 // Since TotalSizeToContextIdTopNCold is a std::map, it is implicitly
1447 // sorted in ascending order of its key, which is the size.
1448 TotalCold > TotalSizeToContextIdTopNCold.begin()->first) {
1449 if (TotalSizeToContextIdTopNCold.size() == MemProfTopNImportant) {
1450 // Remove old one and its associated entries.
1451 auto IdToRemove = TotalSizeToContextIdTopNCold.begin()->second;
1452 TotalSizeToContextIdTopNCold.erase(
1453 TotalSizeToContextIdTopNCold.begin());
1454 assert(ImportantContextIdInfo.count(IdToRemove));
1455 ImportantContextIdInfo.erase(IdToRemove);
1456 }
1457 TotalSizeToContextIdTopNCold[TotalCold] = LastContextId;
1458 IsImportant = true;
1459 }
1460 }
1461 Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end());
1462 }
1463
1464 // Update alloc type and context ids for this MIB.
1465 AllocNode->AllocTypes |= (uint8_t)AllocType;
1466
1467 // Now add or update nodes for each stack id in alloc's context.
1468 // Later when processing the stack ids on non-alloc callsites we will adjust
1469 // for any inlining in the context.
1470 ContextNode *PrevNode = AllocNode;
1471 // Look for recursion (direct recursion should have been collapsed by
1472 // module summary analysis, here we should just be detecting mutual
1473 // recursion). Mark these nodes so we don't try to clone.
1474 SmallSet<uint64_t, 8> StackIdSet;
1475 // Skip any on the allocation call (inlining).
1476 for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
1477 ContextIter != StackContext.end(); ++ContextIter) {
1478 auto StackId = getStackId(*ContextIter);
1479 if (IsImportant)
1480 ImportantContextIdInfo[LastContextId].StackIds.push_back(StackId);
1481 ContextNode *StackNode = getNodeForStackId(StackId);
1482 if (!StackNode) {
1483 StackNode = createNewNode(/*IsAllocation=*/false);
1484 StackEntryIdToContextNodeMap[StackId] = StackNode;
1485 StackNode->OrigStackOrAllocId = StackId;
1486 }
1487 // Marking a node recursive will prevent its cloning completely, even for
1488 // non-recursive contexts flowing through it.
1489 if (!AllowRecursiveCallsites) {
1490 auto Ins = StackIdSet.insert(StackId);
1491 if (!Ins.second)
1492 StackNode->Recursive = true;
1493 }
1494 StackNode->AllocTypes |= (uint8_t)AllocType;
1495 PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
1496 PrevNode = StackNode;
1497 }
1498}
1499
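// Create a duplicate context id for each id in StackSequenceContextIds,
// recording the old-to-new mapping in OldToNewContextIds. Duplicates keep the
// allocation type of their originals; propagateDuplicateContextIds is invoked
// afterwards to add the new ids onto all edges reached from the allocation
// nodes that carry the corresponding original ids.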
1500template <typename DerivedCCG, typename FuncTy, typename CallTy>
1501DenseSet<uint32_t>
1502CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
1503 const DenseSet<uint32_t> &StackSequenceContextIds,
1504 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1505 DenseSet<uint32_t> NewContextIds;
1506 for (auto OldId : StackSequenceContextIds) {
1507 NewContextIds.insert(++LastContextId);
1508 OldToNewContextIds[OldId].insert(LastContextId);
1509 assert(ContextIdToAllocationType.count(OldId));
1510 // The new context has the same allocation type as original.
1511 ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
1512 if (DotAllocContextIds.contains(OldId))
1513 DotAllocContextIds.insert(LastContextId);
1514 }
1515 return NewContextIds;
1516}
1517
1518template <typename DerivedCCG, typename FuncTy, typename CallTy>
1519void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1520 propagateDuplicateContextIds(
1521 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1522 // Build a set of duplicated context ids corresponding to the input id set.
1523 auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
1524 DenseSet<uint32_t> NewIds;
1525 for (auto Id : ContextIds)
1526 if (auto NewId = OldToNewContextIds.find(Id);
1527 NewId != OldToNewContextIds.end())
1528 NewIds.insert_range(NewId->second);
1529 return NewIds;
1530 };
1531
1532 // Recursively update context ids sets along caller edges.
1533 auto UpdateCallers = [&](ContextNode *Node,
1534 DenseSet<const ContextEdge *> &Visited,
1535 auto &&UpdateCallers) -> void {
1536 for (const auto &Edge : Node->CallerEdges) {
1537 auto Inserted = Visited.insert(Edge.get());
1538 if (!Inserted.second)
1539 continue;
1540 ContextNode *NextNode = Edge->Caller;
1541 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
1542 // Only need to recursively iterate to NextNode via this caller edge if
1543 // it resulted in any added ids to NextNode.
1544 if (!NewIdsToAdd.empty()) {
1545 Edge->getContextIds().insert_range(NewIdsToAdd);
1546 UpdateCallers(NextNode, Visited, UpdateCallers);
1547 }
1548 }
1549 };
1550
1551 DenseSet<const ContextEdge *> Visited;
1552 for (auto &Entry : AllocationCallToContextNodeMap) {
1553 auto *Node = Entry.second;
1554 UpdateCallers(Node, Visited, UpdateCallers);
1555 }
1556}
1557
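// Connect NewNode to the relevant callees (TowardsCallee == true) or callers
// of OrigNode, moving the given RemainingContextIds from OrigNode's existing
// edges onto new edges terminating at NewNode. For example, if an original
// caller edge carries context ids {1, 2, 3} and RemainingContextIds is {2},
// a new edge carrying {2} is created between NewNode and that caller, the
// original edge keeps {1, 3}, and any original edge left without context ids
// is removed from the graph.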
1558template <typename DerivedCCG, typename FuncTy, typename CallTy>
1559void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
1560 ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee,
1561 // This must be passed by value to make a copy since it will be adjusted
1562 // as ids are moved.
1563 DenseSet<uint32_t> RemainingContextIds) {
1564 auto &OrigEdges =
1565 TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
1566 DenseSet<uint32_t> RecursiveContextIds;
1567 DenseSet<uint32_t> AllCallerContextIds;
1568 if (AllowRecursiveCallsites) {
1569 // Identify which context ids are recursive, which is needed to properly
1570 // update the RemainingContextIds set. The relevant recursive context ids
1571 // are those that are in multiple edges.
1572 for (auto &CE : OrigEdges) {
1573 AllCallerContextIds.reserve(CE->getContextIds().size());
1574 for (auto Id : CE->getContextIds())
1575 if (!AllCallerContextIds.insert(Id).second)
1576 RecursiveContextIds.insert(Id);
1577 }
1578 }
1579 // Increment iterator in loop so that we can remove edges as needed.
1580 for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
1581 auto Edge = *EI;
1582 DenseSet<uint32_t> NewEdgeContextIds;
1583 DenseSet<uint32_t> NotFoundContextIds;
1584 // Remove any matching context ids from Edge and return the set that were
1585 // found and removed; these become the new edge's context ids. Also update
1586 // the remaining (not found) ids.
1587 set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
1588 NotFoundContextIds);
1589 // Update the remaining context ids set for the later edges. This is a
1590 // compile time optimization.
1591 if (RecursiveContextIds.empty()) {
1592 // No recursive ids, so all of the previously remaining context ids that
1593 // were not seen on this edge are the new remaining set.
1594 RemainingContextIds.swap(NotFoundContextIds);
1595 } else {
1596 // Keep the recursive ids in the remaining set as we expect to see those
1597 // on another edge. We can remove the non-recursive remaining ids that
1598 // were seen on this edge, however. We already have the set of remaining
1599 // ids that were on this edge (in NewEdgeContextIds). Figure out which are
1600 // non-recursive and only remove those. Note that despite the higher
1601 // overhead of updating the remaining context ids set when recursion
1602 // handling is enabled, it was found to be at worst performance neutral
1603 // and in one case a clear win.
1604 DenseSet<uint32_t> NonRecursiveRemainingCurEdgeIds =
1605 set_difference(NewEdgeContextIds, RecursiveContextIds);
1606 set_subtract(RemainingContextIds, NonRecursiveRemainingCurEdgeIds);
1607 }
1608 // If no matching context ids for this edge, skip it.
1609 if (NewEdgeContextIds.empty()) {
1610 ++EI;
1611 continue;
1612 }
1613 if (TowardsCallee) {
1614 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1615 auto NewEdge = std::make_shared<ContextEdge>(
1616 Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds));
1617 NewNode->CalleeEdges.push_back(NewEdge);
1618 NewEdge->Callee->CallerEdges.push_back(NewEdge);
1619 } else {
1620 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1621 auto NewEdge = std::make_shared<ContextEdge>(
1622 NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds));
1623 NewNode->CallerEdges.push_back(NewEdge);
1624 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
1625 }
1626 // Remove old edge if context ids empty.
1627 if (Edge->getContextIds().empty()) {
1628 removeEdgeFromGraph(Edge.get(), &EI, TowardsCallee);
1629 continue;
1630 }
1631 ++EI;
1632 }
1633}
1634
1635template <typename DerivedCCG, typename FuncTy, typename CallTy>
1636static void checkEdge(
1637 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
1638 // Confirm that alloc type is not None and that we have at least one context
1639 // id.
1640 assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
1641 assert(!Edge->ContextIds.empty());
1642}
1643
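// Debug consistency check for a node: optionally checks each caller and
// callee edge via checkEdge, verifies that the node's context ids are the
// union of its callee edges' ids and a superset of its caller edges' ids
// (some contexts may terminate at this node), and verifies that there are no
// duplicate edges to the same callee. The id-set checks are relaxed when
// recursive callsites/contexts are allowed, since incompletely cloned
// recursive cycles can violate them.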
1644template <typename DerivedCCG, typename FuncTy, typename CallTy>
1645static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
1646 bool CheckEdges = true) {
1647 if (Node->isRemoved())
1648 return;
1649#ifndef NDEBUG
1650 // Compute node's context ids once for use in asserts.
1651 auto NodeContextIds = Node->getContextIds();
1652#endif
1653 // Node's context ids should be the union of both its callee and caller edge
1654 // context ids.
1655 if (Node->CallerEdges.size()) {
1656 DenseSet<uint32_t> CallerEdgeContextIds(
1657 Node->CallerEdges.front()->ContextIds);
1658 for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) {
1659 if (CheckEdges)
1660 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1661 set_union(CallerEdgeContextIds, Edge->ContextIds);
1662 }
1663 // Node can have more context ids than callers if some contexts terminate at
1664 // node and some are longer. If we are allowing recursive callsites and
1665 // contexts this will be violated for incompletely cloned recursive cycles,
1666 // so skip the checking in that case.
1667 assert(AllowRecursiveCallsites || AllowRecursiveContexts ||
1668 NodeContextIds == CallerEdgeContextIds ||
1669 set_is_subset(CallerEdgeContextIds, NodeContextIds));
1670 }
1671 if (Node->CalleeEdges.size()) {
1672 DenseSet<uint32_t> CalleeEdgeContextIds(
1673 Node->CalleeEdges.front()->ContextIds);
1674 for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) {
1675 if (CheckEdges)
1676 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1677 set_union(CalleeEdgeContextIds, Edge->getContextIds());
1678 }
1679 // If we are allowing recursive callsites and contexts this will be violated
1680 // for incompletely cloned recursive cycles, so skip the checking in that
1681 // case.
1682 assert(AllowRecursiveCallsites || AllowRecursiveContexts ||
1683 NodeContextIds == CalleeEdgeContextIds);
1684 }
1685 // FIXME: Since this checking is only invoked under an option, we should
1686 // change the error checking from using assert to something that will trigger
1687 // an error on a release build.
1688#ifndef NDEBUG
1689 // Make sure we don't end up with duplicate edges between the same caller and
1690 // callee.
1691 DenseSet<ContextNode<DerivedCCG, FuncTy, CallTy> *> NodeSet;
1692 for (const auto &E : Node->CalleeEdges)
1693 NodeSet.insert(E->Callee);
1694 assert(NodeSet.size() == Node->CalleeEdges.size());
1695#endif
1696}
1697
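// Post-order traversal from an allocation node over caller edges. For each
// visited stack node that is the last (outermost) id recorded for calls in
// StackIdToMatchingCalls, either assign the existing node directly to the
// call (single call with a single stack id), or create a new ContextNode
// covering the call's full inlined stack id sequence and move the matching
// context ids from the original per-stack-id nodes and edges onto it.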
1698template <typename DerivedCCG, typename FuncTy, typename CallTy>
1699void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1700 assignStackNodesPostOrder(ContextNode *Node,
1701 DenseSet<const ContextNode *> &Visited,
1702 DenseMap<uint64_t, std::vector<CallContextInfo>>
1703 &StackIdToMatchingCalls,
1704 DenseMap<CallInfo, CallInfo> &CallToMatchingCall,
1705 const DenseSet<uint32_t> &ImportantContextIds) {
1706 auto Inserted = Visited.insert(Node);
1707 if (!Inserted.second)
1708 return;
1709 // Post order traversal. Iterate over a copy since we may add nodes and
1710 // therefore new callers during the recursive call, invalidating any
1711 // iterator over the original edge vector. We don't need to process these
1712 // new nodes as they were already processed on creation.
1713 auto CallerEdges = Node->CallerEdges;
1714 for (auto &Edge : CallerEdges) {
1715 // Skip any that have been removed during the recursion.
1716 if (Edge->isRemoved()) {
1717 assert(!is_contained(Node->CallerEdges, Edge));
1718 continue;
1719 }
1720 assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls,
1721 CallToMatchingCall, ImportantContextIds);
1722 }
1723
1724 // If this node's stack id is in the map, update the graph to contain new
1725 // nodes representing any inlining at interior callsites. Note we move the
1726 // associated context ids over to the new nodes.
1727
1728 // Ignore this node if it is for an allocation or we didn't record any
1729 // stack id lists ending at it.
1730 if (Node->IsAllocation ||
1731 !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
1732 return;
1733
1734 auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
1735 // Handle the simple case first. A single call with a single stack id.
1736 // In this case there is no need to create any new context nodes, simply
1737 // assign the context node for stack id to this Call.
1738 if (Calls.size() == 1) {
1739 auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
1740 if (Ids.size() == 1) {
1741 assert(SavedContextIds.empty());
1742 // It should be this Node
1743 assert(Node == getNodeForStackId(Ids[0]));
1744 if (Node->Recursive)
1745 return;
1746 Node->setCall(Call);
1747 NonAllocationCallToContextNodeMap[Call] = Node;
1748 NodeToCallingFunc[Node] = Func;
1749 recordStackNode(Ids, Node, Node->getContextIds(), ImportantContextIds);
1750 return;
1751 }
1752 }
1753
1754#ifndef NDEBUG
1755 // Find the node for the last stack id, which should be the same
1756 // across all calls recorded for this id, and is this node's id.
1757 uint64_t LastId = Node->OrigStackOrAllocId;
1758 ContextNode *LastNode = getNodeForStackId(LastId);
1759 // We should only have kept stack ids that had nodes.
1760 assert(LastNode);
1761 assert(LastNode == Node);
1762#else
1763 ContextNode *LastNode = Node;
1764#endif
1765
1766 // Compute the last node's context ids once, as it is shared by all calls in
1767 // this entry.
1768 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1769
1770 [[maybe_unused]] bool PrevIterCreatedNode = false;
1771 bool CreatedNode = false;
1772 for (unsigned I = 0; I < Calls.size();
1773 I++, PrevIterCreatedNode = CreatedNode) {
1774 CreatedNode = false;
1775 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1776 // Skip any for which we didn't assign any ids, these don't get a node in
1777 // the graph.
1778 if (SavedContextIds.empty()) {
1779 // If this call has a matching call (located in the same function and
1780 // having the same stack ids), simply add it to the context node created
1781 // for its matching call earlier. These can be treated the same through
1782 // cloning and get updated at the same time.
1783 if (!CallToMatchingCall.contains(Call))
1784 continue;
1785 auto MatchingCall = CallToMatchingCall[Call];
1786 if (!NonAllocationCallToContextNodeMap.contains(MatchingCall)) {
1787 // This should only happen if we had a prior iteration, and it didn't
1788 // create a node because of the below recomputation of context ids
1789 // finding none remaining and continuing early.
1790 assert(I > 0 && !PrevIterCreatedNode);
1791 continue;
1792 }
1793 NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back(
1794 Call);
1795 continue;
1796 }
1797
1798 assert(LastId == Ids.back());
1799
1800 // Recompute the context ids for this stack id sequence (the
1801 // intersection of the context ids of the corresponding nodes).
1802 // Start with the ids we saved in the map for this call, which could be
1803 // duplicated context ids. We have to recompute as we might have overlap
1804 // between the saved context ids for different last nodes, and
1805 // removed them already during the post order traversal.
1806 set_intersect(SavedContextIds, LastNodeContextIds);
1807 ContextNode *PrevNode = LastNode;
1808 bool Skip = false;
1809 // Iterate backwards through the stack Ids, starting after the last Id
1810 // in the list, which was handled once outside for all Calls.
1811 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1812 auto Id = *IdIter;
1813 ContextNode *CurNode = getNodeForStackId(Id);
1814 // We should only have kept stack ids that had nodes and weren't
1815 // recursive.
1816 assert(CurNode);
1817 assert(!CurNode->Recursive);
1818
1819 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1820 if (!Edge) {
1821 Skip = true;
1822 break;
1823 }
1824 PrevNode = CurNode;
1825
1826 // Update the context ids, which is the intersection of the ids along
1827 // all edges in the sequence.
1828 set_intersect(SavedContextIds, Edge->getContextIds());
1829
1830 // If we now have no context ids for clone, skip this call.
1831 if (SavedContextIds.empty()) {
1832 Skip = true;
1833 break;
1834 }
1835 }
1836 if (Skip)
1837 continue;
1838
1839 // Create new context node.
1840 ContextNode *NewNode = createNewNode(/*IsAllocation=*/false, Func, Call);
1841 NonAllocationCallToContextNodeMap[Call] = NewNode;
1842 CreatedNode = true;
1843 NewNode->AllocTypes = computeAllocType(SavedContextIds);
1844
1845 ContextNode *FirstNode = getNodeForStackId(Ids[0]);
1846 assert(FirstNode);
1847
1848 // Connect to callees of innermost stack frame in inlined call chain.
1849 // This updates context ids for FirstNode's callees to reflect those
1850 // moved to NewNode.
1851 connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true, SavedContextIds);
1852
1853 // Connect to callers of outermost stack frame in inlined call chain.
1854 // This updates context ids for LastNode's callers to reflect those
1855 // moved to NewNode.
1856 connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false, SavedContextIds);
1857
1858 // Now we need to remove context ids from edges/nodes between First and
1859 // Last Node.
1860 PrevNode = nullptr;
1861 for (auto Id : Ids) {
1862 ContextNode *CurNode = getNodeForStackId(Id);
1863 // We should only have kept stack ids that had nodes.
1864 assert(CurNode);
1865
1866 // Remove the context ids moved to NewNode from CurNode, and the
1867 // edge from the prior node.
1868 if (PrevNode) {
1869 auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
1870 // If the sequence contained recursion, we might have already removed
1871 // some edges during the connectNewNode calls above.
1872 if (!PrevEdge) {
1873 PrevNode = CurNode;
1874 continue;
1875 }
1876 set_subtract(PrevEdge->getContextIds(), SavedContextIds);
1877 if (PrevEdge->getContextIds().empty())
1878 removeEdgeFromGraph(PrevEdge);
1879 }
1880 // Since we update the edges from leaf to tail, only look at the callee
1881 // edges. This isn't an alloc node, so if there are no callee edges, the
1882 // alloc type is None.
1883 CurNode->AllocTypes = CurNode->CalleeEdges.empty()
1884 ? (uint8_t)AllocationType::None
1885 : CurNode->computeAllocType();
1886 PrevNode = CurNode;
1887 }
1888
1889 recordStackNode(Ids, NewNode, SavedContextIds, ImportantContextIds);
1890
1891 if (VerifyNodes) {
1892 checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true);
1893 for (auto Id : Ids) {
1894 ContextNode *CurNode = getNodeForStackId(Id);
1895 // We should only have kept stack ids that had nodes.
1896 assert(CurNode);
1897 checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true);
1898 }
1899 }
1900 }
1901}
1902
1903template <typename DerivedCCG, typename FuncTy, typename CallTy>
1904void CallsiteContextGraph<DerivedCCG, FuncTy,
1905 CallTy>::fixupImportantContexts() {
1906 if (ImportantContextIdInfo.empty())
1907 return;
1908
1909 // Update statistics as we are done building this map at this point.
1910 NumImportantContextIds = ImportantContextIdInfo.size();
1911
1913 return;
1914
1915 if (ExportToDot)
1916 exportToDot("beforestackfixup");
1917
1918 // For each context we identified as important, walk through the saved context
1919 // stack ids in order from leaf upwards, and make sure all edges are correct.
1920 // These can be difficult to get right when updating the graph while mapping
1921 // nodes onto summary or IR, especially when there is recursion. In
1922 // particular, when we have created new nodes to reflect inlining, it is
1923 // sometimes impossible to know exactly how to update the edges in the face of
1924 // recursion, as we have lost the original ordering of the stack ids in the
1925 // contexts.
1926 // TODO: Consider only doing this if we detect the context has recursive
1927 // cycles.
1928 //
1929 // I.e. assume we have a context with stack ids like: {A B A C A D E}
1930 // and let's say A was inlined into B, C, and D. The original graph will have
1931 // multiple recursive cycles through A. When we match the original context
1932 // nodes onto the IR or summary, we will merge {A B} into one context node,
1933 // {A C} onto another, and {A D} onto another. Looking at the stack sequence
1934 // above, we should end up with a non-cyclic set of edges like:
1935 // {AB} <- {AC} <- {AD} <- E. However, because we normally have lost the
1936 // original ordering, we won't get the edges correct initially (it's
1937 // impossible without the original ordering). Here we do the fixup (add and
1938 // removing edges where necessary) for this context. In the
1939 // ImportantContextInfo struct in this case we should have a MaxLength = 2,
1940 // and map entries for {A B}, {A C}, {A D}, and {E}.
1941 for (auto &[CurContextId, Info] : ImportantContextIdInfo) {
1942 if (Info.StackIdsToNode.empty())
1943 continue;
1944 bool Changed = false;
1945 ContextNode *PrevNode = nullptr;
1946 ContextNode *CurNode = nullptr;
1947 DenseSet<const ContextEdge *> VisitedEdges;
1948 ArrayRef<uint64_t> AllStackIds(Info.StackIds);
1949 // Try to identify what callsite ContextNode maps to which slice of the
1950 // context's ordered stack ids.
1951 for (unsigned I = 0; I < AllStackIds.size(); I++, PrevNode = CurNode) {
1952 // We will do this greedily, trying up to MaxLength stack ids in a row, to
1953 // see if we recorded a context node for that sequence.
1954 auto Len = Info.MaxLength;
1955 auto LenToEnd = AllStackIds.size() - I;
1956 if (Len > LenToEnd)
1957 Len = LenToEnd;
1958 CurNode = nullptr;
1959 // Try to find a recorded context node starting with the longest length
1960 // recorded, and on down until we check for just a single stack node.
1961 for (; Len > 0; Len--) {
1962 // Get the slice of the original stack id sequence to check.
1963 auto CheckStackIds = AllStackIds.slice(I, Len);
1964 auto EntryIt = Info.StackIdsToNode.find(CheckStackIds);
1965 if (EntryIt == Info.StackIdsToNode.end())
1966 continue;
1967 CurNode = EntryIt->second;
1968 // Skip forward so we don't try to look for the ones we just matched.
1969 // We increment by Len - 1, because the outer for loop will increment I.
1970 I += Len - 1;
1971 break;
1972 }
1973 // Give up if we couldn't find a node. Since we need to clone from the
1974 // leaf allocation upwards, no sense in doing any more fixup further up
1975 // the context if we couldn't match part of the original stack context
1976 // onto a callsite node.
1977 if (!CurNode)
1978 break;
1979 // No edges to fix up until we have a pair of nodes that should be
1980 // adjacent in the graph.
1981 if (!PrevNode)
1982 continue;
1983 // See if we already have a call edge from CurNode to PrevNode.
1984 auto *CurEdge = PrevNode->findEdgeFromCaller(CurNode);
1985 if (CurEdge) {
1986 // We already have an edge. Make sure it contains this context id.
1987 if (CurEdge->getContextIds().insert(CurContextId).second) {
1988 NumFixupEdgeIdsInserted++;
1989 Changed = true;
1990 }
1991 } else {
1992 // No edge exists - add one.
1993 NumFixupEdgesAdded++;
1994 DenseSet<uint32_t> ContextIds({CurContextId});
1995 auto AllocType = computeAllocType(ContextIds);
1996 auto NewEdge = std::make_shared<ContextEdge>(
1997 PrevNode, CurNode, AllocType, std::move(ContextIds));
1998 PrevNode->CallerEdges.push_back(NewEdge);
1999 CurNode->CalleeEdges.push_back(NewEdge);
2000 // Save the new edge for the below handling.
2001 CurEdge = NewEdge.get();
2002 Changed = true;
2003 }
2004 VisitedEdges.insert(CurEdge);
2005 // Now remove this context id from any other caller edges calling
2006 // PrevNode.
2007 for (auto &Edge : PrevNode->CallerEdges) {
2008 // Skip the edge updating/created above and edges we have already
2009 // visited (due to recursion).
2010 if (Edge.get() != CurEdge && !VisitedEdges.contains(Edge.get()))
2011 Edge->getContextIds().erase(CurContextId);
2012 }
2013 }
2014 if (Changed)
2015 NumFixedContexts++;
2016 }
2017}
2018
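// Match the stack ids recorded while building the allocation contexts to the
// actual callsites with metadata/summaries. A single call may cover several
// stack ids due to inlining, and several calls may share an identical id
// sequence, in which case duplicate context ids are synthesized so that calls
// in different functions can be cloned independently. The computed ids are
// then applied by assignStackNodesPostOrder, and the edges of any recorded
// "important" contexts are repaired by fixupImportantContexts.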
2019template <typename DerivedCCG, typename FuncTy, typename CallTy>
2020void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
2021 // Map of stack id to all calls with that as the last (outermost caller)
2022 // callsite id that has a context node (some might not due to pruning
2023 // performed during matching of the allocation profile contexts).
2024 // The CallContextInfo contains the Call and a list of its stack ids with
2025 // ContextNodes, the function containing Call, and the set of context ids
2026 // the analysis will eventually identify for use in any new node created
2027 // for that callsite.
2028 DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
2029 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
2030 for (auto &Call : CallsWithMetadata) {
2031 // Ignore allocations, already handled.
2032 if (AllocationCallToContextNodeMap.count(Call))
2033 continue;
2034 auto StackIdsWithContextNodes =
2035 getStackIdsWithContextNodesForCall(Call.call());
2036 // If there were no nodes created for MIBs on allocs (maybe this was in
2037 // the unambiguous part of the MIB stack that was pruned), ignore.
2038 if (StackIdsWithContextNodes.empty())
2039 continue;
2040 // Otherwise, record this Call along with the list of ids for the last
2041 // (outermost caller) stack id with a node.
2042 StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
2043 {Call.call(), StackIdsWithContextNodes, Func, {}});
2044 }
2045 }
2046
2047 // First make a pass through all stack ids that correspond to a call,
2048 // as identified in the above loop. Compute the context ids corresponding to
2049 // each of these calls when they correspond to multiple stack ids due to
2050 // inlining. Perform any duplication of context ids required when
2051 // there is more than one call with the same stack ids. Their (possibly newly
2052 // duplicated) context ids are saved in the StackIdToMatchingCalls map.
2053 DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
2054 // Save a map from each call to any that are found to match it. I.e. located
2055 // in the same function and have the same (possibly pruned) stack ids. We use
2056 // this to avoid creating extra graph nodes as they can be treated the same.
2057 DenseMap<CallInfo, CallInfo> CallToMatchingCall;
2058 for (auto &It : StackIdToMatchingCalls) {
2059 auto &Calls = It.getSecond();
2060 // Skip single calls with a single stack id. These don't need a new node.
2061 if (Calls.size() == 1) {
2062 auto &Ids = Calls[0].StackIds;
2063 if (Ids.size() == 1)
2064 continue;
2065 }
2066 // In order to do the best and maximal matching of inlined calls to context
2067 // node sequences we will sort the vectors of stack ids in descending order
2068 // of length, and within each length, lexicographically by stack id. The
2069 // latter is so that we can specially handle calls that have identical stack
2070 // id sequences (either due to cloning or artificially because of the MIB
2071 // context pruning). Those with the same Ids are then sorted by function to
2072 // facilitate efficiently mapping them to the same context node.
2073 // Because the functions are pointers, to ensure a stable sort first assign
2074 // each function pointer to its first index in the Calls array, and then use
2075 // that to sort by.
2076 DenseMap<const FuncTy *, unsigned> FuncToIndex;
2077 for (const auto &[Idx, CallCtxInfo] : enumerate(Calls))
2078 FuncToIndex.insert({CallCtxInfo.Func, Idx});
2079 llvm::stable_sort(
2080 Calls,
2081 [&FuncToIndex](const CallContextInfo &A, const CallContextInfo &B) {
2082 return A.StackIds.size() > B.StackIds.size() ||
2083 (A.StackIds.size() == B.StackIds.size() &&
2084 (A.StackIds < B.StackIds ||
2085 (A.StackIds == B.StackIds &&
2086 FuncToIndex[A.Func] < FuncToIndex[B.Func])));
2087 });
2088
2089 // Find the node for the last stack id, which should be the same
2090 // across all calls recorded for this id, and is the id for this
2091 // entry in the StackIdToMatchingCalls map.
2092 uint64_t LastId = It.getFirst();
2093 ContextNode *LastNode = getNodeForStackId(LastId);
2094 // We should only have kept stack ids that had nodes.
2095 assert(LastNode);
2096
2097 if (LastNode->Recursive)
2098 continue;
2099
2100 // Initialize the context ids with the last node's. We will subsequently
2101 // refine the context ids by computing the intersection along all edges.
2102 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
2103 assert(!LastNodeContextIds.empty());
2104
2105#ifndef NDEBUG
2106 // Save the set of functions seen for a particular set of the same stack
2107 // ids. This is used to ensure that they have been correctly sorted to be
2108 // adjacent in the Calls list, since we rely on that to efficiently place
2109 // all such matching calls onto the same context node.
2110 DenseSet<const FuncTy *> MatchingIdsFuncSet;
2111#endif
2112
2113 for (unsigned I = 0; I < Calls.size(); I++) {
2114 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
2115 assert(SavedContextIds.empty());
2116 assert(LastId == Ids.back());
2117
2118#ifndef NDEBUG
2119 // If this call has a different set of ids than the last one, clear the
2120 // set used to ensure they are sorted properly.
2121 if (I > 0 && Ids != Calls[I - 1].StackIds)
2122 MatchingIdsFuncSet.clear();
2123#endif
2124
2125 // First compute the context ids for this stack id sequence (the
2126 // intersection of the context ids of the corresponding nodes).
2127 // Start with the remaining saved ids for the last node.
2128 assert(!LastNodeContextIds.empty());
2129 DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
2130
2131 ContextNode *PrevNode = LastNode;
2132 ContextNode *CurNode = LastNode;
2133 bool Skip = false;
2134
2135 // Iterate backwards through the stack Ids, starting after the last Id
2136 // in the list, which was handled once outside for all Calls.
2137 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
2138 auto Id = *IdIter;
2139 CurNode = getNodeForStackId(Id);
2140 // We should only have kept stack ids that had nodes.
2141 assert(CurNode);
2142
2143 if (CurNode->Recursive) {
2144 Skip = true;
2145 break;
2146 }
2147
2148 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
2149 // If there is no edge then the nodes belong to different MIB contexts,
2150 // and we should skip this inlined context sequence. For example, this
2151 // particular inlined context may include stack ids A->B, and we may
2152 // indeed have nodes for both A and B, but it is possible that they were
2153 // never profiled in sequence in a single MIB for any allocation (i.e.
2154 // we might have profiled an allocation that involves the callsite A,
2155 // but through a different one of its callee callsites, and we might
2156 // have profiled an allocation that involves callsite B, but reached
2157 // from a different caller callsite).
2158 if (!Edge) {
2159 Skip = true;
2160 break;
2161 }
2162 PrevNode = CurNode;
2163
2164 // Update the context ids, which is the intersection of the ids along
2165 // all edges in the sequence.
2166 set_intersect(StackSequenceContextIds, Edge->getContextIds());
2167
2168 // If we now have no context ids for clone, skip this call.
2169 if (StackSequenceContextIds.empty()) {
2170 Skip = true;
2171 break;
2172 }
2173 }
2174 if (Skip)
2175 continue;
2176
2177 // If some of this call's stack ids did not have corresponding nodes (due
2178 // to pruning), don't include any context ids for contexts that extend
2179 // beyond these nodes. Otherwise we would be matching part of unrelated /
2180 // not fully matching stack contexts. To do this, subtract any context ids
2181 // found in caller nodes of the last node found above.
2182 if (Ids.back() != getLastStackId(Call)) {
2183 for (const auto &PE : LastNode->CallerEdges) {
2184 set_subtract(StackSequenceContextIds, PE->getContextIds());
2185 if (StackSequenceContextIds.empty())
2186 break;
2187 }
2188 // If we now have no context ids for clone, skip this call.
2189 if (StackSequenceContextIds.empty())
2190 continue;
2191 }
2192
2193#ifndef NDEBUG
2194 // If the prior call had the same stack ids this set would not be empty.
2195 // Check if we already have a call that "matches" because it is located
2196 // in the same function. If the Calls list was sorted properly we should
2197 // not encounter this situation as all such entries should be adjacent
2198 // and processed in bulk further below.
2199 assert(!MatchingIdsFuncSet.contains(Func));
2200
2201 MatchingIdsFuncSet.insert(Func);
2202#endif
2203
2204 // Check if the next set of stack ids is the same (since the Calls vector
2205 // of tuples is sorted by the stack ids we can just look at the next one).
2206 // If so, save them in the CallToMatchingCall map so that they get
2207 // assigned to the same context node, and skip them.
2208 bool DuplicateContextIds = false;
2209 for (unsigned J = I + 1; J < Calls.size(); J++) {
2210 auto &CallCtxInfo = Calls[J];
2211 auto &NextIds = CallCtxInfo.StackIds;
2212 if (NextIds != Ids)
2213 break;
2214 auto *NextFunc = CallCtxInfo.Func;
2215 if (NextFunc != Func) {
2216 // We have another Call with the same ids but that cannot share this
2217 // node, must duplicate ids for it.
2218 DuplicateContextIds = true;
2219 break;
2220 }
2221 auto &NextCall = CallCtxInfo.Call;
2222 CallToMatchingCall[NextCall] = Call;
2223 // Update I so that it gets incremented correctly to skip this call.
2224 I = J;
2225 }
2226
2227 // If we don't have duplicate context ids, then we can assign all the
2228 // context ids computed for the original node sequence to this call.
2229 // If there are duplicate calls with the same stack ids then we synthesize
2230 // new context ids that are duplicates of the originals. These are
2231 // assigned to SavedContextIds, which is a reference into the map entry
2232 // for this call, allowing us to access these ids later on.
2233 OldToNewContextIds.reserve(OldToNewContextIds.size() +
2234 StackSequenceContextIds.size());
2235 SavedContextIds =
2236 DuplicateContextIds
2237 ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
2238 : StackSequenceContextIds;
2239 assert(!SavedContextIds.empty());
2240
2241 if (!DuplicateContextIds) {
2242 // Update saved last node's context ids to remove those that are
2243 // assigned to other calls, so that it is ready for the next call at
2244 // this stack id.
2245 set_subtract(LastNodeContextIds, StackSequenceContextIds);
2246 if (LastNodeContextIds.empty())
2247 break;
2248 }
2249 }
2250 }
2251
2252 // Propagate the duplicate context ids over the graph.
2253 propagateDuplicateContextIds(OldToNewContextIds);
2254
2255 if (VerifyCCG)
2256 check();
2257
2258 // Now perform a post-order traversal over the graph, starting with the
2259 // allocation nodes, essentially processing nodes from callers to callees.
2260 // For any that contains an id in the map, update the graph to contain new
2261 // nodes representing any inlining at interior callsites. Note we move the
2262 // associated context ids over to the new nodes.
2263 DenseSet<const ContextNode *> Visited;
2264 DenseSet<uint32_t> ImportantContextIds(llvm::from_range,
2265 ImportantContextIdInfo.keys());
2266 for (auto &Entry : AllocationCallToContextNodeMap)
2267 assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls,
2268 CallToMatchingCall, ImportantContextIds);
2269
2270 fixupImportantContexts();
2271
2272 if (VerifyCCG)
2273 check();
2274}
2275
2276uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
2277 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
2278 Call->getMetadata(LLVMContext::MD_callsite));
2279 return CallsiteContext.back();
2280}
2281
2282 uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
2283 assert(isa<CallsiteInfo *>(Call));
2284 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
2285 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
2286 // Need to convert index into stack id.
2287 return Index.getStackIdAtIndex(CallsiteContext.back());
2288}
2289
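// Function clones created by this pass are named by appending this suffix and
// the clone number to the original name, e.g. clone 2 of "foo" becomes
// "foo.memprof.2". Clone number 0 denotes the original function, which keeps
// its name.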
2290static const std::string MemProfCloneSuffix = ".memprof.";
2291
2292static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
2293 // We use CloneNo == 0 to refer to the original version, which doesn't get
2294 // renamed with a suffix.
2295 if (!CloneNo)
2296 return Base.str();
2297 return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
2298}
2299
2300static bool isMemProfClone(const Function &F) {
2301 return F.getName().contains(MemProfCloneSuffix);
2302}
2303
2304// Return the clone number of the given function by extracting it from the
2305// memprof suffix. Assumes the caller has already confirmed it is a memprof
2306// clone.
2307 static unsigned getMemProfCloneNum(const Function &F) {
2308 assert(isMemProfClone(F));
2309 auto Pos = F.getName().find_last_of('.');
2310 assert(Pos > 0);
2311 unsigned CloneNo;
2312 bool Err = F.getName().drop_front(Pos + 1).getAsInteger(10, CloneNo);
2313 assert(!Err);
2314 (void)Err;
2315 return CloneNo;
2316}
2317
2318std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
2319 const Instruction *Call,
2320 unsigned CloneNo) const {
2321 return (Twine(Call->getFunction()->getName()) + " -> " +
2322 cast<CallBase>(Call)->getCalledFunction()->getName())
2323 .str();
2324}
2325
2326std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
2327 const IndexCall &Call,
2328 unsigned CloneNo) const {
2329 auto VI = FSToVIMap.find(Func);
2330 assert(VI != FSToVIMap.end());
2331 std::string CallerName = getMemProfFuncName(VI->second.name(), CloneNo);
2332 if (isa<AllocInfo *>(Call))
2333 return CallerName + " -> alloc";
2334 else {
2335 auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call);
2336 return CallerName + " -> " +
2337 getMemProfFuncName(Callsite->Callee.name(),
2338 Callsite->Clones[CloneNo]);
2339 }
2340}
2341
2342std::vector<uint64_t>
2343ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
2344 Instruction *Call) {
2345 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
2346 Call->getMetadata(LLVMContext::MD_callsite));
2347 return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
2348 CallsiteContext);
2349}
2350
2351std::vector<uint64_t>
2352 IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
2353 assert(isa<CallsiteInfo *>(Call));
2354 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
2355 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
2356 return getStackIdsWithContextNodes<CallsiteInfo,
2357 SmallVector<unsigned>::const_iterator>(
2358 CallsiteContext);
2359}
2360
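// Walk the callsite context from the innermost frame outwards and return the
// prefix of stack ids that have ContextNodes in the graph, stopping at the
// first id without a node (e.g. because it was in an unambiguous portion of
// the MIB context that was pruned during matching of the allocation
// profiles).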
2361template <typename DerivedCCG, typename FuncTy, typename CallTy>
2362template <class NodeT, class IteratorT>
2363std::vector<uint64_t>
2364CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
2365 CallStack<NodeT, IteratorT> &CallsiteContext) {
2366 std::vector<uint64_t> StackIds;
2367 for (auto IdOrIndex : CallsiteContext) {
2368 auto StackId = getStackId(IdOrIndex);
2369 ContextNode *Node = getNodeForStackId(StackId);
2370 if (!Node)
2371 break;
2372 StackIds.push_back(StackId);
2373 }
2374 return StackIds;
2375}
2376
2377ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
2378 Module &M,
2379 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
2380 : Mod(M), OREGetter(OREGetter) {
2381 // Map for keeping track of the largest cold contexts up to the number given
2382 // by MemProfTopNImportant. Must be a std::map (not DenseMap) because keys
2383 // must be sorted.
2384 std::map<uint64_t, uint32_t> TotalSizeToContextIdTopNCold;
2385 for (auto &F : M) {
2386 std::vector<CallInfo> CallsWithMetadata;
2387 for (auto &BB : F) {
2388 for (auto &I : BB) {
2389 if (!isa<CallBase>(I))
2390 continue;
2391 if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
2392 CallsWithMetadata.push_back(&I);
2393 auto *AllocNode = addAllocNode(&I, &F);
2394 auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
2395 assert(CallsiteMD);
2396 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
2397 // Add all of the MIBs and their stack nodes.
2398 for (auto &MDOp : MemProfMD->operands()) {
2399 auto *MIBMD = cast<const MDNode>(MDOp);
2400 std::vector<ContextTotalSize> ContextSizeInfo;
2401 // Collect the context size information if it exists.
2402 if (MIBMD->getNumOperands() > 2) {
2403 for (unsigned I = 2; I < MIBMD->getNumOperands(); I++) {
2404 MDNode *ContextSizePair =
2405 dyn_cast<MDNode>(MIBMD->getOperand(I));
2406 assert(ContextSizePair->getNumOperands() == 2);
2407 uint64_t FullStackId = mdconst::dyn_extract<ConstantInt>(
2408 ContextSizePair->getOperand(0))
2409 ->getZExtValue();
2410 uint64_t TotalSize = mdconst::dyn_extract<ConstantInt>(
2411 ContextSizePair->getOperand(1))
2412 ->getZExtValue();
2413 ContextSizeInfo.push_back({FullStackId, TotalSize});
2414 }
2415 }
2416 MDNode *StackNode = getMIBStackNode(MIBMD);
2417 assert(StackNode);
2418 CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
2419 addStackNodesForMIB<MDNode, MDNode::op_iterator>(
2420 AllocNode, StackContext, CallsiteContext,
2421 getMIBAllocType(MIBMD), ContextSizeInfo,
2422 TotalSizeToContextIdTopNCold);
2423 }
2424 // If exporting the graph to dot and an allocation id of interest was
2425 // specified, record all the context ids for this allocation node.
2426 if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
2427 DotAllocContextIds = AllocNode->getContextIds();
2428 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2429 // Memprof and callsite metadata on memory allocations no longer
2430 // needed.
2431 I.setMetadata(LLVMContext::MD_memprof, nullptr);
2432 I.setMetadata(LLVMContext::MD_callsite, nullptr);
2433 }
2434 // For callsite metadata, add to list for this function for later use.
2435 else if (I.getMetadata(LLVMContext::MD_callsite)) {
2436 CallsWithMetadata.push_back(&I);
2437 }
2438 }
2439 }
2440 if (!CallsWithMetadata.empty())
2441 FuncToCallsWithMetadata[&F] = CallsWithMetadata;
2442 }
2443
2444 if (DumpCCG) {
2445 dbgs() << "CCG before updating call stack chains:\n";
2446 dbgs() << *this;
2447 }
2448
2449 if (ExportToDot)
2450 exportToDot("prestackupdate");
2451
2452 updateStackNodes();
2453
2454 if (ExportToDot)
2455 exportToDot("poststackupdate");
2456
2457 handleCallsitesWithMultipleTargets();
2458
2459 markBackedges();
2460
2461 // Strip off remaining callsite metadata, no longer needed.
2462 for (auto &FuncEntry : FuncToCallsWithMetadata)
2463 for (auto &Call : FuncEntry.second)
2464 Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
2465}
2466
2467IndexCallsiteContextGraph::IndexCallsiteContextGraph(
2468 ModuleSummaryIndex &Index,
2469 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
2470 isPrevailing)
2471 : Index(Index), isPrevailing(isPrevailing) {
2472 // Map for keeping track of the largest cold contexts up to the number given
2473 // by MemProfTopNImportant. Must be a std::map (not DenseMap) because keys
2474 // must be sorted.
2475 std::map<uint64_t, uint32_t> TotalSizeToContextIdTopNCold;
2476 for (auto &I : Index) {
2477 auto VI = Index.getValueInfo(I);
2478 for (auto &S : VI.getSummaryList()) {
2479 // We should only add the prevailing nodes. Otherwise we may try to clone
2480 // in a weak copy that won't be linked (and may be different than the
2481 // prevailing version).
2482 // We only keep the memprof summary on the prevailing copy now when
2483 // building the combined index, as a space optimization, however don't
2484 // rely on this optimization. The linker doesn't resolve local linkage
2485 // values so don't check whether those are prevailing.
2486 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2487 !isPrevailing(VI.getGUID(), S.get()))
2488 continue;
2489 auto *FS = dyn_cast<FunctionSummary>(S.get());
2490 if (!FS)
2491 continue;
2492 std::vector<CallInfo> CallsWithMetadata;
2493 if (!FS->allocs().empty()) {
2494 for (auto &AN : FS->mutableAllocs()) {
2495 // This can happen because of recursion elimination handling that
2496 // currently exists in ModuleSummaryAnalysis. Skip these for now.
2497 // We still added them to the summary because we need to be able to
2498 // correlate properly in applyImport in the backends.
2499 if (AN.MIBs.empty())
2500 continue;
2501 IndexCall AllocCall(&AN);
2502 CallsWithMetadata.push_back(AllocCall);
2503 auto *AllocNode = addAllocNode(AllocCall, FS);
2504 // Pass an empty CallStack to the CallsiteContext (second)
2505 // parameter, since for ThinLTO we already collapsed out the inlined
2506 // stack ids on the allocation call during ModuleSummaryAnalysis.
2507 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
2508 EmptyContext;
2509 unsigned I = 0;
2510 assert(!metadataMayIncludeContextSizeInfo() ||
2511 AN.ContextSizeInfos.size() == AN.MIBs.size());
2512 // Now add all of the MIBs and their stack nodes.
2513 for (auto &MIB : AN.MIBs) {
2514 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
2515 StackContext(&MIB);
2516 std::vector<ContextTotalSize> ContextSizeInfo;
2517 if (!AN.ContextSizeInfos.empty()) {
2518 for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I])
2519 ContextSizeInfo.push_back({FullStackId, TotalSize});
2520 }
2521 addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
2522 AllocNode, StackContext, EmptyContext, MIB.AllocType,
2523 ContextSizeInfo, TotalSizeToContextIdTopNCold);
2524 I++;
2525 }
2526 // If exporting the graph to dot and an allocation id of interest was
2527 // specified, record all the context ids for this allocation node.
2528 if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
2529 DotAllocContextIds = AllocNode->getContextIds();
2530 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2531 // Initialize version 0 on the summary alloc node to the current alloc
2532 // type, unless it has both types, in which case make it default, so
2533 // that in the case where we aren't able to clone, the original version
2534 // always ends up with the default allocation behavior.
2535 AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
2536 }
2537 }
2538 // For callsite metadata, add to list for this function for later use.
2539 if (!FS->callsites().empty())
2540 for (auto &SN : FS->mutableCallsites()) {
2541 IndexCall StackNodeCall(&SN);
2542 CallsWithMetadata.push_back(StackNodeCall);
2543 }
2544
2545 if (!CallsWithMetadata.empty())
2546 FuncToCallsWithMetadata[FS] = CallsWithMetadata;
2547
2548 if (!FS->allocs().empty() || !FS->callsites().empty())
2549 FSToVIMap[FS] = VI;
2550 }
2551 }
2552
2553 if (DumpCCG) {
2554 dbgs() << "CCG before updating call stack chains:\n";
2555 dbgs() << *this;
2556 }
2557
2558 if (ExportToDot)
2559 exportToDot("prestackupdate");
2560
2561 updateStackNodes();
2562
2563 if (ExportToDot)
2564 exportToDot("poststackupdate");
2565
2566 handleCallsitesWithMultipleTargets();
2567
2568 markBackedges();
2569}
2570
2571template <typename DerivedCCG, typename FuncTy, typename CallTy>
2572void CallsiteContextGraph<DerivedCCG, FuncTy,
2573 CallTy>::handleCallsitesWithMultipleTargets() {
2574 // Look for and work around callsites that call multiple functions.
2575 // This can happen for indirect calls, which need better handling, and in
2576 // more rare cases (e.g. macro expansion).
2577 // TODO: To fix this for indirect calls we will want to perform speculative
2578 // devirtualization using either the normal PGO info with ICP, or using the
2579 // information in the profiled MemProf contexts. We can do this prior to
2580 // this transformation for regular LTO, and for ThinLTO we can simulate that
2581 // effect in the summary and perform the actual speculative devirtualization
2582 // while cloning in the ThinLTO backend.
2583
2584 // Keep track of the new nodes synthesized for discovered tail calls missing
2585 // from the profiled contexts.
2586 MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;
2587
2588 std::vector<std::pair<CallInfo, ContextNode *>> NewCallToNode;
2589 for (auto &Entry : NonAllocationCallToContextNodeMap) {
2590 auto *Node = Entry.second;
2591 assert(Node->Clones.empty());
2592 // Check all node callees and see if in the same function.
2593 // We need to check all of the calls recorded in this Node, because in some
2594 // cases we may have had multiple calls with the same debug info calling
2595 // different callees. This can happen, for example, when an object is
2596 // constructed in the parameter list - the destructor call of the object has
2597 // the same debug info (line/col) as the call the object was passed to.
2598 // Here we will prune any that don't match all callee nodes.
2599 std::vector<CallInfo> AllCalls;
2600 AllCalls.reserve(Node->MatchingCalls.size() + 1);
2601 AllCalls.push_back(Node->Call);
2602 llvm::append_range(AllCalls, Node->MatchingCalls);
2603
2604 // First see if we can partition the calls by callee function, creating new
2605 // nodes to host each set of calls calling the same callees. This is
2606 // necessary to support indirect calls with ThinLTO, for which we
2607 // synthesized CallsiteInfo records for each target. They will all have the
2608 // same callsite stack ids and would be sharing a context node at this
2609 // point. We need to perform separate cloning for each, which will be
2610 // applied along with speculative devirtualization in the ThinLTO backends
2611 // as needed. Note this does not currently support looking through tail
2612 // calls, it is unclear if we need that for indirect call targets.
2613 // First partition calls by callee func. Map indexed by func, value is
2614 // struct with list of matching calls, assigned node.
2615 if (partitionCallsByCallee(Node, AllCalls, NewCallToNode))
2616 continue;
2617
2618 auto It = AllCalls.begin();
2619 // Iterate through the calls until we find the first that matches.
2620 for (; It != AllCalls.end(); ++It) {
2621 auto ThisCall = *It;
2622 bool Match = true;
2623 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();
2624 ++EI) {
2625 auto Edge = *EI;
2626 if (!Edge->Callee->hasCall())
2627 continue;
2628 assert(NodeToCallingFunc.count(Edge->Callee));
2629 // Check if the called function matches that of the callee node.
2630 if (!calleesMatch(ThisCall.call(), EI, TailCallToContextNodeMap)) {
2631 Match = false;
2632 break;
2633 }
2634 }
2635 // Found a call that matches the callee nodes, we can quit now.
2636 if (Match) {
2637 // If the first match is not the primary call on the Node, update it
2638 // now. We will update the list of matching calls further below.
2639 if (Node->Call != ThisCall) {
2640 Node->setCall(ThisCall);
2641 // We need to update the NonAllocationCallToContextNodeMap, but don't
2642 // want to do this during iteration over that map, so save the calls
2643 // that need updated entries.
2644 NewCallToNode.push_back({ThisCall, Node});
2645 }
2646 break;
2647 }
2648 }
2649 // We will update this list below (or leave it cleared if there was no
2650 // match found above).
2651 Node->MatchingCalls.clear();
2652 // If we hit the end of the AllCalls vector, no call matching the callee
2653 // nodes was found, clear the call information in the node.
2654 if (It == AllCalls.end()) {
2655 RemovedEdgesWithMismatchedCallees++;
2656 // Work around by setting Node to have a null call, so it gets
2657 // skipped during cloning. Otherwise assignFunctions will assert
2658 // because its data structures are not designed to handle this case.
2659 Node->setCall(CallInfo());
2660 continue;
2661 }
2662 // Now add back any matching calls that call the same function as the
2663 // matching primary call on Node.
2664 for (++It; It != AllCalls.end(); ++It) {
2665 auto ThisCall = *It;
2666 if (!sameCallee(Node->Call.call(), ThisCall.call()))
2667 continue;
2668 Node->MatchingCalls.push_back(ThisCall);
2669 }
2670 }
2671
2672 // Remove all mismatched nodes identified in the above loop from the node map
2673 // (checking whether they have a null call which is set above). For a
2674 // MapVector like NonAllocationCallToContextNodeMap it is much more efficient
2675 // to do the removal via remove_if than by individually erasing entries above.
2676 // Also remove any entries if we updated the node's primary call above.
2677 NonAllocationCallToContextNodeMap.remove_if([](const auto &it) {
2678 return !it.second->hasCall() || it.second->Call != it.first;
2679 });
2680
2681 // Add entries for any new primary calls recorded above.
2682 for (auto &[Call, Node] : NewCallToNode)
2683 NonAllocationCallToContextNodeMap[Call] = Node;
2684
2685 // Add the new nodes after the above loop so that the iteration is not
2686 // invalidated.
2687 for (auto &[Call, Node] : TailCallToContextNodeMap)
2688 NonAllocationCallToContextNodeMap[Call] = Node;
2689}
2690
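// Attempt to partition the calls recorded on Node by callee function, giving
// each group of calls targeting the same callee its own context node (the
// first group reuses Node) and moving the corresponding callee edges over.
// Callee edges matching none of the calls are moved to a node with a null
// call so they are ignored during cloning. Returns false if no call matched
// any callee edge, so the caller can instead fall back to the matching that
// looks through tail calls.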
2691template <typename DerivedCCG, typename FuncTy, typename CallTy>
2692bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::partitionCallsByCallee(
2693 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
2694 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode) {
2695 // Struct to keep track of all the calls having the same callee function.
2696 // We will eventually record here the context node assigned to this group
2697 // of calls.
2698 struct CallsWithSameCallee {
2699 std::vector<CallInfo> Calls;
2700 ContextNode *Node = nullptr;
2701 };
2702
2703 // First partition calls by callee function. Build map from each function
2704 // to the list of matching calls.
2706 for (auto ThisCall : AllCalls) {
2707 auto *F = getCalleeFunc(ThisCall.call());
2708 if (F)
2709 CalleeFuncToCallInfo[F].Calls.push_back(ThisCall);
2710 }
2711
2712 // Next, walk through all callee edges. For each callee node, get its
2713 // containing function and see if it was recorded in the above map (meaning we
2714 // have at least one matching call). Build another map from each callee node
2715 // with a matching call to the structure instance created above containing all
2716 // the calls.
2718 for (const auto &Edge : Node->CalleeEdges) {
2719 if (!Edge->Callee->hasCall())
2720 continue;
2721 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2722 if (CalleeFuncToCallInfo.contains(ProfiledCalleeFunc))
2723 CalleeNodeToCallInfo[Edge->Callee] =
2724 &CalleeFuncToCallInfo[ProfiledCalleeFunc];
2725 }
2726
2727 // If there are no entries in the second map, then there were no matching
2728 // calls/callees; nothing to do here. Return so we can go to the handling
2729 // that looks through tail calls.
2730 if (CalleeNodeToCallInfo.empty())
2731 return false;
2732
2733 // Walk through all callee edges again. Any and all callee edges that didn't
2734 // match any calls (callee not in the CalleeNodeToCallInfo map) are moved to a
2735 // new caller node (UnmatchedCalleesNode) which gets a null call so that it is
2736 // ignored during cloning. If it is in the map, then we use the node recorded
2737 // in that entry (creating it if needed), and move the callee edge to it.
2738 // The first callee will use the original node instead of creating a new one.
2739 // Note that any of the original calls on this node (in AllCalls) that didn't
2740 // have a callee function automatically get dropped from the node as part of
2741 // this process.
2742 ContextNode *UnmatchedCalleesNode = nullptr;
2743 // Track whether we already assigned original node to a callee.
2744 bool UsedOrigNode = false;
2745 assert(NodeToCallingFunc[Node]);
2746 // Iterate over a copy of Node's callee edges, since we may need to remove
2747 // edges in moveCalleeEdgeToNewCaller, and this simplifies the handling and
2748 // makes it less error-prone.
2749 auto CalleeEdges = Node->CalleeEdges;
2750 for (auto &Edge : CalleeEdges) {
2751 if (!Edge->Callee->hasCall())
2752 continue;
2753
2754 // Will be updated below to point to whatever (caller) node this callee edge
2755 // should be moved to.
2756 ContextNode *CallerNodeToUse = nullptr;
2757
2758 // Handle the case where there were no matching calls first. Move this
2759 // callee edge to the UnmatchedCalleesNode, creating it if needed.
2760 if (!CalleeNodeToCallInfo.contains(Edge->Callee)) {
2761 if (!UnmatchedCalleesNode)
2762 UnmatchedCalleesNode =
2763 createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2764 CallerNodeToUse = UnmatchedCalleesNode;
2765 } else {
2766 // Look up the information recorded for this callee node, and use the
2767 // recorded caller node (creating it if needed).
2768 auto *Info = CalleeNodeToCallInfo[Edge->Callee];
2769 if (!Info->Node) {
2770 // If we haven't assigned any callees to the original node use it.
2771 if (!UsedOrigNode) {
2772 Info->Node = Node;
2773 // Clear the set of matching calls which will be updated below.
2774 Node->MatchingCalls.clear();
2775 UsedOrigNode = true;
2776 } else
2777 Info->Node =
2778 createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2779 assert(!Info->Calls.empty());
2780 // The first call becomes the primary call for this caller node, and the
2781 // rest go in the matching calls list.
2782 Info->Node->setCall(Info->Calls.front());
2783 llvm::append_range(Info->Node->MatchingCalls,
2784 llvm::drop_begin(Info->Calls));
2785 // Save the primary call to node correspondence so that we can update
2786 // the NonAllocationCallToContextNodeMap, which is being iterated in the
2787 // caller of this function.
2788 NewCallToNode.push_back({Info->Node->Call, Info->Node});
2789 }
2790 CallerNodeToUse = Info->Node;
2791 }
2792
2793 // Don't need to move the edge if we are using the original node.
2794 if (CallerNodeToUse == Node)
2795 continue;
2796
2797 moveCalleeEdgeToNewCaller(Edge, CallerNodeToUse);
2798 }
2799 // Now that we are done moving edges, clean up any caller edges that ended
2800 // up with no type or context ids. During moveCalleeEdgeToNewCaller all
2801 // caller edges from Node are replicated onto the new callers, and it
2802 // simplifies the handling to leave them until we have moved all
2803 // edges/context ids.
2804 for (auto &I : CalleeNodeToCallInfo)
2805 removeNoneTypeCallerEdges(I.second->Node);
2806 if (UnmatchedCalleesNode)
2807 removeNoneTypeCallerEdges(UnmatchedCalleesNode);
2808 removeNoneTypeCallerEdges(Node);
2809
2810 return true;
2811}
2812
2813uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2814 // In the Module (IR) case this is already the Id.
2815 return IdOrIndex;
2816}
2817
2818uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2819 // In the Index case this is an index into the stack id list in the summary
2820 // index, convert it to an Id.
2821 return Index.getStackIdAtIndex(IdOrIndex);
2822}
2823
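// Example (hypothetical functions, for illustration): suppose the profile
// records an edge from caller C to callee D, but in the IR (or summary) C
// actually calls T1, which tail-calls T2, which tail-calls D. calleesMatch
// below finds the unique chain C -> T1 -> T2 -> D via
// findProfiledCalleeThroughTailCalls, synthesizes context nodes for the T1
// and T2 callsites, splices them in between C's node and D's node, and then
// removes the original direct edge from C's node to D's node.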
2824template <typename DerivedCCG, typename FuncTy, typename CallTy>
2825bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
2826 CallTy Call, EdgeIter &EI,
2827 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {
2828 auto Edge = *EI;
2829 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2830 const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];
2831 // Will be populated in order of callee to caller if we find a chain of tail
2832 // calls between the profiled caller and callee.
2833 std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;
2834 if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc,
2835 FoundCalleeChain))
2836 return false;
2837
2838 // The usual case where the profiled callee matches that of the IR/summary.
2839 if (FoundCalleeChain.empty())
2840 return true;
2841
2842 auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {
2843 auto *CurEdge = Callee->findEdgeFromCaller(Caller);
2844 // If there is already an edge between these nodes, simply update it and
2845 // return.
2846 if (CurEdge) {
2847 CurEdge->ContextIds.insert_range(Edge->ContextIds);
2848 CurEdge->AllocTypes |= Edge->AllocTypes;
2849 return;
2850 }
2851 // Otherwise, create a new edge and insert it into the caller and callee
2852 // lists.
2853 auto NewEdge = std::make_shared<ContextEdge>(
2854 Callee, Caller, Edge->AllocTypes, Edge->ContextIds);
2855 Callee->CallerEdges.push_back(NewEdge);
2856 if (Caller == Edge->Caller) {
2857 // If we are inserting the new edge into the current edge's caller, insert
2858 // the new edge before the current iterator position, and then increment
2859 // back to the current edge.
2860 EI = Caller->CalleeEdges.insert(EI, NewEdge);
2861 ++EI;
2862 assert(*EI == Edge &&
2863 "Iterator position not restored after insert and increment");
2864 } else
2865 Caller->CalleeEdges.push_back(NewEdge);
2866 };
2867
2868 // Create new nodes for each found callee and connect in between the profiled
2869 // caller and callee.
2870 auto *CurCalleeNode = Edge->Callee;
2871 for (auto &[NewCall, Func] : FoundCalleeChain) {
2872 ContextNode *NewNode = nullptr;
2873 // First check if we have already synthesized a node for this tail call.
2874 if (TailCallToContextNodeMap.count(NewCall)) {
2875 NewNode = TailCallToContextNodeMap[NewCall];
2876 NewNode->AllocTypes |= Edge->AllocTypes;
2877 } else {
2878 FuncToCallsWithMetadata[Func].push_back({NewCall});
2879 // Create Node and record node info.
2880 NewNode = createNewNode(/*IsAllocation=*/false, Func, NewCall);
2881 TailCallToContextNodeMap[NewCall] = NewNode;
2882 NewNode->AllocTypes = Edge->AllocTypes;
2883 }
2884
2885 // Hook up node to its callee node
2886 AddEdge(NewNode, CurCalleeNode);
2887
2888 CurCalleeNode = NewNode;
2889 }
2890
2891 // Hook up edge's original caller to new callee node.
2892 AddEdge(Edge->Caller, CurCalleeNode);
2893
2894#ifndef NDEBUG
2895 // Save this because Edge's fields get cleared below when removed.
2896 auto *Caller = Edge->Caller;
2897#endif
2898
2899 // Remove old edge
2900 removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
2901
2902 // To simplify the increment of EI in the caller, subtract one from EI.
2903 // In the final AddEdge call we would have either added a new callee edge
2904 // to Edge->Caller, or found an existing one. Either way we are guaranteed
2905 // that there is at least one callee edge.
2906 assert(!Caller->CalleeEdges.empty());
2907 --EI;
2908
2909 return true;
2910}
2911
2912bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2913 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
2914 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
2915 bool &FoundMultipleCalleeChains) {
2916 // Stop recursive search if we have already explored the maximum specified
2917 // depth.
2918 if (Depth > TailCallSearchDepth)
2919 return false;
2920
2921 auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {
2922 FoundCalleeChain.push_back({Callsite, F});
2923 };
2924
2925 auto *CalleeFunc = dyn_cast<Function>(CurCallee);
2926 if (!CalleeFunc) {
2927 auto *Alias = dyn_cast<GlobalAlias>(CurCallee);
2928 assert(Alias);
2929 CalleeFunc = dyn_cast<Function>(Alias->getAliasee());
2930 assert(CalleeFunc);
2931 }
2932
2933 // Look for tail calls in this function, and check if they either call the
2934 // profiled callee directly, or indirectly (via a recursive search).
2935 // Only succeed if there is a single unique tail call chain found between the
2936 // profiled caller and callee, otherwise we could perform incorrect cloning.
2937 bool FoundSingleCalleeChain = false;
2938 for (auto &BB : *CalleeFunc) {
2939 for (auto &I : BB) {
2940 auto *CB = dyn_cast<CallBase>(&I);
2941 if (!CB || !CB->isTailCall())
2942 continue;
2943 auto *CalledValue = CB->getCalledOperand();
2944 auto *CalledFunction = CB->getCalledFunction();
2945 if (CalledValue && !CalledFunction) {
2946 CalledValue = CalledValue->stripPointerCasts();
2947 // Stripping pointer casts can reveal a called function.
2948 CalledFunction = dyn_cast<Function>(CalledValue);
2949 }
2950 // Check if this is an alias to a function. If so, get the
2951 // called aliasee for the checks below.
2952 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
2953 assert(!CalledFunction &&
2954 "Expected null called function in callsite for alias");
2955 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
2956 }
2957 if (!CalledFunction)
2958 continue;
2959 if (CalledFunction == ProfiledCallee) {
2960 if (FoundSingleCalleeChain) {
2961 FoundMultipleCalleeChains = true;
2962 return false;
2963 }
2964 FoundSingleCalleeChain = true;
2965 FoundProfiledCalleeCount++;
2966 FoundProfiledCalleeDepth += Depth;
2967 if (Depth > FoundProfiledCalleeMaxDepth)
2968 FoundProfiledCalleeMaxDepth = Depth;
2969 SaveCallsiteInfo(&I, CalleeFunc);
2970 } else if (findProfiledCalleeThroughTailCalls(
2971 ProfiledCallee, CalledFunction, Depth + 1,
2972 FoundCalleeChain, FoundMultipleCalleeChains)) {
2973 // findProfiledCalleeThroughTailCalls should not have returned
2974 // true if FoundMultipleCalleeChains.
2975 assert(!FoundMultipleCalleeChains);
2976 if (FoundSingleCalleeChain) {
2977 FoundMultipleCalleeChains = true;
2978 return false;
2979 }
2980 FoundSingleCalleeChain = true;
2981 SaveCallsiteInfo(&I, CalleeFunc);
2982 } else if (FoundMultipleCalleeChains)
2983 return false;
2984 }
2985 }
2986
2987 return FoundSingleCalleeChain;
2988}
2989
2990const Function *ModuleCallsiteContextGraph::getCalleeFunc(Instruction *Call) {
2991 auto *CB = dyn_cast<CallBase>(Call);
2992 if (!CB->getCalledOperand() || CB->isIndirectCall())
2993 return nullptr;
2994 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2995 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2996 if (Alias)
2997 return dyn_cast<Function>(Alias->getAliasee());
2998 return dyn_cast<Function>(CalleeVal);
2999}
3000
3001bool ModuleCallsiteContextGraph::calleeMatchesFunc(
3002 Instruction *Call, const Function *Func, const Function *CallerFunc,
3003 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
3004 auto *CB = dyn_cast<CallBase>(Call);
3005 if (!CB->getCalledOperand() || CB->isIndirectCall())
3006 return false;
3007 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
3008 auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
3009 if (CalleeFunc == Func)
3010 return true;
3011 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
3012 if (Alias && Alias->getAliasee() == Func)
3013 return true;
3014
3015 // Recursively search for the profiled callee through tail calls starting with
3016 // the actual Callee. The discovered tail call chain is saved in
3017 // FoundCalleeChain, and we will fix up the graph to include these callsites
3018 // after returning.
3019 // FIXME: We will currently redo the same recursive walk if we find the same
3020 // mismatched callee from another callsite. We can improve this with more
3021 // bookkeeping of the created chain of new nodes for each mismatch.
3022 unsigned Depth = 1;
3023 bool FoundMultipleCalleeChains = false;
3024 if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,
3025 FoundCalleeChain,
3026 FoundMultipleCalleeChains)) {
3027 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "
3028 << Func->getName() << " from " << CallerFunc->getName()
3029 << " that actually called " << CalleeVal->getName()
3030 << (FoundMultipleCalleeChains
3031 ? " (found multiple possible chains)"
3032 : "")
3033 << "\n");
3034 if (FoundMultipleCalleeChains)
3035 FoundProfiledCalleeNonUniquelyCount++;
3036 return false;
3037 }
3038
3039 return true;
3040}
3041
3042bool ModuleCallsiteContextGraph::sameCallee(Instruction *Call1,
3043 Instruction *Call2) {
3044 auto *CB1 = cast<CallBase>(Call1);
3045 if (!CB1->getCalledOperand() || CB1->isIndirectCall())
3046 return false;
3047 auto *CalleeVal1 = CB1->getCalledOperand()->stripPointerCasts();
3048 auto *CalleeFunc1 = dyn_cast<Function>(CalleeVal1);
3049 auto *CB2 = cast<CallBase>(Call2);
3050 if (!CB2->getCalledOperand() || CB2->isIndirectCall())
3051 return false;
3052 auto *CalleeVal2 = CB2->getCalledOperand()->stripPointerCasts();
3053 auto *CalleeFunc2 = dyn_cast<Function>(CalleeVal2);
3054 return CalleeFunc1 == CalleeFunc2;
3055}
3056
3057bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
3058 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
3059 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
3060 bool &FoundMultipleCalleeChains) {
3061 // Stop recursive search if we have already explored the maximum specified
3062 // depth.
3063 if (Depth > TailCallSearchDepth)
3064 return false;
3065
3066 auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {
3067 // Make a CallsiteInfo for each discovered callee, if one hasn't already
3068 // been synthesized.
3069 if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||
3070 !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))
3071 // StackIds is empty (we don't have debug info available in the index for
3072 // these callsites)
3073 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
3074 std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());
3075 CallsiteInfo *NewCallsiteInfo =
3076 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
3077 FoundCalleeChain.push_back({NewCallsiteInfo, FS});
3078 };
3079
3080 // Look for tail calls in this function, and check if they either call the
3081 // profiled callee directly, or indirectly (via a recursive search).
3082 // Only succeed if there is a single unique tail call chain found between the
3083 // profiled caller and callee, otherwise we could perform incorrect cloning.
3084 bool FoundSingleCalleeChain = false;
3085 for (auto &S : CurCallee.getSummaryList()) {
3086 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
3087 !isPrevailing(CurCallee.getGUID(), S.get()))
3088 continue;
3089 auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
3090 if (!FS)
3091 continue;
3092 auto FSVI = CurCallee;
3093 auto *AS = dyn_cast<AliasSummary>(S.get());
3094 if (AS)
3095 FSVI = AS->getAliaseeVI();
3096 for (auto &CallEdge : FS->calls()) {
3097 if (!CallEdge.second.hasTailCall())
3098 continue;
3099 if (CallEdge.first == ProfiledCallee) {
3100 if (FoundSingleCalleeChain) {
3101 FoundMultipleCalleeChains = true;
3102 return false;
3103 }
3104 FoundSingleCalleeChain = true;
3105 FoundProfiledCalleeCount++;
3106 FoundProfiledCalleeDepth += Depth;
3107 if (Depth > FoundProfiledCalleeMaxDepth)
3108 FoundProfiledCalleeMaxDepth = Depth;
3109 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
3110 // Add FS to FSToVIMap in case it isn't already there.
3111 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
3112 FSToVIMap[FS] = FSVI;
3113 } else if (findProfiledCalleeThroughTailCalls(
3114 ProfiledCallee, CallEdge.first, Depth + 1,
3115 FoundCalleeChain, FoundMultipleCalleeChains)) {
3116 // findProfiledCalleeThroughTailCalls should not have returned
3117 // true if FoundMultipleCalleeChains.
3118 assert(!FoundMultipleCalleeChains);
3119 if (FoundSingleCalleeChain) {
3120 FoundMultipleCalleeChains = true;
3121 return false;
3122 }
3123 FoundSingleCalleeChain = true;
3124 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
3125 // Add FS to FSToVIMap in case it isn't already there.
3126 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
3127 FSToVIMap[FS] = FSVI;
3128 } else if (FoundMultipleCalleeChains)
3129 return false;
3130 }
3131 }
3132
3133 return FoundSingleCalleeChain;
3134}
3135
3136const FunctionSummary *
3137IndexCallsiteContextGraph::getCalleeFunc(IndexCall &Call) {
3138 ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
3139 if (Callee.getSummaryList().empty())
3140 return nullptr;
3141 return dyn_cast<FunctionSummary>(Callee.getSummaryList()[0]->getBaseObject());
3142}
3143
3144bool IndexCallsiteContextGraph::calleeMatchesFunc(
3145 IndexCall &Call, const FunctionSummary *Func,
3146 const FunctionSummary *CallerFunc,
3147 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {
3148 ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
3149 // If there is no summary list then this is a call to an externally defined
3150 // symbol.
3151 AliasSummary *Alias =
3152 Callee.getSummaryList().empty()
3153 ? nullptr
3154 : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
3155 assert(FSToVIMap.count(Func));
3156 auto FuncVI = FSToVIMap[Func];
3157 if (Callee == FuncVI ||
3158 // If callee is an alias, check the aliasee, since only function
3159 // summary base objects will contain the stack node summaries and thus
3160 // get a context node.
3161 (Alias && Alias->getAliaseeVI() == FuncVI))
3162 return true;
3163
3164 // Recursively search for the profiled callee through tail calls starting with
3165 // the actual Callee. The discovered tail call chain is saved in
3166 // FoundCalleeChain, and we will fix up the graph to include these callsites
3167 // after returning.
3168 // FIXME: We will currently redo the same recursive walk if we find the same
3169 // mismatched callee from another callsite. We can improve this with more
3170 // bookkeeping of the created chain of new nodes for each mismatch.
3171 unsigned Depth = 1;
3172 bool FoundMultipleCalleeChains = false;
3173 if (!findProfiledCalleeThroughTailCalls(
3174 FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
3175 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI
3176 << " from " << FSToVIMap[CallerFunc]
3177 << " that actually called " << Callee
3178 << (FoundMultipleCalleeChains
3179 ? " (found multiple possible chains)"
3180 : "")
3181 << "\n");
3182 if (FoundMultipleCalleeChains)
3183 FoundProfiledCalleeNonUniquelyCount++;
3184 return false;
3185 }
3186
3187 return true;
3188}
3189
3190bool IndexCallsiteContextGraph::sameCallee(IndexCall &Call1, IndexCall &Call2) {
3191 ValueInfo Callee1 = dyn_cast_if_present<CallsiteInfo *>(Call1)->Callee;
3192 ValueInfo Callee2 = dyn_cast_if_present<CallsiteInfo *>(Call2)->Callee;
3193 return Callee1 == Callee2;
3194}
3195
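// The dump/print helpers below emit a textual form of the graph; a single
// node prints roughly as follows (pointers and ids here are illustrative):
//   Node 0x55aa...
//           <primary call, or "null call">
//           NodeId: 4
//           AllocTypes: Cold
//           ContextIds: 1 3 7
//           CalleeEdges:
//                   Edge from Callee 0x55bb... to Caller: 0x55aa... AllocTypes: Cold ContextIds: 1 3 (Callee NodeId: 2)
//           CallerEdges:
//                   ...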
3196template <typename DerivedCCG, typename FuncTy, typename CallTy>
3197void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
3198 const {
3199 print(dbgs());
3200 dbgs() << "\n";
3201}
3202
3203template <typename DerivedCCG, typename FuncTy, typename CallTy>
3204void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
3205 raw_ostream &OS) const {
3206 OS << "Node " << this << "\n";
3207 OS << "\t";
3208 printCall(OS);
3209 if (Recursive)
3210 OS << " (recursive)";
3211 OS << "\n";
3212 if (!MatchingCalls.empty()) {
3213 OS << "\tMatchingCalls:\n";
3214 for (auto &MatchingCall : MatchingCalls) {
3215 OS << "\t";
3216 MatchingCall.print(OS);
3217 OS << "\n";
3218 }
3219 }
3220 OS << "\tNodeId: " << NodeId << "\n";
3221 OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
3222 OS << "\tContextIds:";
3223 // Make a copy of the computed context ids that we can sort for stability.
3224 auto ContextIds = getContextIds();
3225 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3226 std::sort(SortedIds.begin(), SortedIds.end());
3227 for (auto Id : SortedIds)
3228 OS << " " << Id;
3229 OS << "\n";
3230 OS << "\tCalleeEdges:\n";
3231 for (auto &Edge : CalleeEdges)
3232 OS << "\t\t" << *Edge << " (Callee NodeId: " << Edge->Callee->NodeId
3233 << ")\n";
3234 OS << "\tCallerEdges:\n";
3235 for (auto &Edge : CallerEdges)
3236 OS << "\t\t" << *Edge << " (Caller NodeId: " << Edge->Caller->NodeId
3237 << ")\n";
3238 if (!Clones.empty()) {
3239 OS << "\tClones: ";
3240 bool First = true;
3241 for (auto *C : Clones) {
3242 if (!First)
3243 OS << ", ";
3244 First = false;
3245 OS << C << " NodeId: " << C->NodeId;
3246 }
3247 OS << "\n";
3248 } else if (CloneOf) {
3249 OS << "\tClone of " << CloneOf << " NodeId: " << CloneOf->NodeId << "\n";
3250 }
3251}
3252
3253template <typename DerivedCCG, typename FuncTy, typename CallTy>
3254void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
3255 const {
3256 print(dbgs());
3257 dbgs() << "\n";
3258}
3259
3260template <typename DerivedCCG, typename FuncTy, typename CallTy>
3261void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
3262 raw_ostream &OS) const {
3263 OS << "Edge from Callee " << Callee << " to Caller: " << Caller
3264 << (IsBackedge ? " (BE)" : "")
3265 << " AllocTypes: " << getAllocTypeString(AllocTypes);
3266 OS << " ContextIds:";
3267 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3268 std::sort(SortedIds.begin(), SortedIds.end());
3269 for (auto Id : SortedIds)
3270 OS << " " << Id;
3271}
3272
3273template <typename DerivedCCG, typename FuncTy, typename CallTy>
3274void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
3275 print(dbgs());
3276}
3277
3278template <typename DerivedCCG, typename FuncTy, typename CallTy>
3279void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
3280 raw_ostream &OS) const {
3281 OS << "Callsite Context Graph:\n";
3282 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3283 for (const auto Node : nodes<GraphType>(this)) {
3284 if (Node->isRemoved())
3285 continue;
3286 Node->print(OS);
3287 OS << "\n";
3288 }
3289}
3290
3291template <typename DerivedCCG, typename FuncTy, typename CallTy>
3292void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
3293 raw_ostream &OS) const {
3294 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3295 for (const auto Node : nodes<GraphType>(this)) {
3296 if (Node->isRemoved())
3297 continue;
3298 if (!Node->IsAllocation)
3299 continue;
3300 DenseSet<uint32_t> ContextIds = Node->getContextIds();
3301 auto AllocTypeFromCall = getAllocationCallType(Node->Call);
3302 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3303 std::sort(SortedIds.begin(), SortedIds.end());
3304 for (auto Id : SortedIds) {
3305 auto TypeI = ContextIdToAllocationType.find(Id);
3306 assert(TypeI != ContextIdToAllocationType.end());
3307 auto CSI = ContextIdToContextSizeInfos.find(Id);
3308 if (CSI != ContextIdToContextSizeInfos.end()) {
3309 for (auto &Info : CSI->second) {
3310 OS << "MemProf hinting: "
3311 << getAllocTypeString((uint8_t)TypeI->second)
3312 << " full allocation context " << Info.FullStackId
3313 << " with total size " << Info.TotalSize << " is "
3314 << getAllocTypeString(Node->AllocTypes) << " after cloning";
3315 if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall)
3316 OS << " marked " << getAllocTypeString((uint8_t)AllocTypeFromCall)
3317 << " due to cold byte percent";
3318 // Print the internal context id to aid debugging and visualization.
3319 OS << " (context id " << Id << ")";
3320 OS << "\n";
3321 }
3322 }
3323 }
3324 }
3325}
3326
3327template <typename DerivedCCG, typename FuncTy, typename CallTy>
3328void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
3329 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3330 for (const auto Node : nodes<GraphType>(this)) {
3331 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3332 for (auto &Edge : Node->CallerEdges)
3333 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
3334 }
3335}
3336
3337template <typename DerivedCCG, typename FuncTy, typename CallTy>
3338struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
3339 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3340 using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;
3341
3342 using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
3343 static NodeRef getNode(const NodePtrTy &P) { return P.get(); }
3344
3345 using nodes_iterator =
3346 mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,
3347 decltype(&getNode)>;
3348
3349 static nodes_iterator nodes_begin(GraphType G) {
3350 return nodes_iterator(G->NodeOwner.begin(), &getNode);
3351 }
3352
3353 static nodes_iterator nodes_end(GraphType G) {
3354 return nodes_iterator(G->NodeOwner.end(), &getNode);
3355 }
3356
3357 static NodeRef getEntryNode(GraphType G) {
3358 return G->NodeOwner.begin()->get();
3359 }
3360
3361 using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
3362 static const ContextNode<DerivedCCG, FuncTy, CallTy> *
3363 GetCallee(const EdgePtrTy &P) {
3364 return P->Callee;
3365 }
3366
3367 using ChildIteratorType =
3368 mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
3369 DerivedCCG, FuncTy, CallTy>>>::const_iterator,
3370 decltype(&GetCallee)>;
3371
3372 static ChildIteratorType child_begin(NodeRef N) {
3373 return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
3374 }
3375
3376 static ChildIteratorType child_end(NodeRef N) {
3377 return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
3378 }
3379};
3380
3381template <typename DerivedCCG, typename FuncTy, typename CallTy>
3382struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
3383 : public DefaultDOTGraphTraits {
3384 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {
3385 // If the user requested the full graph to be exported, but provided an
3386 // allocation id, or if the user gave a context id and requested more than
3387 // just a specific context to be exported, note that highlighting is
3388 // enabled.
3389 DoHighlight =
3390 (AllocIdForDot.getNumOccurrences() && DotGraphScope == DotScope::All) ||
3391 (ContextIdForDot.getNumOccurrences() &&
3392 DotGraphScope != DotScope::Context);
3393 }
3394
3395 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3396 using GTraits = GraphTraits<GraphType>;
3397 using NodeRef = typename GTraits::NodeRef;
3398 using ChildIteratorType = typename GTraits::ChildIteratorType;
3399
3400 static std::string getNodeLabel(NodeRef Node, GraphType G) {
3401 std::string LabelString =
3402 (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
3403 Twine(Node->OrigStackOrAllocId) + " NodeId: " + Twine(Node->NodeId))
3404 .str();
3405 LabelString += "\n";
3406 if (Node->hasCall()) {
3407 auto Func = G->NodeToCallingFunc.find(Node);
3408 assert(Func != G->NodeToCallingFunc.end());
3409 LabelString +=
3410 G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
3411 } else {
3412 LabelString += "null call";
3413 if (Node->Recursive)
3414 LabelString += " (recursive)";
3415 else
3416 LabelString += " (external)";
3417 }
3418 return LabelString;
3419 }
3420
3421 static std::string getNodeAttributes(NodeRef Node, GraphType G) {
3422 auto ContextIds = Node->getContextIds();
3423 // If highlighting is enabled, see if this node contains any of the context ids
3424 // of interest. If so, it will use a different color and a larger fontsize
3425 // (which makes the node larger as well).
3426 bool Highlight = false;
3427 if (DoHighlight) {
3428 assert(ContextIdForDot.getNumOccurrences() ||
3429 AllocIdForDot.getNumOccurrences());
3430 if (ContextIdForDot.getNumOccurrences())
3431 Highlight = ContextIds.contains(ContextIdForDot);
3432 else
3433 Highlight = set_intersects(ContextIds, G->DotAllocContextIds);
3434 }
3435 std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
3436 getContextIds(ContextIds) + "\"")
3437 .str();
3438 // Default fontsize is 14
3439 if (Highlight)
3440 AttributeString += ",fontsize=\"30\"";
3441 AttributeString +=
3442 (Twine(",fillcolor=\"") + getColor(Node->AllocTypes, Highlight) + "\"")
3443 .str();
3444 if (Node->CloneOf) {
3445 AttributeString += ",color=\"blue\"";
3446 AttributeString += ",style=\"filled,bold,dashed\"";
3447 } else
3448 AttributeString += ",style=\"filled\"";
3449 return AttributeString;
3450 }
3451
3452 static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
3453 GraphType G) {
3454 auto &Edge = *(ChildIter.getCurrent());
3455 // If highlighting is enabled, see if this edge contains any of the context ids
3456 // of interest. If so, it will use a different color and a heavier arrow
3457 // size and weight (the larger weight makes the highlighted path
3458 // straighter).
3459 bool Highlight = false;
3460 if (DoHighlight) {
3461 assert(ContextIdForDot.getNumOccurrences() ||
3462 AllocIdForDot.getNumOccurrences());
3463 if (ContextIdForDot.getNumOccurrences())
3464 Highlight = Edge->ContextIds.contains(ContextIdForDot);
3465 else
3466 Highlight = set_intersects(Edge->ContextIds, G->DotAllocContextIds);
3467 }
3468 auto Color = getColor(Edge->AllocTypes, Highlight);
3469 std::string AttributeString =
3470 (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" +
3471 // fillcolor is the arrow head and color is the line
3472 Twine(",fillcolor=\"") + Color + "\"" + Twine(",color=\"") + Color +
3473 "\"")
3474 .str();
3475 if (Edge->IsBackedge)
3476 AttributeString += ",style=\"dotted\"";
3477 // Default penwidth and weight are both 1.
3478 if (Highlight)
3479 AttributeString += ",penwidth=\"2.0\",weight=\"2\"";
3480 return AttributeString;
3481 }
3482
3483 // Since the NodeOwners list includes nodes that are no longer connected to
3484 // the graph, skip them here.
3485 static bool isNodeHidden(NodeRef Node, GraphType G) {
3486 if (Node->isRemoved())
3487 return true;
3488 // If a scope smaller than the full graph was requested, see if this node
3489 // contains any of the context ids of interest.
3490 if (DotGraphScope == DotScope::Alloc)
3491 return !set_intersects(Node->getContextIds(), G->DotAllocContextIds);
3492 if (DotGraphScope == DotScope::Context)
3493 return !Node->getContextIds().contains(ContextIdForDot);
3494 return false;
3495 }
3496
3497private:
3498 static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
3499 std::string IdString = "ContextIds:";
3500 if (ContextIds.size() < 100) {
3501 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3502 std::sort(SortedIds.begin(), SortedIds.end());
3503 for (auto Id : SortedIds)
3504 IdString += (" " + Twine(Id)).str();
3505 } else {
3506 IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
3507 }
3508 return IdString;
3509 }
3510
3511 static std::string getColor(uint8_t AllocTypes, bool Highlight) {
3512 // If DoHighlight is not enabled, we want to use the highlight colors for
3513 // NotCold and Cold, and the non-highlight color for NotCold+Cold. This is
3514 // both compatible with the color scheme before highlighting was supported,
3515 // and for the NotCold+Cold color the non-highlight color is a bit more
3516 // readable.
3517 if (AllocTypes == (uint8_t)AllocationType::NotCold)
3518 // Color "brown1" actually looks like a lighter red.
3519 return !DoHighlight || Highlight ? "brown1" : "lightpink";
3520 if (AllocTypes == (uint8_t)AllocationType::Cold)
3521 return !DoHighlight || Highlight ? "cyan" : "lightskyblue";
3522 if (AllocTypes ==
3523 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
3524 return Highlight ? "magenta" : "mediumorchid1";
3525 return "gray";
3526 }
3527
3528 static std::string getNodeId(NodeRef Node) {
3529 std::stringstream SStream;
3530 SStream << std::hex << "N0x" << (unsigned long long)Node;
3531 std::string Result = SStream.str();
3532 return Result;
3533 }
3534
3535 // True if we should highlight a specific context or allocation's contexts in
3536 // the emitted graph.
3537 static bool DoHighlight;
3538};
3539
3540template <typename DerivedCCG, typename FuncTy, typename CallTy>
3541bool DOTGraphTraits<
3542 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>::DoHighlight =
3543 false;
3544
3545template <typename DerivedCCG, typename FuncTy, typename CallTy>
3546void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
3547 std::string Label) const {
3548 WriteGraph(this, "", false, Label,
3549 DotFilePathPrefix + "ccg." + Label + ".dot");
3550}
3551
3552template <typename DerivedCCG, typename FuncTy, typename CallTy>
3553typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
3554CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
3555 const std::shared_ptr<ContextEdge> &Edge,
3556 DenseSet<uint32_t> ContextIdsToMove) {
3557 ContextNode *Node = Edge->Callee;
3558 assert(NodeToCallingFunc.count(Node));
3559 ContextNode *Clone =
3560 createNewNode(Node->IsAllocation, NodeToCallingFunc[Node], Node->Call);
3561 Node->addClone(Clone);
3562 Clone->MatchingCalls = Node->MatchingCalls;
3563 moveEdgeToExistingCalleeClone(Edge, Clone, /*NewClone=*/true,
3564 ContextIdsToMove);
3565 return Clone;
3566}
3567
3568template <typename DerivedCCG, typename FuncTy, typename CallTy>
3569void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3570 moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
3571 ContextNode *NewCallee, bool NewClone,
3572 DenseSet<uint32_t> ContextIdsToMove) {
3573 // NewCallee and Edge's current callee must be clones of the same original
3574 // node (Edge's current callee may be the original node too).
3575 assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
3576
3577 bool EdgeIsRecursive = Edge->Callee == Edge->Caller;
3578
3579 ContextNode *OldCallee = Edge->Callee;
3580
3581 // We might already have an edge to the new callee from earlier cloning for a
3582 // different allocation. If one exists we will reuse it.
3583 auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller);
3584
3585 // Callers will pass an empty ContextIdsToMove set when they want to move the
3586 // edge. Copy in Edge's ids for simplicity.
3587 if (ContextIdsToMove.empty())
3588 ContextIdsToMove = Edge->getContextIds();
3589
3590 // If we are moving all of Edge's ids, then just move the whole Edge.
3591 // Otherwise only move the specified subset, to a new edge if needed.
3592 if (Edge->getContextIds().size() == ContextIdsToMove.size()) {
3593 // First, update the alloc types on NewCallee from Edge.
3594 // Do this before we potentially clear Edge's fields below!
3595 NewCallee->AllocTypes |= Edge->AllocTypes;
3596 // Moving the whole Edge.
3597 if (ExistingEdgeToNewCallee) {
3598 // Since we already have an edge to NewCallee, simply move the ids
3599 // onto it, and remove the existing Edge.
3600 ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
3601 ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes;
3602 assert(Edge->ContextIds == ContextIdsToMove);
3603 removeEdgeFromGraph(Edge.get());
3604 } else {
3605 // Otherwise just reconnect Edge to NewCallee.
3606 Edge->Callee = NewCallee;
3607 NewCallee->CallerEdges.push_back(Edge);
3608 // Remove it from callee where it was previously connected.
3609 OldCallee->eraseCallerEdge(Edge.get());
3610 // Don't need to update Edge's context ids since we are simply
3611 // reconnecting it.
3612 }
3613 } else {
3614 // Only moving a subset of Edge's ids.
3615 // Compute the alloc type of the subset of ids being moved.
3616 auto CallerEdgeAllocType = computeAllocType(ContextIdsToMove);
3617 if (ExistingEdgeToNewCallee) {
3618 // Since we already have an edge to NewCallee, simply move the ids
3619 // onto it.
3620 ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
3621 ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType;
3622 } else {
3623 // Otherwise, create a new edge to NewCallee for the ids being moved.
3624 auto NewEdge = std::make_shared<ContextEdge>(
3625 NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove);
3626 Edge->Caller->CalleeEdges.push_back(NewEdge);
3627 NewCallee->CallerEdges.push_back(NewEdge);
3628 }
3629 // In either case, need to update the alloc types on NewCallee, and remove
3630 // those ids and update the alloc type on the original Edge.
3631 NewCallee->AllocTypes |= CallerEdgeAllocType;
3632 set_subtract(Edge->ContextIds, ContextIdsToMove);
3633 Edge->AllocTypes = computeAllocType(Edge->ContextIds);
3634 }
3635 // Now walk the old callee node's callee edges and move Edge's context ids
3636 // over to the corresponding edge into the clone (which is created here if
3637 // this is a newly created clone).
3638 for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
3639 ContextNode *CalleeToUse = OldCalleeEdge->Callee;
3640 // If this is a direct recursion edge, use NewCallee (the clone) as the
3641 // callee as well, so that any edge updated/created here is also direct
3642 // recursive.
3643 if (CalleeToUse == OldCallee) {
3644 // If this is a recursive edge, see if we already moved a recursive edge
3645 // (which would have to have been this one) - if we were only moving a
3646 // subset of context ids it would still be on OldCallee.
3647 if (EdgeIsRecursive) {
3648 assert(OldCalleeEdge == Edge);
3649 continue;
3650 }
3651 CalleeToUse = NewCallee;
3652 }
3653 // The context ids moving to the new callee are the subset of this edge's
3654 // context ids and the context ids on the caller edge being moved.
3655 DenseSet<uint32_t> EdgeContextIdsToMove =
3656 set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove);
3657 set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
3658 OldCalleeEdge->AllocTypes =
3659 computeAllocType(OldCalleeEdge->getContextIds());
3660 if (!NewClone) {
3661 // Update context ids / alloc type on corresponding edge to NewCallee.
3662 // There is a chance this may not exist if we are reusing an existing
3663 // clone, specifically during function assignment, where we would have
3664 // removed none type edges after creating the clone. If we can't find
3665 // a corresponding edge there, fall through to the cloning below.
3666 if (auto *NewCalleeEdge = NewCallee->findEdgeFromCallee(CalleeToUse)) {
3667 NewCalleeEdge->getContextIds().insert_range(EdgeContextIdsToMove);
3668 NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
3669 continue;
3670 }
3671 }
3672 auto NewEdge = std::make_shared<ContextEdge>(
3673 CalleeToUse, NewCallee, computeAllocType(EdgeContextIdsToMove),
3674 EdgeContextIdsToMove);
3675 NewCallee->CalleeEdges.push_back(NewEdge);
3676 NewEdge->Callee->CallerEdges.push_back(NewEdge);
3677 }
3678 // Recompute the node alloc type now that its callee edges have been
3679 // updated (since we will compute from those edges).
3680 OldCallee->AllocTypes = OldCallee->computeAllocType();
3681 // OldCallee alloc type should be None iff its context id set is now empty.
3682 assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
3683 OldCallee->emptyContextIds());
3684 if (VerifyCCG) {
3685 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
3686 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
3687 for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
3688 checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
3689 /*CheckEdges=*/false);
3690 for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
3691 checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
3692 /*CheckEdges=*/false);
3693 }
3694}
3695
3696template <typename DerivedCCG, typename FuncTy, typename CallTy>
3697void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3698 moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
3699 ContextNode *NewCaller) {
3700 auto *OldCallee = Edge->Callee;
3701 auto *NewCallee = OldCallee;
3702 // If this edge was direct recursive, make any new/updated edge also direct
3703 // recursive to NewCaller.
3704 bool Recursive = Edge->Caller == Edge->Callee;
3705 if (Recursive)
3706 NewCallee = NewCaller;
3707
3708 ContextNode *OldCaller = Edge->Caller;
3709 OldCaller->eraseCalleeEdge(Edge.get());
3710
3711 // We might already have an edge to the new caller. If one exists we will
3712 // reuse it.
3713 auto ExistingEdgeToNewCaller = NewCaller->findEdgeFromCallee(NewCallee);
3714
3715 if (ExistingEdgeToNewCaller) {
3716 // Since we already have an edge to NewCaller, simply move the ids
3717 // onto it, and remove the existing Edge.
3718 ExistingEdgeToNewCaller->getContextIds().insert_range(
3719 Edge->getContextIds());
3720 ExistingEdgeToNewCaller->AllocTypes |= Edge->AllocTypes;
3721 Edge->ContextIds.clear();
3722 Edge->AllocTypes = (uint8_t)AllocationType::None;
3723 OldCallee->eraseCallerEdge(Edge.get());
3724 } else {
3725 // Otherwise just reconnect Edge to NewCaller.
3726 Edge->Caller = NewCaller;
3727 NewCaller->CalleeEdges.push_back(Edge);
3728 if (Recursive) {
3729 assert(NewCallee == NewCaller);
3730 // In the case of (direct) recursive edges, we update the callee as well
3731 // so that it becomes recursive on the new caller.
3732 Edge->Callee = NewCallee;
3733 NewCallee->CallerEdges.push_back(Edge);
3734 OldCallee->eraseCallerEdge(Edge.get());
3735 }
3736 // Don't need to update Edge's context ids since we are simply
3737 // reconnecting it.
3738 }
3739 // In either case, need to update the alloc types on NewCaller.
3740 NewCaller->AllocTypes |= Edge->AllocTypes;
3741
3742 // Now walk the old caller node's caller edges and move Edge's context ids
3743 // over to the corresponding edge into the node (which is created here if
3744 // this is a newly created node). We can tell whether this is a newly created
3745 // node by seeing if it has any caller edges yet.
3746#ifndef NDEBUG
3747 bool IsNewNode = NewCaller->CallerEdges.empty();
3748#endif
3749 // If we just moved a direct recursive edge, presumably its context ids should
3750 // also flow out of OldCaller via some other non-recursive callee edge. We
3751 // don't want to remove the recursive context ids from other caller edges yet,
3752 // otherwise the context ids get into an inconsistent state on OldCaller.
3753 // We will update these context ids on the non-recursive caller edge when and
3754 // if they are updated on the non-recursive callee.
3755 if (!Recursive) {
3756 for (auto &OldCallerEdge : OldCaller->CallerEdges) {
3757 auto OldCallerCaller = OldCallerEdge->Caller;
3758 // The context ids moving to the new caller are the subset of this edge's
3759 // context ids and the context ids on the callee edge being moved.
3760 DenseSet<uint32_t> EdgeContextIdsToMove = set_intersection(
3761 OldCallerEdge->getContextIds(), Edge->getContextIds());
3762 if (OldCaller == OldCallerCaller) {
3763 OldCallerCaller = NewCaller;
3764 // Don't actually move this one. The caller will move it directly via a
3765 // call to this function with this as the Edge if it is appropriate to
3766 // move to a different node that has a matching callee (itself).
3767 continue;
3768 }
3769 set_subtract(OldCallerEdge->getContextIds(), EdgeContextIdsToMove);
3770 OldCallerEdge->AllocTypes =
3771 computeAllocType(OldCallerEdge->getContextIds());
3772 // In this function we expect that any pre-existing node already has edges
3773 // from the same callers as the old node. That should be true in the
3774 // current use case, where we will remove None-type edges after copying
3775 // over all caller edges from the callee.
3776 auto *ExistingCallerEdge = NewCaller->findEdgeFromCaller(OldCallerCaller);
3777 // Since we would have skipped caller edges when moving a direct recursive
3778 // edge, this may not hold true when recursive handling is enabled.
3779 assert(IsNewNode || ExistingCallerEdge || AllowRecursiveCallsites);
3780 if (ExistingCallerEdge) {
3781 ExistingCallerEdge->getContextIds().insert_range(EdgeContextIdsToMove);
3782 ExistingCallerEdge->AllocTypes |=
3783 computeAllocType(EdgeContextIdsToMove);
3784 continue;
3785 }
3786 auto NewEdge = std::make_shared<ContextEdge>(
3787 NewCaller, OldCallerCaller, computeAllocType(EdgeContextIdsToMove),
3788 EdgeContextIdsToMove);
3789 NewCaller->CallerEdges.push_back(NewEdge);
3790 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
3791 }
3792 }
3793 // Recompute the node alloc type now that its caller edges have been
3794 // updated (since we will compute from those edges).
3795 OldCaller->AllocTypes = OldCaller->computeAllocType();
3796 // OldCaller alloc type should be None iff its context id set is now empty.
3797 assert((OldCaller->AllocTypes == (uint8_t)AllocationType::None) ==
3798 OldCaller->emptyContextIds());
3799 if (VerifyCCG) {
3800 checkNode<DerivedCCG, FuncTy, CallTy>(OldCaller, /*CheckEdges=*/false);
3801 checkNode<DerivedCCG, FuncTy, CallTy>(NewCaller, /*CheckEdges=*/false);
3802 for (const auto &OldCallerEdge : OldCaller->CallerEdges)
3803 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallerEdge->Caller,
3804 /*CheckEdges=*/false);
3805 for (const auto &NewCallerEdge : NewCaller->CallerEdges)
3806 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallerEdge->Caller,
3807 /*CheckEdges=*/false);
3808 }
3809}
3810
3811template <typename DerivedCCG, typename FuncTy, typename CallTy>
3812void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3813 recursivelyRemoveNoneTypeCalleeEdges(
3814 ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
3815 auto Inserted = Visited.insert(Node);
3816 if (!Inserted.second)
3817 return;
3818
3819 removeNoneTypeCalleeEdges(Node);
3820
3821 for (auto *Clone : Node->Clones)
3822 recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited);
3823
3824 // The recursive call may remove some of this Node's caller edges.
3825 // Iterate over a copy and skip any that were removed.
3826 auto CallerEdges = Node->CallerEdges;
3827 for (auto &Edge : CallerEdges) {
3828 // Skip any that have been removed by an earlier recursive call.
3829 if (Edge->isRemoved()) {
3830 assert(!is_contained(Node->CallerEdges, Edge));
3831 continue;
3832 }
3833 recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited);
3834 }
3835}
3836
3837// This is the standard DFS based backedge discovery algorithm.
3838template <typename DerivedCCG, typename FuncTy, typename CallTy>
3839void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges() {
3840 // If we are cloning recursive contexts, find and mark backedges from all root
3841 // callers, using the typical DFS based backedge analysis.
3842 if (!CloneRecursiveContexts)
3843 return;
3844 DenseSet<const ContextNode *> Visited;
3845 DenseSet<const ContextNode *> CurrentStack;
3846 for (auto &Entry : NonAllocationCallToContextNodeMap) {
3847 auto *Node = Entry.second;
3848 if (Node->isRemoved())
3849 continue;
3850 // It is a root if it doesn't have callers.
3851 if (!Node->CallerEdges.empty())
3852 continue;
3853 markBackedges(Node, Visited, CurrentStack);
3854 assert(CurrentStack.empty());
3855 }
3856}
3857
3858// Recursive helper for above markBackedges method.
3859template <typename DerivedCCG, typename FuncTy, typename CallTy>
3860void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges(
3861 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3862 DenseSet<const ContextNode *> &CurrentStack) {
3863 auto I = Visited.insert(Node);
3864 // We should only call this for unvisited nodes.
3865 assert(I.second);
3866 (void)I;
3867 for (auto &CalleeEdge : Node->CalleeEdges) {
3868 auto *Callee = CalleeEdge->Callee;
3869 if (Visited.count(Callee)) {
3870 // Since this was already visited we need to check if it is currently on
3871 // the recursive stack in which case it is a backedge.
3872 if (CurrentStack.count(Callee))
3873 CalleeEdge->IsBackedge = true;
3874 continue;
3875 }
3876 CurrentStack.insert(Callee);
3877 markBackedges(Callee, Visited, CurrentStack);
3878 CurrentStack.erase(Callee);
3879 }
3880}
3881
3882template <typename DerivedCCG, typename FuncTy, typename CallTy>
3883void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
3884 DenseSet<const ContextNode *> Visited;
3885 for (auto &Entry : AllocationCallToContextNodeMap) {
3886 Visited.clear();
3887 identifyClones(Entry.second, Visited, Entry.second->getContextIds());
3888 }
3889 Visited.clear();
3890 for (auto &Entry : AllocationCallToContextNodeMap)
3891 recursivelyRemoveNoneTypeCalleeEdges(Entry.second, Visited);
3892 if (VerifyCCG)
3893 check();
3894}
3895
3896 // Helper function to check whether an AllocType is cold or notcold or both.
3897 bool checkColdOrNotCold(uint8_t AllocType) {
3898 return (AllocType == (uint8_t)AllocationType::Cold) ||
3899 (AllocType == (uint8_t)AllocationType::NotCold) ||
3900 (AllocType ==
3901 ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
3902 }
3903
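// Example (hypothetical context ids, for illustration): suppose an allocation
// is reached via context 1 (cold) through caller X and via context 2
// (notcold) through caller Y, so its node has the ambiguous type
// NotCold|Cold. The recursive walk below clones the node for the cold caller
// edge, moving context 1 (and the corresponding callee-edge ids) onto the
// clone, while the original node keeps context 2 and stays notcold for
// caller Y and for any unknown or indirect callers.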
3904template <typename DerivedCCG, typename FuncTy, typename CallTy>
3905void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
3906 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3907 const DenseSet<uint32_t> &AllocContextIds) {
3908 if (VerifyNodes)
3909 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3910 assert(!Node->CloneOf);
3911
3912 // If Node has a null call, then either it wasn't found in the module (regular
3913 // LTO) or summary index (ThinLTO), or there were other conditions blocking
3914 // cloning (e.g. recursion, calls multiple targets, etc).
3915 // Do this here so that we don't try to recursively clone callers below, which
3916 // isn't useful at least for this node.
3917 if (!Node->hasCall())
3918 return;
3919
3920 // No need to look at any callers if allocation type already unambiguous.
3921 if (hasSingleAllocType(Node->AllocTypes))
3922 return;
3923
3924#ifndef NDEBUG
3925 auto Insert =
3926#endif
3927 Visited.insert(Node);
3928 // We should not have visited this node yet.
3929 assert(Insert.second);
3930 // The recursive call to identifyClones may delete the current edge from the
3931 // CallerEdges vector. Make a copy and iterate on that, simpler than passing
3932 // in an iterator and having the recursive call erase from it. Other edges may
3933 // also get removed during the recursion, which will have null Callee and
3934 // Caller pointers (and are deleted later), so we skip those below.
3935 {
3936 auto CallerEdges = Node->CallerEdges;
3937 for (auto &Edge : CallerEdges) {
3938 // Skip any that have been removed by an earlier recursive call.
3939 if (Edge->isRemoved()) {
3940 assert(!is_contained(Node->CallerEdges, Edge));
3941 continue;
3942 }
3943 // Defer backedges. See comments further below where these edges are
3944 // handled during the cloning of this Node.
3945 if (Edge->IsBackedge) {
3946 // We should only mark these if cloning recursive contexts, where we
3947 // need to do this deferral.
3948 assert(CloneRecursiveContexts);
3949 continue;
3950 }
3951 // Ignore any caller we previously visited via another edge.
3952 if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
3953 identifyClones(Edge->Caller, Visited, AllocContextIds);
3954 }
3955 }
3956 }
3957
3958 // Check if we reached an unambiguous call or have only a single caller.
3959 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
3960 return;
3961
3962 // We need to clone.
3963
3964 // Try to keep the original version as alloc type NotCold. This will make
3965 // cases with indirect calls or any other situation with an unknown call to
3966 // the original function get the default behavior. We do this by sorting the
3967 // CallerEdges of the Node we will clone by alloc type.
3968 //
3969 // Give NotCold edge the lowest sort priority so those edges are at the end of
3970 // the caller edges vector, and stay on the original version (since the below
3971 // code clones greedily until it finds all remaining edges have the same type
3972 // and leaves the remaining ones on the original Node).
3973 //
3974 // We shouldn't actually have any None type edges, so the sorting priority for
3975 // that is arbitrary, and we assert in that case below.
3976 const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
3977 /*Cold*/ 1,
3978 /*NotColdCold*/ 2};
3979 llvm::stable_sort(Node->CallerEdges,
3980 [&](const std::shared_ptr<ContextEdge> &A,
3981 const std::shared_ptr<ContextEdge> &B) {
3982 // Nodes with non-empty context ids should be sorted
3983 // before those with empty context ids.
3984 if (A->ContextIds.empty())
3985 // Either B ContextIds are non-empty (in which case we
3986 // should return false because B < A), or B ContextIds
3987 // are empty, in which case they are equal, and we
3988 // should maintain the original relative ordering.
3989 return false;
3990 if (B->ContextIds.empty())
3991 return true;
3992
3993 if (A->AllocTypes == B->AllocTypes)
3994 // Use the first context id for each edge as a
3995 // tie-breaker.
3996 return *A->ContextIds.begin() < *B->ContextIds.begin();
3997 return AllocTypeCloningPriority[A->AllocTypes] <
3998 AllocTypeCloningPriority[B->AllocTypes];
3999 });
4000
4001 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
4002
4003 DenseSet<uint32_t> RecursiveContextIds;
4005 // If we are allowing recursive callsites, but have also disabled recursive
4006 // contexts, look for context ids that show up in multiple caller edges.
4007 if (AllowRecursiveCallsites && !AllowRecursiveContexts) {
4008 DenseSet<uint32_t> AllCallerContextIds;
4009 for (auto &CE : Node->CallerEdges) {
4010 // Reserve space for the largest set of caller context ids, since we know
4011 // the final set will be at least that large.
4012 AllCallerContextIds.reserve(CE->getContextIds().size());
4013 for (auto Id : CE->getContextIds())
4014 if (!AllCallerContextIds.insert(Id).second)
4015 RecursiveContextIds.insert(Id);
4016 }
4017 }
4018
4019 // Iterate until we find no more opportunities for disambiguating the alloc
4020 // types via cloning. In most cases this loop will terminate once the Node
4021 // has a single allocation type, in which case no more cloning is needed.
4022 // Iterate over a copy of Node's caller edges, since we may need to remove
4023 // edges in the moveEdgeTo* methods, and this simplifies the handling and
4024 // makes it less error-prone.
4025 auto CallerEdges = Node->CallerEdges;
4026 for (auto &CallerEdge : CallerEdges) {
4027 // Skip any that have been removed by an earlier recursive call.
4028 if (CallerEdge->isRemoved()) {
4029 assert(!is_contained(Node->CallerEdges, CallerEdge));
4030 continue;
4031 }
4032 assert(CallerEdge->Callee == Node);
4033
4034 // See if cloning the prior caller edge left this node with a single alloc
4035 // type or a single caller. In that case no more cloning of Node is needed.
4036 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
4037 break;
4038
4039 // If the caller was not successfully matched to a call in the IR/summary,
4040 // there is no point in trying to clone for it as we can't update that call.
4041 if (!CallerEdge->Caller->hasCall())
4042 continue;
4043
4044 // Only need to process the ids along this edge pertaining to the given
4045 // allocation.
4046 auto CallerEdgeContextsForAlloc =
4047 set_intersection(CallerEdge->getContextIds(), AllocContextIds);
4048 if (!RecursiveContextIds.empty())
4049 CallerEdgeContextsForAlloc =
4050 set_difference(CallerEdgeContextsForAlloc, RecursiveContextIds);
4051 if (CallerEdgeContextsForAlloc.empty())
4052 continue;
4053
4054 auto CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
4055
4056 // Compute the node callee edge alloc types corresponding to the context ids
4057 // for this caller edge.
4058 std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
4059 CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
4060 for (auto &CalleeEdge : Node->CalleeEdges)
4061 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
4062 CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
4063
4064 // Don't clone if doing so will not disambiguate any alloc types amongst
4065 // caller edges (including the callee edges that would be cloned).
4066 // Otherwise we will simply move all edges to the clone.
4067 //
4068 // First check if by cloning we will disambiguate the caller allocation
4069 // type from node's allocation type. Query allocTypeToUse so that we don't
4070 // bother cloning to distinguish NotCold+Cold from NotCold. Note that
4071 // neither of these should be None type.
4072 //
4073 // Then check if by cloning node at least one of the callee edges will be
4074 // disambiguated by splitting out different context ids.
4075 //
4076 // However, always do the cloning if this is a backedge, in which case we
4077 // have not yet cloned along this caller edge.
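// For example (illustrative): if the contexts on this caller edge for the
// current allocation are all NotCold and Node is NotCold+Cold (which
// allocTypeToUse treats the same as NotCold), and no callee edge would be
// split into different alloc types either, cloning would just move every
// edge onto the clone without separating cold from not-cold contexts, so
// it is skipped.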
4078 assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
4079 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
4080 if (!CallerEdge->IsBackedge &&
4081 allocTypeToUse(CallerAllocTypeForAlloc) ==
4082 allocTypeToUse(Node->AllocTypes) &&
4083 allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
4084 CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
4085 continue;
4086 }
4087
4088 if (CallerEdge->IsBackedge) {
4089 // We should only mark these if cloning recursive contexts, where we
4090 // need to do this deferral.
4091 assert(CloneRecursiveContexts);
4092 DeferredBackedges++;
4093 }
4094
4095 // If this is a backedge, we now do recursive cloning starting from its
4096 // caller since we may have moved unambiguous caller contexts to a clone
4097 // of this Node in a previous iteration of the current loop, giving more
4098 // opportunity for cloning through the backedge. Because we sorted the
4099 // caller edges earlier so that cold caller edges are first, we would have
4100 // visited and cloned this node for any unambiguously cold non-recursive
4101 // callers before any ambiguous backedge callers. Note that we don't do this
4102 // if the caller is already cloned or visited during cloning (e.g. via a
4103 // different context path from the allocation).
4104 // TODO: Can we do better in the case where the caller was already visited?
4105 if (CallerEdge->IsBackedge && !CallerEdge->Caller->CloneOf &&
4106 !Visited.count(CallerEdge->Caller)) {
4107 const auto OrigIdCount = CallerEdge->getContextIds().size();
4108 // Now do the recursive cloning of this backedge's caller, which was
4109 // deferred earlier.
4110 identifyClones(CallerEdge->Caller, Visited, CallerEdgeContextsForAlloc);
4111 removeNoneTypeCalleeEdges(CallerEdge->Caller);
4112 // See if the recursive call to identifyClones moved the context ids to a
4113 // new edge from this node to a clone of caller, and switch to looking at
4114 // that new edge so that we clone Node for the new caller clone.
4115 bool UpdatedEdge = false;
4116 if (OrigIdCount > CallerEdge->getContextIds().size()) {
4117 for (auto E : Node->CallerEdges) {
4118 // Only interested in clones of the current edge's caller.
4119 if (E->Caller->CloneOf != CallerEdge->Caller)
4120 continue;
4121 // See if this edge contains any of the context ids originally on the
4122 // current caller edge.
4123 auto CallerEdgeContextsForAllocNew =
4124 set_intersection(CallerEdgeContextsForAlloc, E->getContextIds());
4125 if (CallerEdgeContextsForAllocNew.empty())
4126 continue;
4127 // Make sure we don't pick a previously existing caller edge of this
4128 // Node, which would be processed on a different iteration of the
4129 // outer loop over the saved CallerEdges.
4130 if (llvm::is_contained(CallerEdges, E))
4131 continue;
4132 // The CallerAllocTypeForAlloc and CalleeEdgeAllocTypesForCallerEdge
4133 // are updated further below for all cases where we just invoked
4134 // identifyClones recursively.
4135 CallerEdgeContextsForAlloc.swap(CallerEdgeContextsForAllocNew);
4136 CallerEdge = E;
4137 UpdatedEdge = true;
4138 break;
4139 }
4140 }
4141 // If cloning removed this edge (and we didn't update it to a new edge
4142 // above), we're done with this edge. It's possible we moved all of the
4143 // context ids to an existing clone, in which case there's no need to do
4144 // further processing for them.
4145 if (CallerEdge->isRemoved())
4146 continue;
4147
4148 // Now we need to update the information used for the cloning decisions
4149 // further below, as we may have modified edges and their context ids.
4150
4151 // Note if we changed the CallerEdge above we would have already updated
4152 // the context ids.
4153 if (!UpdatedEdge) {
4154 CallerEdgeContextsForAlloc = set_intersection(
4155 CallerEdgeContextsForAlloc, CallerEdge->getContextIds());
4156 if (CallerEdgeContextsForAlloc.empty())
4157 continue;
4158 }
4159 // Update the other information that depends on the edges and on the now
4160 // updated CallerEdgeContextsForAlloc.
4161 CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
4162 CalleeEdgeAllocTypesForCallerEdge.clear();
4163 for (auto &CalleeEdge : Node->CalleeEdges) {
4164 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
4165 CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
4166 }
4167 }
4168
4169 // First see if we can use an existing clone. Check each clone and its
4170 // callee edges for matching alloc types.
4171 ContextNode *Clone = nullptr;
4172 for (auto *CurClone : Node->Clones) {
4173 if (allocTypeToUse(CurClone->AllocTypes) !=
4174 allocTypeToUse(CallerAllocTypeForAlloc))
4175 continue;
4176
4177 bool BothSingleAlloc = hasSingleAllocType(CurClone->AllocTypes) &&
4178 hasSingleAllocType(CallerAllocTypeForAlloc);
4179 // The above check should mean that if both have single alloc types then
4180 // they should be equal.
4181 assert(!BothSingleAlloc ||
4182 CurClone->AllocTypes == CallerAllocTypeForAlloc);
4183
4184 // If either both have a single alloc type (which are the same), or if the
4185 // clone's callee edges have the same alloc types as those for the current
4186 // allocation on Node's callee edges (CalleeEdgeAllocTypesForCallerEdge),
4187 // then we can reuse this clone.
4188 if (BothSingleAlloc || allocTypesMatchClone<DerivedCCG, FuncTy, CallTy>(
4189 CalleeEdgeAllocTypesForCallerEdge, CurClone)) {
4190 Clone = CurClone;
4191 break;
4192 }
4193 }
4194
4195 // The edge iterator is adjusted when we move the CallerEdge to the clone.
4196 if (Clone)
4197 moveEdgeToExistingCalleeClone(CallerEdge, Clone, /*NewClone=*/false,
4198 CallerEdgeContextsForAlloc);
4199 else
4200 Clone = moveEdgeToNewCalleeClone(CallerEdge, CallerEdgeContextsForAlloc);
4201
4202 // Sanity check that no alloc types on clone or its edges are None.
4203 assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
4204 }
4205
4206 // We should still have some context ids on the original Node.
4207 assert(!Node->emptyContextIds());
4208
4209 // Sanity check that no alloc types on node or edges are None.
4210 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
4211
4212 if (VerifyNodes)
4213 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
4214}
4215
4216void ModuleCallsiteContextGraph::updateAllocationCall(
4217 CallInfo &Call, AllocationType AllocType) {
4218 std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
4220 auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
4221 "memprof", AllocTypeString);
4222 cast<CallBase>(Call.call())->addFnAttr(A);
4223 OREGetter(Call.call()->getFunction())
4224 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
4225 << ore::NV("AllocationCall", Call.call()) << " in clone "
4226 << ore::NV("Caller", Call.call()->getFunction())
4227 << " marked with memprof allocation attribute "
4228 << ore::NV("Attribute", AllocTypeString));
4229}
4230
4231 void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
4232 AllocationType AllocType) {
4233 auto *AI = cast<AllocInfo *>(Call.call());
4234 assert(AI);
4235 assert(AI->Versions.size() > Call.cloneNo());
4236 AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
4237}
4238
4239 AllocationType
4240 ModuleCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
4241 const auto *CB = cast<CallBase>(Call.call());
4242 if (!CB->getAttributes().hasFnAttr("memprof"))
4243 return AllocationType::None;
4244 return CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
4245 ? AllocationType::Cold
4246 : AllocationType::NotCold;
4247}
4248
4249 AllocationType
4250 IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
4251 const auto *AI = cast<AllocInfo *>(Call.call());
4252 assert(AI->Versions.size() > Call.cloneNo());
4253 return (AllocationType)AI->Versions[Call.cloneNo()];
4254}
4255
4256void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
4257 FuncInfo CalleeFunc) {
4258 auto *CurF = getCalleeFunc(CallerCall.call());
4259 auto NewCalleeCloneNo = CalleeFunc.cloneNo();
4260 if (isMemProfClone(*CurF)) {
4261 // If we already assigned this callsite to call a specific non-default
4262 // clone (i.e. not the original function which is clone 0), ensure that we
4263 // aren't trying to now update it to call a different clone, which is
4264 // indicative of a bug in the graph or function assignment.
4265 auto CurCalleeCloneNo = getMemProfCloneNum(*CurF);
4266 if (CurCalleeCloneNo != NewCalleeCloneNo) {
4267 LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
4268 << CurCalleeCloneNo << " now " << NewCalleeCloneNo
4269 << "\n");
4270 MismatchedCloneAssignments++;
4271 }
4272 }
4273 if (NewCalleeCloneNo > 0)
4274 cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
4275 OREGetter(CallerCall.call()->getFunction())
4276 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
4277 << ore::NV("Call", CallerCall.call()) << " in clone "
4278 << ore::NV("Caller", CallerCall.call()->getFunction())
4279 << " assigned to call function clone "
4280 << ore::NV("Callee", CalleeFunc.func()));
4281}
4282
4283void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
4284 FuncInfo CalleeFunc) {
4285 auto *CI = cast<CallsiteInfo *>(CallerCall.call());
4286 assert(CI &&
4287 "Caller cannot be an allocation which should not have profiled calls");
4288 assert(CI->Clones.size() > CallerCall.cloneNo());
4289 auto NewCalleeCloneNo = CalleeFunc.cloneNo();
4290 auto &CurCalleeCloneNo = CI->Clones[CallerCall.cloneNo()];
4291 // If we already assigned this callsite to call a specific non-default
4292 // clone (i.e. not the original function which is clone 0), ensure that we
4293 // aren't trying to now update it to call a different clone, which is
4294 // indicative of a bug in the graph or function assignment.
4295 if (CurCalleeCloneNo != 0 && CurCalleeCloneNo != NewCalleeCloneNo) {
4296 LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
4297 << CurCalleeCloneNo << " now " << NewCalleeCloneNo
4298 << "\n");
4299 MismatchedCloneAssignments++;
4300 }
4301 CurCalleeCloneNo = NewCalleeCloneNo;
4302}
4303
4304 // Update the debug information attached to NewFunc to use the clone Name. Note
4305 // this needs to be done both for any existing DISubprogram for the definition
4306 // and for any separate declaration DISubprogram.
4307 static void updateSubprogramLinkageName(Function *NewFunc, StringRef Name) {
4308 assert(Name == NewFunc->getName());
4309 auto *SP = NewFunc->getSubprogram();
4310 if (!SP)
4311 return;
4312 auto *MDName = MDString::get(NewFunc->getParent()->getContext(), Name);
4313 SP->replaceLinkageName(MDName);
4314 DISubprogram *Decl = SP->getDeclaration();
4315 if (!Decl)
4316 return;
4317 TempDISubprogram NewDecl = Decl->clone();
4318 NewDecl->replaceLinkageName(MDName);
4319 SP->replaceDeclaration(MDNode::replaceWithUniqued(std::move(NewDecl)));
4320}
4321
4322CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
4323 Instruction *>::FuncInfo
4324ModuleCallsiteContextGraph::cloneFunctionForCallsite(
4325 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
4326 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
4327 // Use existing LLVM facilities for cloning and obtaining Call in clone
4328 ValueToValueMapTy VMap;
4329 auto *NewFunc = CloneFunction(Func.func(), VMap);
4330 std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
4331 assert(!Func.func()->getParent()->getFunction(Name));
4332 NewFunc->setName(Name);
4333 updateSubprogramLinkageName(NewFunc, Name);
4334 for (auto &Inst : CallsWithMetadataInFunc) {
4335 // This map always has the initial version in it.
4336 assert(Inst.cloneNo() == 0);
4337 CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
4338 }
4339 OREGetter(Func.func())
4340 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
4341 << "created clone " << ore::NV("NewFunction", NewFunc));
4342 return {NewFunc, CloneNo};
4343}
4344
4345CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
4346 IndexCall>::FuncInfo
4347IndexCallsiteContextGraph::cloneFunctionForCallsite(
4348 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
4349 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
4350 // Check how many clones we have of Call (and therefore function).
4351 // The next clone number is the current size of versions array.
4352 // Confirm this matches the CloneNo provided by the caller, which is based on
4353 // the number of function clones we have.
4354 assert(CloneNo == (isa<AllocInfo *>(Call.call())
4355 ? cast<AllocInfo *>(Call.call())->Versions.size()
4356 : cast<CallsiteInfo *>(Call.call())->Clones.size()));
4357 // Walk all the instructions in this function. Create a new version for
4358 // each (by adding an entry to the Versions/Clones summary array), and copy
4359 // over the version being called for the function clone being cloned here.
4360 // Additionally, add an entry to the CallMap for the new function clone,
4361 // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
4362 // to the new call clone.
4363 for (auto &Inst : CallsWithMetadataInFunc) {
4364 // This map always has the initial version in it.
4365 assert(Inst.cloneNo() == 0);
4366 if (auto *AI = dyn_cast<AllocInfo *>(Inst.call())) {
4367 assert(AI->Versions.size() == CloneNo);
4368 // We assign the allocation type later (in updateAllocationCall), just add
4369 // an entry for it here.
4370 AI->Versions.push_back(0);
4371 } else {
4372 auto *CI = cast<CallsiteInfo *>(Inst.call());
4373 assert(CI && CI->Clones.size() == CloneNo);
4374 // We assign the clone number later (in updateCall), just add an entry for
4375 // it here.
4376 CI->Clones.push_back(0);
4377 }
4378 CallMap[Inst] = {Inst.call(), CloneNo};
4379 }
4380 return {Func.func(), CloneNo};
4381}
4382
4383// We perform cloning for each allocation node separately. However, this
4384// sometimes results in a situation where the same node calls multiple
4385// clones of the same callee, created for different allocations. This
4386// causes issues when assigning functions to these clones, as each node can
4387// in reality only call a single callee clone.
4388//
4389// To address this, before assigning functions, merge callee clone nodes as
4390// needed using a post order traversal from the allocations. We attempt to
4391// use existing clones as the merge node when legal, and to share them
4392// among callers with the same properties (callers calling the same set of
4393// callee clone nodes for the same allocations).
4394//
4395// Without this fix, in some cases incorrect function assignment will lead
4396// to calling the wrong allocation clone.
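// For example (illustrative): suppose callsite node F ends up with callee
// edges to B' and B'', two clones of callee B created while cloning for
// different allocation nodes A1 and A2. A single function clone of F's
// function can only call one copy of B, so B' and B'' must first be merged
// into a single callee clone node (reusing one of them when legal) before
// function assignment.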
4397template <typename DerivedCCG, typename FuncTy, typename CallTy>
4398void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones() {
4399 if (!MergeClones)
4400 return;
4401
4402 // Generate a map from context id to the associated allocation node for use
4403 // when merging clones.
4404 DenseMap<uint32_t, ContextNode *> ContextIdToAllocationNode;
4405 for (auto &Entry : AllocationCallToContextNodeMap) {
4406 auto *Node = Entry.second;
4407 for (auto Id : Node->getContextIds())
4408 ContextIdToAllocationNode[Id] = Node->getOrigNode();
4409 for (auto *Clone : Node->Clones) {
4410 for (auto Id : Clone->getContextIds())
4411 ContextIdToAllocationNode[Id] = Clone->getOrigNode();
4412 }
4413 }
4414
4415 // Post order traversal starting from allocations to ensure each callsite
4416 // calls a single clone of its callee. Callee nodes that are clones of each
4417 // other are merged (via new merge nodes if needed) to achieve this.
4418 DenseSet<const ContextNode *> Visited;
4419 for (auto &Entry : AllocationCallToContextNodeMap) {
4420 auto *Node = Entry.second;
4421
4422 mergeClones(Node, Visited, ContextIdToAllocationNode);
4423
4424 // Make a copy so the recursive post order traversal that may create new
4425 // clones doesn't mess up iteration. Note that the recursive traversal
4426 // itself does not call mergeClones on any of these nodes, which are all
4427 // (clones of) allocations.
4428 auto Clones = Node->Clones;
4429 for (auto *Clone : Clones)
4430 mergeClones(Clone, Visited, ContextIdToAllocationNode);
4431 }
4432
4433 if (DumpCCG) {
4434 dbgs() << "CCG after merging:\n";
4435 dbgs() << *this;
4436 }
4437 if (ExportToDot)
4438 exportToDot("aftermerge");
4439
4440 if (VerifyCCG) {
4441 check();
4442 }
4443}
4444
4445// Recursive helper for above mergeClones method.
4446template <typename DerivedCCG, typename FuncTy, typename CallTy>
4447void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones(
4448 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
4449 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
4450 auto Inserted = Visited.insert(Node);
4451 if (!Inserted.second)
4452 return;
4453
4454 // Iteratively perform merging on this node to handle new caller nodes created
4455 // during the recursive traversal. We could do something more elegant such as
4456 // maintain a worklist, but this is a simple approach that doesn't cause a
4457 // measureable compile time effect, as most nodes don't have many caller
4458 // edges to check.
4459 bool FoundUnvisited = true;
4460 unsigned Iters = 0;
4461 while (FoundUnvisited) {
4462 Iters++;
4463 FoundUnvisited = false;
4464 // Make a copy since the recursive call may move a caller edge to a new
4465 // callee, messing up the iterator.
4466 auto CallerEdges = Node->CallerEdges;
4467 for (auto CallerEdge : CallerEdges) {
4468 // Skip any caller edge moved onto a different callee during recursion.
4469 if (CallerEdge->Callee != Node)
4470 continue;
4471 // If we found an unvisited caller, note that we should check the caller
4472 // edges again as mergeClones may add or change caller nodes.
4473 if (DoMergeIteration && !Visited.contains(CallerEdge->Caller))
4474 FoundUnvisited = true;
4475 mergeClones(CallerEdge->Caller, Visited, ContextIdToAllocationNode);
4476 }
4477 }
4478
4479 TotalMergeInvokes++;
4480 TotalMergeIters += Iters;
4481 if (Iters > MaxMergeIters)
4482 MaxMergeIters = Iters;
4483
4484 // Merge for this node after we handle its callers.
4485 mergeNodeCalleeClones(Node, Visited, ContextIdToAllocationNode);
4486}
4487
4488template <typename DerivedCCG, typename FuncTy, typename CallTy>
4489void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeNodeCalleeClones(
4490 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
4491 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
4492 // Ignore Node if we moved all of its contexts to clones.
4493 if (Node->emptyContextIds())
4494 return;
4495
4496 // First identify groups of clones among Node's callee edges, by building
4497 // a map from each callee base node to the associated callee edges from Node.
4498 MapVector<ContextNode *, std::vector<std::shared_ptr<ContextEdge>>>
4499 OrigNodeToCloneEdges;
4500 for (const auto &E : Node->CalleeEdges) {
4501 auto *Callee = E->Callee;
4502 if (!Callee->CloneOf && Callee->Clones.empty())
4503 continue;
4504 ContextNode *Base = Callee->getOrigNode();
4505 OrigNodeToCloneEdges[Base].push_back(E);
4506 }
4507
4508 // Helper for callee edge sorting below. Return true if A's callee has fewer
4509 // caller edges than B, or if A is a clone and B is not, or if A's first
4510 // context id is smaller than B's.
4511 auto CalleeCallerEdgeLessThan = [](const std::shared_ptr<ContextEdge> &A,
4512 const std::shared_ptr<ContextEdge> &B) {
4513 if (A->Callee->CallerEdges.size() != B->Callee->CallerEdges.size())
4514 return A->Callee->CallerEdges.size() < B->Callee->CallerEdges.size();
4515 if (A->Callee->CloneOf && !B->Callee->CloneOf)
4516 return true;
4517 else if (!A->Callee->CloneOf && B->Callee->CloneOf)
4518 return false;
4519 // Use the first context id for each edge as a
4520 // tie-breaker.
4521 return *A->ContextIds.begin() < *B->ContextIds.begin();
4522 };
4523
4524 // Process each set of callee clones called by Node, performing the needed
4525 // merging.
4526 for (auto Entry : OrigNodeToCloneEdges) {
4527 // CalleeEdges is the set of edges from Node reaching callees that are
4528 // mutual clones of each other.
4529 auto &CalleeEdges = Entry.second;
4530 auto NumCalleeClones = CalleeEdges.size();
4531 // A single edge means there is no merging needed.
4532 if (NumCalleeClones == 1)
4533 continue;
4534 // Sort the CalleeEdges calling this group of clones in ascending order of
4535 // their caller edge counts, putting the original non-clone node first in
4536 // cases of a tie. This simplifies finding an existing node to use as the
4537 // merge node.
4538 llvm::stable_sort(CalleeEdges, CalleeCallerEdgeLessThan);
4539
4540 /// Find other callers of the given set of callee edges that can
4541 /// share the same callee merge node. See the comments at this method
4542 /// definition for details.
4543 DenseSet<ContextNode *> OtherCallersToShareMerge;
4544 findOtherCallersToShareMerge(Node, CalleeEdges, ContextIdToAllocationNode,
4545 OtherCallersToShareMerge);
4546
4547 // Now do the actual merging. Identify existing or create a new MergeNode
4548 // during the first iteration. Move each callee over, along with edges from
4549 // other callers we've determined above can share the same merge node.
4550 ContextNode *MergeNode = nullptr;
4551 DenseMap<ContextNode *, unsigned> CallerToMoveCount;
4552 for (auto CalleeEdge : CalleeEdges) {
4553 auto *OrigCallee = CalleeEdge->Callee;
4554 // If we don't have a MergeNode yet (only happens on the first iteration,
4555 // as a new one will be created when we go to move the first callee edge
4556 // over as needed), see if we can use this callee.
4557 if (!MergeNode) {
4558 // If there are no other callers, simply use this callee.
4559 if (CalleeEdge->Callee->CallerEdges.size() == 1) {
4560 MergeNode = OrigCallee;
4561 NonNewMergedNodes++;
4562 continue;
4563 }
4564 // Otherwise, if we have identified other caller nodes that can share
4565 // the merge node with Node, see if all of OrigCallee's callers are
4566 // going to share the same merge node. In that case we can use callee
4567 // (since all of its callers would move to the new merge node).
4568 if (!OtherCallersToShareMerge.empty()) {
4569 bool MoveAllCallerEdges = true;
4570 for (auto CalleeCallerE : OrigCallee->CallerEdges) {
4571 if (CalleeCallerE == CalleeEdge)
4572 continue;
4573 if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller)) {
4574 MoveAllCallerEdges = false;
4575 break;
4576 }
4577 }
4578 // If we are going to move all callers over, we can use this callee as
4579 // the MergeNode.
4580 if (MoveAllCallerEdges) {
4581 MergeNode = OrigCallee;
4582 NonNewMergedNodes++;
4583 continue;
4584 }
4585 }
4586 }
4587 // Move this callee edge, creating a new merge node if necessary.
4588 if (MergeNode) {
4589 assert(MergeNode != OrigCallee);
4590 moveEdgeToExistingCalleeClone(CalleeEdge, MergeNode,
4591 /*NewClone*/ false);
4592 } else {
4593 MergeNode = moveEdgeToNewCalleeClone(CalleeEdge);
4594 NewMergedNodes++;
4595 }
4596 // Now move all identified edges from other callers over to the merge node
4597 // as well.
4598 if (!OtherCallersToShareMerge.empty()) {
4599 // Make and iterate over a copy of OrigCallee's caller edges because
4600 // some of these will be moved off of the OrigCallee and that would mess
4601 // up the iteration from OrigCallee.
4602 auto OrigCalleeCallerEdges = OrigCallee->CallerEdges;
4603 for (auto &CalleeCallerE : OrigCalleeCallerEdges) {
4604 if (CalleeCallerE == CalleeEdge)
4605 continue;
4606 if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller))
4607 continue;
4608 CallerToMoveCount[CalleeCallerE->Caller]++;
4609 moveEdgeToExistingCalleeClone(CalleeCallerE, MergeNode,
4610 /*NewClone*/ false);
4611 }
4612 }
4613 removeNoneTypeCalleeEdges(OrigCallee);
4614 removeNoneTypeCalleeEdges(MergeNode);
4615 }
4616 }
4617}
4618
4619// Look for other nodes that have edges to the same set of callee
4620// clones as the current Node. Those can share the eventual merge node
4621// (reducing cloning and binary size overhead) iff:
4622// - they have edges to the same set of callee clones
4623// - each callee edge reaches a subset of the same allocations as Node's
4624// corresponding edge to the same callee clone.
4625// The second requirement is to ensure that we don't undo any of the
4626// necessary cloning to distinguish contexts with different allocation
4627// behavior.
4628// FIXME: This is somewhat conservative, as we really just need to ensure
4629// that they don't reach the same allocations as contexts on edges from Node
4630// going to any of the *other* callee clones being merged. However, that
4631// requires more tracking and checking to get right.
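// For example (illustrative): if another caller node Y has edges to the same
// two callee clones being merged below Node, and each of Y's edges only
// reaches allocations also reached by Node's corresponding edge to that
// callee clone, then Y's edges can be redirected to the same merge node
// instead of forcing a separate merge node (and thus extra cloning) for Y.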
4632template <typename DerivedCCG, typename FuncTy, typename CallTy>
4633void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
4634 findOtherCallersToShareMerge(
4635 ContextNode *Node,
4636 std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges,
4637 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode,
4638 DenseSet<ContextNode *> &OtherCallersToShareMerge) {
4639 auto NumCalleeClones = CalleeEdges.size();
4640 // This map counts how many edges to the same callee clone exist for other
4641 // caller nodes of each callee clone.
4642 DenseMap<ContextNode *, unsigned> OtherCallersToSharedCalleeEdgeCount;
4643 // Counts the number of other caller nodes that have edges to all callee
4644 // clones that don't violate the allocation context checking.
4645 unsigned PossibleOtherCallerNodes = 0;
4646
4647 // We only need to look at other Caller nodes if the first callee edge has
4648 // multiple callers (recall they are sorted in ascending order above).
4649 if (CalleeEdges[0]->Callee->CallerEdges.size() < 2)
4650 return;
4651
4652 // For each callee edge:
4653 // - Collect the count of other caller nodes calling the same callees.
4654 // - Collect the alloc nodes reached by contexts on each callee edge.
4655 DenseMap<ContextEdge *, DenseSet<ContextNode *>> CalleeEdgeToAllocNodes;
4656 for (auto CalleeEdge : CalleeEdges) {
4657 assert(CalleeEdge->Callee->CallerEdges.size() > 1);
4658 // For each other caller of the same callee, increment the count of
4659 // edges reaching the same callee clone.
4660 for (auto CalleeCallerEdges : CalleeEdge->Callee->CallerEdges) {
4661 if (CalleeCallerEdges->Caller == Node) {
4662 assert(CalleeCallerEdges == CalleeEdge);
4663 continue;
4664 }
4665 OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller]++;
4666 // If this caller edge now reaches all of the same callee clones,
4667 // increment the count of candidate other caller nodes.
4668 if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller] ==
4669 NumCalleeClones)
4670 PossibleOtherCallerNodes++;
4671 }
4672 // Collect the alloc nodes reached by contexts on each callee edge, for
4673 // later analysis.
4674 for (auto Id : CalleeEdge->getContextIds()) {
4675 auto *Alloc = ContextIdToAllocationNode.lookup(Id);
4676 if (!Alloc) {
4677 // FIXME: unclear why this happens occasionally, presumably
4678 // imperfect graph updates possibly with recursion.
4679 MissingAllocForContextId++;
4680 continue;
4681 }
4682 CalleeEdgeToAllocNodes[CalleeEdge.get()].insert(Alloc);
4683 }
4684 }
4685
4686 // Now walk the callee edges again, and make sure that for each candidate
4687 // caller node all of its edges to the callees reach the same allocs (or
4688 // a subset) as those along the corresponding callee edge from Node.
4689 for (auto CalleeEdge : CalleeEdges) {
4690 assert(CalleeEdge->Callee->CallerEdges.size() > 1);
4691 // Stop if we do not have any (more) candidate other caller nodes.
4692 if (!PossibleOtherCallerNodes)
4693 break;
4694 auto &CurCalleeAllocNodes = CalleeEdgeToAllocNodes[CalleeEdge.get()];
4695 // Check each other caller of this callee clone.
4696 for (auto &CalleeCallerE : CalleeEdge->Callee->CallerEdges) {
4697 // Not interested in the callee edge from Node itself.
4698 if (CalleeCallerE == CalleeEdge)
4699 continue;
4700 // Skip any callers that didn't have callee edges to all the same
4701 // callee clones.
4702 if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] !=
4703 NumCalleeClones)
4704 continue;
4705 // Make sure that each context along the edge from the candidate caller node
4706 // reaches an allocation also reached by this callee edge from Node.
4707 for (auto Id : CalleeCallerE->getContextIds()) {
4708 auto *Alloc = ContextIdToAllocationNode.lookup(Id);
4709 if (!Alloc)
4710 continue;
4711 // If not, simply reset the map entry to 0 so caller is ignored, and
4712 // reduce the count of candidate other caller nodes.
4713 if (!CurCalleeAllocNodes.contains(Alloc)) {
4714 OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] = 0;
4715 PossibleOtherCallerNodes--;
4716 break;
4717 }
4718 }
4719 }
4720 }
4721
4722 if (!PossibleOtherCallerNodes)
4723 return;
4724
4725 // Build the set of other caller nodes that can use the same callee merge
4726 // node.
4727 for (auto &[OtherCaller, Count] : OtherCallersToSharedCalleeEdgeCount) {
4728 if (Count != NumCalleeClones)
4729 continue;
4730 OtherCallersToShareMerge.insert(OtherCaller);
4731 }
4732}
4733
4734// This method assigns cloned callsites to functions, cloning the functions as
4735// needed. The assignment is greedy and proceeds roughly as follows:
4736//
4737// For each function Func:
4738// For each call with graph Node having clones:
4739// Initialize ClonesWorklist to Node and its clones
4740// Initialize NodeCloneCount to 0
4741// While ClonesWorklist is not empty:
4742// Clone = pop front ClonesWorklist
4743// NodeCloneCount++
4744// If Func has been cloned less than NodeCloneCount times:
4745// If NodeCloneCount is 1:
4746// Assign Clone to original Func
4747// Continue
4748// Create a new function clone
4749// If other callers not assigned to call a function clone yet:
4750// Assign them to call new function clone
4751// Continue
4752// Assign any other caller calling the cloned version to new clone
4753//
4754// For each caller of Clone:
4755// If caller is assigned to call a specific function clone:
4756// If we cannot assign Clone to that function clone:
4757// Create new callsite Clone NewClone
4758// Add NewClone to ClonesWorklist
4759// Continue
4760// Assign Clone to existing caller's called function clone
4761// Else:
4762// If Clone not already assigned to a function clone:
4763// Assign to first function clone without assignment
4764// Assign caller to selected function clone
4765// For each call with graph Node having clones:
4766// If number func clones > number call's callsite Node clones:
4767// Record func CallInfo clones without Node clone in UnassignedCallClones
4768// For callsite Nodes in DFS order from allocations:
4769// If IsAllocation:
4770// Update allocation with alloc type
4771// Else:
4772 // For Call, all MatchingCalls, and associated UnassignedCallClones:
4773// Update call to call recorded callee clone
4774//
4775template <typename DerivedCCG, typename FuncTy, typename CallTy>
4776bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
4777 bool Changed = false;
4778
4779 mergeClones();
4780
4781 // Keep track of the assignment of nodes (callsites) to function clones they
4782 // call.
4783 DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
4784
4785 // Update caller node to call function version CalleeFunc, by recording the
4786 // assignment in CallsiteToCalleeFuncCloneMap.
4787 auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
4788 const FuncInfo &CalleeFunc) {
4789 assert(Caller->hasCall());
4790 CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
4791 };
4792
4793 // Information for a single clone of this Func.
4794 struct FuncCloneInfo {
4795 // The function clone.
4796 FuncInfo FuncClone;
4797 // Remappings of each call of interest (from original uncloned call to the
4798 // corresponding cloned call in this function clone).
4799 DenseMap<CallInfo, CallInfo> CallMap;
4800 };
4801
4802 // Map to keep track of information needed to update calls in function clones
4803 // when their corresponding callsite node was not itself cloned for that
4804 // function clone. Because of call context pruning (i.e. we only keep as much
4805 // caller information as needed to distinguish hot vs cold), we may not have
4806 // caller edges coming to each callsite node from all possible function
4807 // callers. A function clone may get created for other callsites in the
4808 // function for which there are caller edges that were not pruned. Any other
4809 // callsites in that function clone, which were not themselves cloned for
4810 // that function clone, should get updated the same way as the corresponding
4811 // callsite in the original function (which may call a clone of its callee).
4812 //
4813 // We build this map after completing function cloning for each function, so
4814 // that we can record the information from its call maps before they are
4815 // destructed. The map will be used as we update calls to update any still
4816 // unassigned call clones. Note that we may create new node clones as we clone
4817 // other functions, so later on we check which node clones were still not
4818 // created. To this end, the inner map is a map from function clone number to
4819 // the list of calls cloned for that function (can be more than one due to the
4820 // Node's MatchingCalls array).
4821 //
4822 // The alternative is creating new callsite clone nodes below as we clone the
4823 // function, but that is trickier to get right and likely more overhead.
4824 //
4825 // The inner map is a std::map, so it is sorted by key (clone number), in order
4826 // to get ordered remarks in the full LTO case.
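// For example (illustrative): if Func is cloned as Func.memprof.1 because a
// different callsite in Func required it, a callsite node in Func that was
// not itself cloned still has a call copy in Func.memprof.1; that copy is
// recorded here so it can later be updated to call the same callee clone as
// the original callsite node.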
4827 DenseMap<const ContextNode *, std::map<unsigned, SmallVector<CallInfo, 0>>>
4828 UnassignedCallClones;
4829
4830 // Walk all functions for which we saw calls with memprof metadata, and handle
4831 // cloning for each of its calls.
4832 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
4833 FuncInfo OrigFunc(Func);
4834 // Map from each clone number of OrigFunc to information about that function
4835 // clone (the function clone FuncInfo and call remappings). The index into
4836 // the vector is the clone number, as function clones are created and
4837 // numbered sequentially.
4838 std::vector<FuncCloneInfo> FuncCloneInfos;
4839 for (auto &Call : CallsWithMetadata) {
4840 ContextNode *Node = getNodeForInst(Call);
4841 // Skip call if we do not have a node for it (all uses of its stack ids
4842 // were either on inlined chains or pruned from the MIBs), or if we did
4843 // not create any clones for it.
4844 if (!Node || Node->Clones.empty())
4845 continue;
4846 assert(Node->hasCall() &&
4847 "Not having a call should have prevented cloning");
4848
4849 // Track the assignment of function clones to clones of the current
4850 // callsite Node being handled.
4851 std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
4852
4853 // Assign callsite version CallsiteClone to function version FuncClone,
4854 // and also assign (possibly cloned) Call to CallsiteClone.
4855 auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
4856 CallInfo &Call,
4857 ContextNode *CallsiteClone,
4858 bool IsAlloc) {
4859 // Record the clone of callsite node assigned to this function clone.
4860 FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
4861
4862 assert(FuncCloneInfos.size() > FuncClone.cloneNo());
4863 DenseMap<CallInfo, CallInfo> &CallMap =
4864 FuncCloneInfos[FuncClone.cloneNo()].CallMap;
4865 CallInfo CallClone(Call);
4866 if (auto It = CallMap.find(Call); It != CallMap.end())
4867 CallClone = It->second;
4868 CallsiteClone->setCall(CallClone);
4869 // Need to do the same for all matching calls.
4870 for (auto &MatchingCall : Node->MatchingCalls) {
4871 CallInfo CallClone(MatchingCall);
4872 if (auto It = CallMap.find(MatchingCall); It != CallMap.end())
4873 CallClone = It->second;
4874 // Updates the call in the list.
4875 MatchingCall = CallClone;
4876 }
4877 };
4878
4879 // Invokes moveEdgeToNewCalleeClone which creates a new clone, and then
4880 // performs the necessary fixups (removing none type edges, and
4881 // importantly, propagating any function call assignment of the original
4882 // node to the new clone).
4883 auto MoveEdgeToNewCalleeCloneAndSetUp =
4884 [&](const std::shared_ptr<ContextEdge> &Edge) {
4885 ContextNode *OrigCallee = Edge->Callee;
4886 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge);
4887 removeNoneTypeCalleeEdges(NewClone);
4888 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
4889 // If the original Callee was already assigned to call a specific
4890 // function version, make sure its new clone is assigned to call
4891 // that same function clone.
4892 if (CallsiteToCalleeFuncCloneMap.count(OrigCallee))
4893 RecordCalleeFuncOfCallsite(
4894 NewClone, CallsiteToCalleeFuncCloneMap[OrigCallee]);
4895 return NewClone;
4896 };
4897
4898 // Keep track of the clones of callsite Node that need to be assigned to
4899 // function clones. This list may be expanded in the loop body below if we
4900 // find additional cloning is required.
4901 std::deque<ContextNode *> ClonesWorklist;
4902 // Ignore original Node if we moved all of its contexts to clones.
4903 if (!Node->emptyContextIds())
4904 ClonesWorklist.push_back(Node);
4905 llvm::append_range(ClonesWorklist, Node->Clones);
4906
4907 // Now walk through all of the clones of this callsite Node that we need,
4908 // and determine the assignment to a corresponding clone of the current
4909 // function (creating new function clones as needed).
4910 unsigned NodeCloneCount = 0;
4911 while (!ClonesWorklist.empty()) {
4912 ContextNode *Clone = ClonesWorklist.front();
4913 ClonesWorklist.pop_front();
4914 NodeCloneCount++;
4915 if (VerifyNodes)
4916 checkNode<DerivedCCG, FuncTy, CallTy>(Clone, /*CheckEdges=*/false);
4917
4918 // Need to create a new function clone if we have more callsite clones
4919 // than existing function clones, which would have been assigned to an
4920 // earlier clone in the list (we assign callsite clones to function
4921 // clones greedily).
4922 if (FuncCloneInfos.size() < NodeCloneCount) {
4923 // If this is the first callsite copy, assign to original function.
4924 if (NodeCloneCount == 1) {
4925 // Since FuncCloneInfos is empty in this case, no clones have
4926 // been created for this function yet, and no callers should have
4927 // been assigned a function clone for this callee node yet.
4928 assert(llvm::none_of(
4929 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
4930 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
4931 }));
4932 // Initialize with empty call map, assign Clone to original function
4933 // and its callers, and skip to the next clone.
4934 FuncCloneInfos.push_back(
4935 {OrigFunc, DenseMap<CallInfo, CallInfo>()});
4936 AssignCallsiteCloneToFuncClone(
4937 OrigFunc, Call, Clone,
4938 AllocationCallToContextNodeMap.count(Call));
4939 for (auto &CE : Clone->CallerEdges) {
4940 // Ignore any caller that does not have a recorded callsite Call.
4941 if (!CE->Caller->hasCall())
4942 continue;
4943 RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
4944 }
4945 continue;
4946 }
4947
4948 // First locate which copy of OrigFunc to clone again. If a caller
4949 // of this callsite clone was already assigned to call a particular
4950 // function clone, we need to redirect all of those callers to the
4951 // new function clone, and update their other callees within this
4952 // function.
4953 FuncInfo PreviousAssignedFuncClone;
4954 auto EI = llvm::find_if(
4955 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
4956 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
4957 });
4958 bool CallerAssignedToCloneOfFunc = false;
4959 if (EI != Clone->CallerEdges.end()) {
4960 const std::shared_ptr<ContextEdge> &Edge = *EI;
4961 PreviousAssignedFuncClone =
4962 CallsiteToCalleeFuncCloneMap[Edge->Caller];
4963 CallerAssignedToCloneOfFunc = true;
4964 }
4965
4966 // Clone function and save it along with the CallInfo map created
4967 // during cloning in the FuncCloneInfos.
4968 DenseMap<CallInfo, CallInfo> NewCallMap;
4969 unsigned CloneNo = FuncCloneInfos.size();
4970 assert(CloneNo > 0 && "Clone 0 is the original function, which "
4971 "should already exist in the map");
4972 FuncInfo NewFuncClone = cloneFunctionForCallsite(
4973 OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
4974 FuncCloneInfos.push_back({NewFuncClone, std::move(NewCallMap)});
4975 FunctionClonesAnalysis++;
4976 Changed = true;
4977
4978 // If no caller callsites were already assigned to a clone of this
4979 // function, we can simply assign this clone to the new func clone
4980 // and update all callers to it, then skip to the next clone.
4981 if (!CallerAssignedToCloneOfFunc) {
4982 AssignCallsiteCloneToFuncClone(
4983 NewFuncClone, Call, Clone,
4984 AllocationCallToContextNodeMap.count(Call));
4985 for (auto &CE : Clone->CallerEdges) {
4986 // Ignore any caller that does not have a recorded callsite Call.
4987 if (!CE->Caller->hasCall())
4988 continue;
4989 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
4990 }
4991 continue;
4992 }
4993
4994 // We may need to do additional node cloning in this case.
4995 // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
4996 // that were previously assigned to call PreviousAssignedFuncClone,
4997 // to record that they now call NewFuncClone.
4998 // The none type edge removal may remove some of this Clone's caller
4999 // edges, if it is reached via another of its caller's callees.
5000 // Iterate over a copy and skip any that were removed.
5001 auto CallerEdges = Clone->CallerEdges;
5002 for (auto CE : CallerEdges) {
5003 // Skip any that have been removed on an earlier iteration.
5004 if (CE->isRemoved()) {
5005 assert(!is_contained(Clone->CallerEdges, CE));
5006 continue;
5007 }
5008 assert(CE);
5009 // Ignore any caller that does not have a recorded callsite Call.
5010 if (!CE->Caller->hasCall())
5011 continue;
5012
5013 if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
5014 // We subsequently fall through to later handling that
5015 // will perform any additional cloning required for
5016 // callers that were calling other function clones.
5017 CallsiteToCalleeFuncCloneMap[CE->Caller] !=
5018 PreviousAssignedFuncClone)
5019 continue;
5020
5021 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
5022
5023 // If we are cloning a function that was already assigned to some
5024 // callers, then essentially we are creating new callsite clones
5025 // of the other callsites in that function that are reached by those
5026 // callers. Clone the other callees of the current callsite's caller
5027 // that were already assigned to PreviousAssignedFuncClone
5028 // accordingly. This is important since we subsequently update the
5029 // calls from the nodes in the graph and their assignments to callee
5030 // functions recorded in CallsiteToCalleeFuncCloneMap.
5031 // The none type edge removal may remove some of this caller's
5032 // callee edges, if it is reached via another of its callees.
5033 // Iterate over a copy and skip any that were removed.
5034 auto CalleeEdges = CE->Caller->CalleeEdges;
5035 for (auto CalleeEdge : CalleeEdges) {
5036 // Skip any that have been removed on an earlier iteration when
5037 // cleaning up newly None type callee edges.
5038 if (CalleeEdge->isRemoved()) {
5039 assert(!is_contained(CE->Caller->CalleeEdges, CalleeEdge));
5040 continue;
5041 }
5042 assert(CalleeEdge);
5043 ContextNode *Callee = CalleeEdge->Callee;
5044 // Skip the current callsite; we are looking for other
5045 // callsites the Caller calls, as well as any that do not have a
5046 // recorded callsite Call.
5047 if (Callee == Clone || !Callee->hasCall())
5048 continue;
5049 // Skip direct recursive calls. We don't need/want to clone the
5050 // caller node again, and this loop will not behave as expected if
5051 // we tried.
5052 if (Callee == CalleeEdge->Caller)
5053 continue;
5054 ContextNode *NewClone =
5055 MoveEdgeToNewCalleeCloneAndSetUp(CalleeEdge);
5056 // Moving the edge may have resulted in some none type
5057 // callee edges on the original Callee.
5058 removeNoneTypeCalleeEdges(Callee);
5059 // Update NewClone with the new Call clone of this callsite's Call
5060 // created for the new function clone created earlier.
5061 // Recall that we have already ensured when building the graph
5062 // that each caller can only call callsites within the same
5063 // function, so we are guaranteed that Callee Call is in the
5064 // current OrigFunc.
5065 // CallMap is set up as indexed by original Call at clone 0.
5066 CallInfo OrigCall(Callee->getOrigNode()->Call);
5067 OrigCall.setCloneNo(0);
5068 DenseMap<CallInfo, CallInfo> &CallMap =
5069 FuncCloneInfos[NewFuncClone.cloneNo()].CallMap;
5070 assert(CallMap.count(OrigCall));
5071 CallInfo NewCall(CallMap[OrigCall]);
5072 assert(NewCall);
5073 NewClone->setCall(NewCall);
5074 // Need to do the same for all matching calls.
5075 for (auto &MatchingCall : NewClone->MatchingCalls) {
5076 CallInfo OrigMatchingCall(MatchingCall);
5077 OrigMatchingCall.setCloneNo(0);
5078 assert(CallMap.count(OrigMatchingCall));
5079 CallInfo NewCall(CallMap[OrigMatchingCall]);
5080 assert(NewCall);
5081 // Updates the call in the list.
5082 MatchingCall = NewCall;
5083 }
5084 }
5085 }
5086 // Fall through to handling below to perform the recording of the
5087 // function for this callsite clone. This enables handling of cases
5088 // where the callers were assigned to different clones of a function.
5089 }
5090
5091 auto FindFirstAvailFuncClone = [&]() {
5092 // Find first function in FuncCloneInfos without an assigned
5093 // clone of this callsite Node. We should always have one
5094 // available at this point due to the earlier cloning when the
5095 // FuncCloneInfos size was smaller than the clone number.
5096 for (auto &CF : FuncCloneInfos) {
5097 if (!FuncCloneToCurNodeCloneMap.count(CF.FuncClone))
5098 return CF.FuncClone;
5099 }
5100 llvm_unreachable(
5101 "Expected an available func clone for this callsite clone");
5102 };
5103
5104 // See if we can use existing function clone. Walk through
5105 // all caller edges to see if any have already been assigned to
5106 // a clone of this callsite's function. If we can use it, do so. If not,
5107 // because that function clone is already assigned to a different clone
5108 // of this callsite, then we need to clone again.
5109 // Basically, this checking is needed to handle the case where different
5110 // caller functions/callsites may need versions of this function
5111 // containing different mixes of callsite clones across the different
5112 // callsites within the function. If that happens, we need to create
5113 // additional function clones to handle the various combinations.
5114 //
5115 // Keep track of any new clones of this callsite created by the
5116 // following loop, as well as any existing clone that we decided to
5117 // assign this clone to.
5118 std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
5119 FuncInfo FuncCloneAssignedToCurCallsiteClone;
5120 // Iterate over a copy of Clone's caller edges, since we may need to
5121 // remove edges in the moveEdgeTo* methods, and this simplifies the
5122 // handling and makes it less error-prone.
5123 auto CloneCallerEdges = Clone->CallerEdges;
5124 for (auto &Edge : CloneCallerEdges) {
5125 // Skip removed edges (due to direct recursive edges updated when
5126 // updating callee edges when moving an edge and subsequently
5127 // removed by call to removeNoneTypeCalleeEdges on the Clone).
5128 if (Edge->isRemoved())
5129 continue;
5130 // Ignore any caller that does not have a recorded callsite Call.
5131 if (!Edge->Caller->hasCall())
5132 continue;
5133 // If this caller is already assigned to call a version of OrigFunc, we need
5134 // to ensure we can assign this callsite clone to that function clone.
5135 if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
5136 FuncInfo FuncCloneCalledByCaller =
5137 CallsiteToCalleeFuncCloneMap[Edge->Caller];
5138 // First we need to confirm that this function clone is available
5139 // for use by this callsite node clone.
5140 //
5141 // While FuncCloneToCurNodeCloneMap is built only for this Node and
5142 // its callsite clones, one of those callsite clones X could have
5143 // been assigned to the same function clone called by Edge's caller
5144 // - if Edge's caller calls another callsite within Node's original
5145 // function, and that callsite has another caller reaching clone X.
5146 // We need to clone Node again in this case.
5147 if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
5148 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
5149 Clone) ||
5150 // Detect when we have multiple callers of this callsite that
5151 // have already been assigned to specific, and different, clones
5152 // of OrigFunc (due to other unrelated callsites in Func they
5153 // reach via call contexts). Is this Clone of callsite Node
5154 // assigned to a different clone of OrigFunc? If so, clone Node
5155 // again.
5156 (FuncCloneAssignedToCurCallsiteClone &&
5157 FuncCloneAssignedToCurCallsiteClone !=
5158 FuncCloneCalledByCaller)) {
5159 // We need to use a different newly created callsite clone, in
5160 // order to assign it to another new function clone on a
5161 // subsequent iteration over the Clones array (adjusted below).
5162 // Note we specifically do not reset the
5163 // CallsiteToCalleeFuncCloneMap entry for this caller, so that
5164 // when this new clone is processed later we know which version of
5165 // the function to copy (so that other callsite clones we have
5166 // assigned to that function clone are properly cloned over). See
5167 // comments in the function cloning handling earlier.
5168
5169 // Check if we already have cloned this callsite again while
5170 // walking through caller edges, for a caller calling the same
5171 // function clone. If so, we can move this edge to that new clone
5172 // rather than creating yet another new clone.
5173 if (FuncCloneToNewCallsiteCloneMap.count(
5174 FuncCloneCalledByCaller)) {
5175 ContextNode *NewClone =
5176 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
5177 moveEdgeToExistingCalleeClone(Edge, NewClone);
5178 // Cleanup any none type edges cloned over.
5179 removeNoneTypeCalleeEdges(NewClone);
5180 } else {
5181 // Create a new callsite clone.
5182 ContextNode *NewClone = MoveEdgeToNewCalleeCloneAndSetUp(Edge);
5183 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
5184 NewClone;
5185 // Add to list of clones and process later.
5186 ClonesWorklist.push_back(NewClone);
5187 }
5188 // Moving the caller edge may have resulted in some none type
5189 // callee edges.
5190 removeNoneTypeCalleeEdges(Clone);
5191 // We will handle the newly created callsite clone in a subsequent
5192 // iteration over this Node's Clones.
5193 continue;
5194 }
5195
5196 // Otherwise, we can use the function clone already assigned to this
5197 // caller.
5198 if (!FuncCloneAssignedToCurCallsiteClone) {
5199 FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
5200 // Assign Clone to FuncCloneCalledByCaller
5201 AssignCallsiteCloneToFuncClone(
5202 FuncCloneCalledByCaller, Call, Clone,
5203 AllocationCallToContextNodeMap.count(Call));
5204 } else
5205 // Don't need to do anything - callsite is already calling this
5206 // function clone.
5207 assert(FuncCloneAssignedToCurCallsiteClone ==
5208 FuncCloneCalledByCaller);
5209
5210 } else {
5211 // We have not already assigned this caller to a version of
5212 // OrigFunc. Do the assignment now.
5213
5214 // First check if we have already assigned this callsite clone to a
5215 // clone of OrigFunc for another caller during this iteration over
5216 // its caller edges.
5217 if (!FuncCloneAssignedToCurCallsiteClone) {
5218 FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
5219 assert(FuncCloneAssignedToCurCallsiteClone);
5220 // Assign Clone to FuncCloneAssignedToCurCallsiteClone
5221 AssignCallsiteCloneToFuncClone(
5222 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
5223 AllocationCallToContextNodeMap.count(Call));
5224 } else
5225 assert(FuncCloneToCurNodeCloneMap
5226 [FuncCloneAssignedToCurCallsiteClone] == Clone);
5227 // Update callers to record function version called.
5228 RecordCalleeFuncOfCallsite(Edge->Caller,
5229 FuncCloneAssignedToCurCallsiteClone);
5230 }
5231 }
5232 // If we didn't assign a function clone to this callsite clone yet, e.g.
5233 // none of its callers has a non-null call, do the assignment here.
5234 // We want to ensure that every callsite clone is assigned to some
5235 // function clone, so that the call updates below work as expected.
5236 // In particular if this is the original callsite, we want to ensure it
5237 // is assigned to the original function, otherwise the original function
5238 // will appear available for assignment to other callsite clones,
5239 // leading to unintended effects. For one, the unknown and not updated
5240 // callers will call into cloned paths leading to the wrong hints,
5241 // because they still call the original function (clone 0). Also,
5242 // because all callsites start out as being clone 0 by default, we can't
5243 // easily distinguish between callsites explicitly assigned to clone 0
5244 // vs those never assigned, which can lead to multiple updates of the
5245 // calls when invoking updateCall below, with mismatched clone values.
5246 // TODO: Add a flag to the callsite nodes or some other mechanism to
5247 // better distinguish and identify callsite clones that are not getting
5248 // assigned to function clones as expected.
5249 if (!FuncCloneAssignedToCurCallsiteClone) {
5250 FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
5251 assert(FuncCloneAssignedToCurCallsiteClone &&
5252 "No available func clone for this callsite clone");
5253 AssignCallsiteCloneToFuncClone(
5254 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
5255 /*IsAlloc=*/AllocationCallToContextNodeMap.contains(Call));
5256 }
5257 }
5258 if (VerifyCCG) {
5259 checkNode<DerivedCCG, FuncTy, CallTy>(Node);
5260 for (const auto &PE : Node->CalleeEdges)
5261 checkEdge<DerivedCCG, FuncTy, CallTy>(PE);
5262 for (const auto &CE : Node->CallerEdges)
5263 checkEdge<DerivedCCG, FuncTy, CallTy>(CE);
5264 for (auto *Clone : Node->Clones) {
5265 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
5266 for (const auto &PE : Clone->CalleeEdges)
5267 checkEdge<DerivedCCG, FuncTy, CallTy>(PE);
5268 for (const auto &CE : Clone->CallerEdges)
5269 checkEdge<DerivedCCG, FuncTy, CallTy>(CE);
5270 }
5271 }
5272 }
5273
5274 if (FuncCloneInfos.size() < 2)
5275 continue;
5276
5277 // In this case there is more than just the original function copy.
5278 // Record call clones of any callsite nodes in the function that did not
5279 // themselves get cloned for all of the function clones.
5280 for (auto &Call : CallsWithMetadata) {
5281 ContextNode *Node = getNodeForInst(Call);
5282 if (!Node || !Node->hasCall() || Node->emptyContextIds())
5283 continue;
5284 // If Node has enough clones already to cover all function clones, we can
5285 // skip it. Need to add one for the original copy.
5286 // Use >= in case there were clones that were skipped due to having empty
5287 // context ids.
5288 if (Node->Clones.size() + 1 >= FuncCloneInfos.size())
5289 continue;
5290 // First collect all function clones we cloned this callsite node for.
5291 // They may not be sequential, e.g. if some clones were skipped for having
5292 // empty context ids.
5292 DenseSet<unsigned> NodeCallClones;
5293 for (auto *C : Node->Clones)
5294 NodeCallClones.insert(C->Call.cloneNo());
5295 unsigned I = 0;
5296 // Now check all the function clones.
5297 for (auto &FC : FuncCloneInfos) {
5298 // Function clones should be sequential.
5299 assert(FC.FuncClone.cloneNo() == I);
5300 // Skip the first clone which got the original call.
5301 // Also skip any other clones created for this Node.
5302 if (++I == 1 || NodeCallClones.contains(I)) {
5303 continue;
5304 }
5305 // Record the call clones created for this callsite in this function
5306 // clone.
5307 auto &CallVector = UnassignedCallClones[Node][I];
5308 DenseMap<CallInfo, CallInfo> &CallMap = FC.CallMap;
5309 if (auto It = CallMap.find(Call); It != CallMap.end()) {
5310 CallInfo CallClone = It->second;
5311 CallVector.push_back(CallClone);
5312 } else {
5313 // All but the original clone (skipped earlier) should have an entry
5314 // for all calls.
5315 assert(false && "Expected to find call in CallMap");
5316 }
5317 // Need to do the same for all matching calls.
5318 for (auto &MatchingCall : Node->MatchingCalls) {
5319 if (auto It = CallMap.find(MatchingCall); It != CallMap.end()) {
5320 CallInfo CallClone = It->second;
5321 CallVector.push_back(CallClone);
5322 } else {
5323 // All but the original clone (skipped earlier) should have an entry
5324 // for all calls.
5325 assert(false && "Expected to find call in CallMap");
5326 }
5327 }
5328 }
5329 }
5330 }
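// For example (function names hypothetical): if foo was cloned into
// foo.memprof.1 and foo.memprof.2 but this callsite node was only cloned for
// foo.memprof.1, the copy of the call residing in foo.memprof.2 is recorded
// above under UnassignedCallClones[Node][2], so that UpdateCalls below can
// still redirect it to the callee clone chosen for the original node.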
5331
5332 uint8_t BothTypes =
5333 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
5334
5335 auto UpdateCalls = [&](ContextNode *Node,
5336 DenseSet<const ContextNode *> &Visited,
5337 auto &&UpdateCalls) {
5338 auto Inserted = Visited.insert(Node);
5339 if (!Inserted.second)
5340 return;
5341
5342 for (auto *Clone : Node->Clones)
5343 UpdateCalls(Clone, Visited, UpdateCalls);
5344
5345 for (auto &Edge : Node->CallerEdges)
5346 UpdateCalls(Edge->Caller, Visited, UpdateCalls);
5347
5348 // Skip if either no call to update, or if we ended up with no context ids
5349 // (we moved all edges onto other clones).
5350 if (!Node->hasCall() || Node->emptyContextIds())
5351 return;
5352
5353 if (Node->IsAllocation) {
5354 auto AT = allocTypeToUse(Node->AllocTypes);
5355 // If the allocation type is ambiguous, and more aggressive hinting
5356 // has been enabled via the MinClonedColdBytePercent flag, see if this
5357 // allocation should be hinted cold anyway because the fraction of its bytes
5358 // allocated cold is at least the given threshold.
5359 if (Node->AllocTypes == BothTypes && MinClonedColdBytePercent < 100 &&
5360 !ContextIdToContextSizeInfos.empty()) {
5361 uint64_t TotalCold = 0;
5362 uint64_t Total = 0;
5363 for (auto Id : Node->getContextIds()) {
5364 auto TypeI = ContextIdToAllocationType.find(Id);
5365 assert(TypeI != ContextIdToAllocationType.end());
5366 auto CSI = ContextIdToContextSizeInfos.find(Id);
5367 if (CSI != ContextIdToContextSizeInfos.end()) {
5368 for (auto &Info : CSI->second) {
5369 Total += Info.TotalSize;
5370 if (TypeI->second == AllocationType::Cold)
5371 TotalCold += Info.TotalSize;
5372 }
5373 }
5374 }
5375 if (TotalCold * 100 >= Total * MinClonedColdBytePercent)
5376 AT = AllocationType::Cold;
5377 }
5378 updateAllocationCall(Node->Call, AT);
5379 assert(Node->MatchingCalls.empty());
5380 return;
5381 }
5382
5383 if (!CallsiteToCalleeFuncCloneMap.count(Node))
5384 return;
5385
5386 auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
5387 updateCall(Node->Call, CalleeFunc);
5388 // Update all the matching calls as well.
5389 for (auto &Call : Node->MatchingCalls)
5390 updateCall(Call, CalleeFunc);
5391
5392 // Now update all calls recorded earlier that are still in function clones
5393 // which don't have a clone of this callsite node.
5394 if (!UnassignedCallClones.contains(Node))
5395 return;
5396 DenseSet<unsigned> NodeCallClones;
5397 for (auto *C : Node->Clones)
5398 NodeCallClones.insert(C->Call.cloneNo());
5399 // Note that we already confirmed Node is in this map a few lines above.
5400 auto &ClonedCalls = UnassignedCallClones[Node];
5401 for (auto &[CloneNo, CallVector] : ClonedCalls) {
5402 // Should start at 1, as we never create an entry for the original node.
5403 assert(CloneNo > 0);
5404 // If we subsequently created a clone, skip this one.
5405 if (NodeCallClones.contains(CloneNo))
5406 continue;
5407 // Use the original Node's CalleeFunc.
5408 for (auto &Call : CallVector)
5409 updateCall(Call, CalleeFunc);
5410 }
5411 };
5412
5413 // Performs DFS traversal starting from allocation nodes to update calls to
5414 // reflect cloning decisions recorded earlier. For regular LTO this will
5415 // update the actual calls in the IR to call the appropriate function clone
5416 // (and add attributes to allocation calls), whereas for ThinLTO the decisions
5417 // are recorded in the summary entries.
5418 DenseSet<const ContextNode *> Visited;
5419 for (auto &Entry : AllocationCallToContextNodeMap)
5420 UpdateCalls(Entry.second, Visited, UpdateCalls);
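// As an illustrative sketch of the end result (callee names hypothetical): an
// allocation reached only by cold contexts gets a "memprof"="cold" attribute,
// an ambiguous allocation typically stays "notcold" unless the
// MinClonedColdBytePercent threshold above applies, and each updated callsite
// is redirected to the callee function clone (e.g. bar.memprof.1) recorded in
// CallsiteToCalleeFuncCloneMap.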
5421
5422 return Changed;
5423}
5424
5425// Compute a SHA1 hash of the callsite and alloc version information of clone I
5426// in the summary, to use in detection of duplicate clones.
5427 uint64_t ComputeHash(const FunctionSummary *FS, unsigned I) {
5428 SHA1 Hasher;
5429 // Update hash with any callsites that call non-default (non-zero) callee
5430 // versions.
5431 for (auto &SN : FS->callsites()) {
5432 // In theory all callsites and allocs in this function should have the same
5433 // number of clone entries, but handle any discrepancies gracefully below
5434 // for NDEBUG builds.
5435 assert(
5436 SN.Clones.size() > I &&
5437 "Callsite summary has fewer entries than other summaries in function");
5438 if (SN.Clones.size() <= I || !SN.Clones[I])
5439 continue;
5440 uint8_t Data[sizeof(SN.Clones[I])];
5441 support::endian::write32le(Data, SN.Clones[I]);
5442 Hasher.update(Data);
5443 }
5444 // Update hash with any allocs that have non-default (non-None) hints.
5445 for (auto &AN : FS->allocs()) {
5446 // In theory all callsites and allocs in this function should have the same
5447 // number of clone entries, but handle any discrepancies gracefully below
5448 // for NDEBUG builds.
5449 assert(AN.Versions.size() > I &&
5450 "Alloc summary has fewer entries than other summaries in function");
5451 if (AN.Versions.size() <= I ||
5452 (AllocationType)AN.Versions[I] == AllocationType::None)
5453 continue;
5454 Hasher.update(ArrayRef<uint8_t>(&AN.Versions[I], 1));
5455 }
5456 return support::endian::read64le(Hasher.result().data());
5457}
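// Illustrative example: if clone 1 and clone 3 of a function have identical
// (non-zero) Clones entries across all callsite summaries and identical
// (non-None) Versions entries across all alloc summaries, they hash to the
// same value here, and createFunctionClones below can emit clone 3 as an
// alias to clone 1 instead of instantiating a second identical body.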
5458
5459 static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
5460 Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
5461 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
5462 &FuncToAliasMap,
5463 FunctionSummary *FS) {
5464 auto TakeDeclNameAndReplace = [](GlobalValue *DeclGV, GlobalValue *NewGV) {
5465 // We might have created this when adjusting a callsite in another
5466 // function. It should be a declaration.
5467 assert(DeclGV->isDeclaration());
5468 NewGV->takeName(DeclGV);
5469 DeclGV->replaceAllUsesWith(NewGV);
5470 DeclGV->eraseFromParent();
5471 };
5472
5473 // Handle aliases to this function, and create analogous alias clones to the
5474 // provided clone of this function.
5475 auto CloneFuncAliases = [&](Function *NewF, unsigned I) {
5476 if (!FuncToAliasMap.count(&F))
5477 return;
5478 for (auto *A : FuncToAliasMap[&F]) {
5479 std::string AliasName = getMemProfFuncName(A->getName(), I);
5480 auto *PrevA = M.getNamedAlias(AliasName);
5481 auto *NewA = GlobalAlias::create(A->getValueType(),
5482 A->getType()->getPointerAddressSpace(),
5483 A->getLinkage(), AliasName, NewF);
5484 NewA->copyAttributesFrom(A);
5485 if (PrevA)
5486 TakeDeclNameAndReplace(PrevA, NewA);
5487 }
5488 };
5489
5490 // The first "clone" is the original copy, we should only call this if we
5491 // needed to create new clones.
5492 assert(NumClones > 1);
5493 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
5494 VMaps.reserve(NumClones - 1);
5495 FunctionsClonedThinBackend++;
5496
5497 // Map of hash of callsite/alloc versions to the instantiated function clone
5498 // (possibly the original) implementing those calls. Used to avoid
5499 // instantiating duplicate function clones.
5500 // FIXME: Ideally the thin link would not generate such duplicate clones to
5501 // start with, but right now it happens due to phase ordering in the function
5502 // assignment and the new clones it may produce. We simply make each
5503 // duplicate an alias to the matching instantiated clone recorded in the map
5504 // (except for available_externally which are made declarations as they would
5505 // be aliases in the prevailing module, and available_externally aliases are
5506 // not well supported right now).
5507 DenseMap<uint64_t, Function *> HashToFunc;
5508 
5509 // Save the hash of the original function version.
5510 HashToFunc[ComputeHash(FS, 0)] = &F;
5511
5512 for (unsigned I = 1; I < NumClones; I++) {
5513 VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
5514 std::string Name = getMemProfFuncName(F.getName(), I);
5515 auto Hash = ComputeHash(FS, I);
5516 // If this clone would duplicate a previously seen clone, don't generate the
5517 // duplicate clone body, just make an alias to satisfy any (potentially
5518 // cross-module) references.
5519 if (HashToFunc.contains(Hash)) {
5520 FunctionCloneDuplicatesThinBackend++;
5521 auto *Func = HashToFunc[Hash];
5522 if (Func->hasAvailableExternallyLinkage()) {
5523 // Skip these as EliminateAvailableExternallyPass does not handle
5524 // available_externally aliases correctly and we end up with an
5525 // available_externally alias to a declaration. Just create a
5526 // declaration for now as we know we will have a definition in another
5527 // module.
5528 auto Decl = M.getOrInsertFunction(Name, Func->getFunctionType());
5529 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
5530 << "created clone decl " << ore::NV("Decl", Decl.getCallee()));
5531 continue;
5532 }
5533 auto *PrevF = M.getFunction(Name);
5534 auto *Alias = GlobalAlias::create(Name, Func);
5535 if (PrevF)
5536 TakeDeclNameAndReplace(PrevF, Alias);
5537 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
5538 << "created clone alias " << ore::NV("Alias", Alias));
5539
5540 // Now handle aliases to this function, and clone those as well.
5541 CloneFuncAliases(Func, I);
5542 continue;
5543 }
5544 auto *NewF = CloneFunction(&F, *VMaps.back());
5545 HashToFunc[Hash] = NewF;
5546 FunctionClonesThinBackend++;
5547 // Strip memprof and callsite metadata from clone as they are no longer
5548 // needed.
5549 for (auto &BB : *NewF) {
5550 for (auto &Inst : BB) {
5551 Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
5552 Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
5553 }
5554 }
5555 auto *PrevF = M.getFunction(Name);
5556 if (PrevF)
5557 TakeDeclNameAndReplace(PrevF, NewF);
5558 else
5559 NewF->setName(Name);
5560 updateSubprogramLinkageName(NewF, Name);
5561 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
5562 << "created clone " << ore::NV("NewFunction", NewF));
5563
5564 // Now handle aliases to this function, and clone those as well.
5565 CloneFuncAliases(NewF, I);
5566 }
5567 return VMaps;
5568}
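// Illustrative summary (assuming the usual ".memprof" clone suffix): cloning
// foo with NumClones == 3 yields foo.memprof.1 and foo.memprof.2 alongside the
// original foo; a clone whose callsite/alloc hash matches an earlier one is
// emitted as an alias to it (or, when the matching copy is
// available_externally, just a declaration), and its VMap stays empty.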
5569
5570// Locate the summary for F. This is complicated by the fact that it might
5571// have been internalized or promoted.
5572 static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
5573 const ModuleSummaryIndex *ImportSummary,
5574 const Function *CallingFunc = nullptr) {
5575 // FIXME: Ideally we would retain the original GUID in some fashion on the
5576 // function (e.g. as metadata), but for now do our best to locate the
5577 // summary without that information.
5578 ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
5579 if (!TheFnVI)
5580 // See if theFn was internalized, by checking index directly with
5581 // original name (this avoids the name adjustment done by getGUID() for
5582 // internal symbols).
5583 TheFnVI = ImportSummary->getValueInfo(
5584 GlobalValue::getGUIDAssumingExternalLinkage(F.getName()));
5585 if (TheFnVI)
5586 return TheFnVI;
5587 // Now query with the original name before any promotion was performed.
5588 StringRef OrigName =
5589 ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
5590 // When this pass is enabled, we always add thinlto_src_file provenance
5591 // metadata to imported function definitions, which allows us to recreate the
5592 // original internal symbol's GUID.
5593 auto SrcFileMD = F.getMetadata("thinlto_src_file");
5594 // If this is a call to an imported/promoted local for which we didn't import
5595 // the definition, the metadata will not exist on the declaration. However,
5596 // since we are doing this early, before any inlining in the LTO backend, we
5597 // can simply look at the metadata on the calling function which must have
5598 // been from the same module if F was an internal symbol originally.
5599 if (!SrcFileMD && F.isDeclaration()) {
5600 // We would only call this for a declaration for a direct callsite, in which
5601 // case the caller would have provided the calling function pointer.
5602 assert(CallingFunc);
5603 SrcFileMD = CallingFunc->getMetadata("thinlto_src_file");
5604 // If this is a promoted local (OrigName != F.getName()), since this is a
5605 // declaration, it must be imported from a different module and therefore we
5606 // should always find the metadata on its calling function. Any call to a
5607 // promoted local that came from this module should still be a definition.
5608 assert(SrcFileMD || OrigName == F.getName());
5609 }
5610 StringRef SrcFile = M.getSourceFileName();
5611 if (SrcFileMD)
5612 SrcFile = dyn_cast<MDString>(SrcFileMD->getOperand(0))->getString();
5613 std::string OrigId = GlobalValue::getGlobalIdentifier(
5614 OrigName, GlobalValue::InternalLinkage, SrcFile);
5615 TheFnVI = ImportSummary->getValueInfo(
5616 GlobalValue::getGUIDAssumingExternalLinkage(OrigId));
5617 // Internal func in original module may have gotten a numbered suffix if we
5618 // imported an external function with the same name. This happens
5619 // automatically during IR linking for naming conflicts. It would have to
5620 // still be internal in that case (otherwise it would have been renamed on
5621 // promotion in which case we wouldn't have a naming conflict).
5622 if (!TheFnVI && OrigName == F.getName() && F.hasLocalLinkage() &&
5623 F.getName().contains('.')) {
5624 OrigName = F.getName().rsplit('.').first;
5625 OrigId = GlobalValue::getGlobalIdentifier(
5626 OrigName, GlobalValue::InternalLinkage, SrcFile);
5627 TheFnVI = ImportSummary->getValueInfo(
5628 GlobalValue::getGUIDAssumingExternalLinkage(OrigId));
5629 }
5630 // The only way we may not have a VI is if this is a declaration created for
5631 // an imported reference. For distributed ThinLTO we may not have a VI for
5632 // such declarations in the distributed summary.
5633 assert(TheFnVI || F.isDeclaration());
5634 return TheFnVI;
5635}
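// For example (file and symbol names hypothetical): an internal function bar
// defined in a.cpp and promoted to bar.llvm.1234 during ThinLTO is located by
// recomputing its original global identifier from the unpromoted name and the
// source file recorded in thinlto_src_file metadata, then querying the index
// with the GUID of that identifier.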
5636
5637bool MemProfContextDisambiguation::initializeIndirectCallPromotionInfo(
5638 Module &M) {
5639 ICallAnalysis = std::make_unique<ICallPromotionAnalysis>();
5640 Symtab = std::make_unique<InstrProfSymtab>();
5641 // Don't add canonical names, to avoid adding multiple functions to the symtab
5642 // when they share the same root name once "." suffixes are stripped.
5643 // If we pick the wrong one then this could lead to incorrect ICP and calling
5644 // a memprof clone that we don't actually create (resulting in linker unsats).
5645 // What this means is that the GUID of the function (or its PGOFuncName
5646 // metadata) *must* match that in the VP metadata to allow promotion.
5647 // In practice this should not be a limitation, since local functions should
5648 // have PGOFuncName metadata and global function names shouldn't need any
5649 // special handling (they should not get the ".llvm.*" suffix that the
5650 // canonicalization handling is attempting to strip).
5651 if (Error E = Symtab->create(M, /*InLTO=*/true, /*AddCanonical=*/false)) {
5652 std::string SymtabFailure = toString(std::move(E));
5653 M.getContext().emitError("Failed to create symtab: " + SymtabFailure);
5654 return false;
5655 }
5656 return true;
5657}
5658
5659#ifndef NDEBUG
5660// Sanity check that the MIB stack ids match between the summary and
5661// instruction metadata.
5662 static void checkAllocContextIds(
5663 const AllocInfo &AllocNode, const MDNode *MemProfMD,
5664 const CallStack<MDNode, MDNode::op_iterator> &CallsiteContext,
5665 const ModuleSummaryIndex *ImportSummary) {
5666 auto MIBIter = AllocNode.MIBs.begin();
5667 for (auto &MDOp : MemProfMD->operands()) {
5668 assert(MIBIter != AllocNode.MIBs.end());
5669 auto StackIdIndexIter = MIBIter->StackIdIndices.begin();
5670 auto *MIBMD = cast<const MDNode>(MDOp);
5671 MDNode *StackMDNode = getMIBStackNode(MIBMD);
5672 assert(StackMDNode);
5673 CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
5674 auto ContextIterBegin =
5675 StackContext.beginAfterSharedPrefix(CallsiteContext);
5676 // Skip the checking on the first iteration.
5677 uint64_t LastStackContextId =
5678 (ContextIterBegin != StackContext.end() && *ContextIterBegin == 0) ? 1
5679 : 0;
5680 for (auto ContextIter = ContextIterBegin; ContextIter != StackContext.end();
5681 ++ContextIter) {
5682 // If this is a direct recursion, simply skip the duplicate
5683 // entries, to be consistent with how the summary ids were
5684 // generated during ModuleSummaryAnalysis.
5685 if (LastStackContextId == *ContextIter)
5686 continue;
5687 LastStackContextId = *ContextIter;
5688 assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
5689 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
5690 *ContextIter);
5691 StackIdIndexIter++;
5692 }
5693 MIBIter++;
5694 }
5695}
5696#endif
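// For example, with direct recursion in the profiled context, a metadata call
// stack of A -> B -> B -> C is compared against summary stack ids A, B, C,
// since consecutive duplicate frames were collapsed when the summary ids were
// generated (frame names hypothetical).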
5697
5698bool MemProfContextDisambiguation::applyImport(Module &M) {
5699 assert(ImportSummary);
5700 bool Changed = false;
5701
5702 // We also need to clone any aliases that reference cloned functions, because
5703 // the modified callsites may invoke via the alias. Keep track of the aliases
5704 // for each function.
5705 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
5706 FuncToAliasMap;
5707 for (auto &A : M.aliases()) {
5708 auto *Aliasee = A.getAliaseeObject();
5709 if (auto *F = dyn_cast<Function>(Aliasee))
5710 FuncToAliasMap[F].insert(&A);
5711 }
5712
5713 if (!initializeIndirectCallPromotionInfo(M))
5714 return false;
5715
5716 for (auto &F : M) {
5717 if (F.isDeclaration() || isMemProfClone(F))
5718 continue;
5719
5720 OptimizationRemarkEmitter ORE(&F);
5721
5722 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
5723 bool ClonesCreated = false;
5724 unsigned NumClonesCreated = 0;
5725 auto CloneFuncIfNeeded = [&](unsigned NumClones, FunctionSummary *FS) {
5726 // We should at least have version 0 which is the original copy.
5727 assert(NumClones > 0);
5728 // If only one copy is needed, use the original.
5729 if (NumClones == 1)
5730 return;
5731 // If we already performed cloning of this function, confirm that the
5732 // requested number of clones matches (the thin link should ensure the
5733 // number of clones for each constituent callsite is consistent within
5734 // each function), before returning.
5735 if (ClonesCreated) {
5736 assert(NumClonesCreated == NumClones);
5737 return;
5738 }
5739 VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap, FS);
5740 // The first "clone" is the original copy, which doesn't have a VMap.
5741 assert(VMaps.size() == NumClones - 1);
5742 Changed = true;
5743 ClonesCreated = true;
5744 NumClonesCreated = NumClones;
5745 };
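// For example, if the summary records 3 versions for this function's
// callsites/allocs, the first invocation creates clones 1 and 2 and leaves
// VMaps with 2 entries (the original copy has no VMap); later invocations
// with the same NumClones are no-ops.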
5746
5747 auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
5748 Function *CalledFunction, FunctionSummary *FS) {
5749 // Perform cloning if not yet done.
5750 CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size(), FS);
5751
5752 assert(!isMemProfClone(*CalledFunction));
5753
5754 // Because we update the cloned calls by calling setCalledOperand (see
5755 // comment below), out of an abundance of caution make sure the called
5756 // function was actually the called operand (or its aliasee). We also
5757 // strip pointer casts when looking for calls (to match behavior during
5758 // summary generation), however, with opaque pointers in theory this
5759 // should not be an issue. Note we still clone the current function
5760 // (containing this call) above, as that could be needed for its callers.
5761 auto *GA = dyn_cast_or_null<GlobalAlias>(CB->getCalledOperand());
5762 if (CalledFunction != CB->getCalledOperand() &&
5763 (!GA || CalledFunction != GA->getAliaseeObject())) {
5764 SkippedCallsCloning++;
5765 return;
5766 }
5767 // Update the calls per the summary info.
5768 // Save orig name since it gets updated in the first iteration
5769 // below.
5770 auto CalleeOrigName = CalledFunction->getName();
5771 for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
5772 // If the VMap is empty, this clone was a duplicate of another and was
5773 // created as an alias or a declaration.
5774 if (J > 0 && VMaps[J - 1]->empty())
5775 continue;
5776 // Do nothing if this version calls the original version of its
5777 // callee.
5778 if (!StackNode.Clones[J])
5779 continue;
5780 auto NewF = M.getOrInsertFunction(
5781 getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
5782 CalledFunction->getFunctionType());
5783 CallBase *CBClone;
5784 // Copy 0 is the original function.
5785 if (!J)
5786 CBClone = CB;
5787 else
5788 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
5789 // Set the called operand directly instead of calling setCalledFunction,
5790 // as the latter mutates the function type on the call. In rare cases
5791 // we may have a slightly different type on a callee function
5792 // declaration due to it being imported from a different module with
5793 // incomplete types. We really just want to change the name of the
5794 // function to the clone, and not make any type changes.
5795 CBClone->setCalledOperand(NewF.getCallee());
5796 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
5797 << ore::NV("Call", CBClone) << " in clone "
5798 << ore::NV("Caller", CBClone->getFunction())
5799 << " assigned to call function clone "
5800 << ore::NV("Callee", NewF.getCallee()));
5801 }
5802 };
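// Illustrative example (callee name hypothetical): for StackNode.Clones ==
// {0, 2}, the call in the original copy of this function keeps its original
// callee, while the corresponding call in clone 1 of this function is
// retargeted to callee.memprof.2 via setCalledOperand above.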
5803
5804 // Locate the summary for F.
5805 ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
5806 // If not found, this could be an imported local (see comment in
5807 // findValueInfoForFunc). Skip for now as it will be cloned in its original
5808 // module (where it would have been promoted to global scope so should
5809 // satisfy any reference in this module).
5810 if (!TheFnVI)
5811 continue;
5812
5813 auto *GVSummary =
5814 ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
5815 if (!GVSummary) {
5816 // Must have been imported, use the summary which matches the definition
5817 // (might be multiple if this was a linkonce_odr).
5818 auto SrcModuleMD = F.getMetadata("thinlto_src_module");
5819 assert(SrcModuleMD &&
5820 "enable-import-metadata is needed to emit thinlto_src_module");
5821 StringRef SrcModule =
5822 dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString();
5823 for (auto &GVS : TheFnVI.getSummaryList()) {
5824 if (GVS->modulePath() == SrcModule) {
5825 GVSummary = GVS.get();
5826 break;
5827 }
5828 }
5829 assert(GVSummary && GVSummary->modulePath() == SrcModule);
5830 }
5831
5832 // If this was an imported alias skip it as we won't have the function
5833 // summary, and it should be cloned in the original module.
5834 if (isa<AliasSummary>(GVSummary))
5835 continue;
5836
5837 auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
5838
5839 if (FS->allocs().empty() && FS->callsites().empty())
5840 continue;
5841
5842 auto SI = FS->callsites().begin();
5843 auto AI = FS->allocs().begin();
5844
5845 // To handle callsite infos synthesized for tail calls which have missing
5846 // frames in the profiled context, map callee VI to the synthesized callsite
5847 // info.
5848 DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite;
5849 // Iterate the callsites for this function in reverse, since we place all
5850 // those synthesized for tail calls at the end.
5851 for (auto CallsiteIt = FS->callsites().rbegin();
5852 CallsiteIt != FS->callsites().rend(); CallsiteIt++) {
5853 auto &Callsite = *CallsiteIt;
5854 // Stop as soon as we see a non-synthesized callsite info (see comment
5855 // above loop). All the entries added for discovered tail calls have empty
5856 // stack ids.
5857 if (!Callsite.StackIdIndices.empty())
5858 break;
5859 MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
5860 }
5861
5862 // Keeps track of needed ICP for the function.
5863 SmallVector<ICallAnalysisData> ICallAnalysisInfo;
5864
5865 // Assume for now that the instructions are in the exact same order
5866 // as when the summary was created, but confirm this is correct by
5867 // matching the stack ids.
5868 for (auto &BB : F) {
5869 for (auto &I : BB) {
5870 auto *CB = dyn_cast<CallBase>(&I);
5871 // Same handling as when creating module summary.
5872 if (!mayHaveMemprofSummary(CB))
5873 continue;
5874
5875 auto *CalledValue = CB->getCalledOperand();
5876 auto *CalledFunction = CB->getCalledFunction();
5877 if (CalledValue && !CalledFunction) {
5878 CalledValue = CalledValue->stripPointerCasts();
5879 // Stripping pointer casts can reveal a called function.
5880 CalledFunction = dyn_cast<Function>(CalledValue);
5881 }
5882 // Check if this is an alias to a function. If so, get the
5883 // called aliasee for the checks below.
5884 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
5885 assert(!CalledFunction &&
5886 "Expected null called function in callsite for alias");
5887 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
5888 }
5889
5890 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
5891 I.getMetadata(LLVMContext::MD_callsite));
5892 auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
5893
5894 // Include allocs that were already assigned a memprof function
5895 // attribute in the statistics. Only do this for those that do not have
5896 // memprof metadata, since we add an "ambiguous" memprof attribute by
5897 // default.
5898 if (CB->getAttributes().hasFnAttr("memprof") && !MemProfMD) {
5899 CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
5900 ? AllocTypeColdThinBackend++
5901 : AllocTypeNotColdThinBackend++;
5902 OrigAllocsThinBackend++;
5903 AllocVersionsThinBackend++;
5904 if (!MaxAllocVersionsThinBackend)
5905 MaxAllocVersionsThinBackend = 1;
5906 continue;
5907 }
5908
5909 if (MemProfMD) {
5910 // Consult the next alloc node.
5911 assert(AI != FS->allocs().end());
5912 auto &AllocNode = *(AI++);
5913
5914#ifndef NDEBUG
5915 checkAllocContextIds(AllocNode, MemProfMD, CallsiteContext,
5916 ImportSummary);
5917#endif
5918
5919 // Perform cloning if not yet done.
5920 CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size(), FS);
5921
5922 OrigAllocsThinBackend++;
5923 AllocVersionsThinBackend += AllocNode.Versions.size();
5924 if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
5925 MaxAllocVersionsThinBackend = AllocNode.Versions.size();
5926
5927 // If there is only one version, that means we didn't end up
5928 // considering this function for cloning, in which case the alloc
5929 // will still have type None or should have gotten the default NotCold.
5930 // Skip it, but only after calling the clone helper above, since that does
5931 // some sanity checks confirming we haven't yet decided we need cloning.
5932 // We might have a single version that is cold due to the
5933 // MinClonedColdBytePercent heuristic, make sure we don't skip in that
5934 // case.
5935 if (AllocNode.Versions.size() == 1 &&
5936 (AllocationType)AllocNode.Versions[0] != AllocationType::Cold) {
5937 assert((AllocationType)AllocNode.Versions[0] ==
5938 AllocationType::NotCold ||
5939 (AllocationType)AllocNode.Versions[0] ==
5940 AllocationType::None);
5941 UnclonableAllocsThinBackend++;
5942 continue;
5943 }
5944
5945 // All versions should have a singular allocation type.
5946 assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
5947 return Type == ((uint8_t)AllocationType::NotCold |
5948 (uint8_t)AllocationType::Cold);
5949 }));
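// For example, Versions == {NotCold, Cold} means the original copy below gets
// a "memprof"="notcold" attribute and clone 1 gets "memprof"="cold", while
// any None entry is simply skipped.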
5950
5951 // Update the allocation types per the summary info.
5952 for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
5953 // If the VMap is empty, this clone was a duplicate of another and
5954 // was created as an alias or a declaration.
5955 if (J > 0 && VMaps[J - 1]->empty())
5956 continue;
5957 // Ignore any that didn't get an assigned allocation type.
5958 if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
5959 continue;
5960 AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
5961 AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
5962 : AllocTypeNotColdThinBackend++;
5963 std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
5964 auto A = llvm::Attribute::get(F.getContext(), "memprof",
5965 AllocTypeString);
5966 CallBase *CBClone;
5967 // Copy 0 is the original function.
5968 if (!J)
5969 CBClone = CB;
5970 else
5971 // Since VMaps are only created for new clones, we index with
5972 // clone J-1 (J==0 is the original clone and does not have a VMaps
5973 // entry).
5974 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
5976 CBClone->addFnAttr(A);
5977 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
5978 << ore::NV("AllocationCall", CBClone) << " in clone "
5979 << ore::NV("Caller", CBClone->getFunction())
5980 << " marked with memprof allocation attribute "
5981 << ore::NV("Attribute", AllocTypeString));
5982 }
5983 } else if (!CallsiteContext.empty()) {
5984 if (!CalledFunction) {
5985#ifndef NDEBUG
5986 // We should have skipped inline assembly calls.
5987 auto *CI = dyn_cast<CallInst>(CB);
5988 assert(!CI || !CI->isInlineAsm());
5989#endif
5990 // We should have skipped direct calls via a Constant.
5991 assert(CalledValue && !isa<Constant>(CalledValue));
5992
5993 // This is an indirect call, see if we have profile information and
5994 // whether any clones were recorded for the profiled targets (that
5995 // we synthesized CallsiteInfo summary records for when building the
5996 // index).
5997 auto NumClones =
5998 recordICPInfo(CB, FS->callsites(), SI, ICallAnalysisInfo);
5999
6000 // Perform cloning if not yet done. This is done here in case
6001 // we don't need to do ICP, but might need to clone this
6002 // function as it is the target of other cloned calls.
6003 if (NumClones)
6004 CloneFuncIfNeeded(NumClones, FS);
6005 }
6006
6007 else {
6008 // Consult the next callsite node.
6009 assert(SI != FS->callsites().end());
6010 auto &StackNode = *(SI++);
6011
6012#ifndef NDEBUG
6013 // Sanity check that the stack ids match between the summary and
6014 // instruction metadata.
6015 auto StackIdIndexIter = StackNode.StackIdIndices.begin();
6016 for (auto StackId : CallsiteContext) {
6017 assert(StackIdIndexIter != StackNode.StackIdIndices.end());
6018 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
6019 StackId);
6020 StackIdIndexIter++;
6021 }
6022#endif
6023
6024 CloneCallsite(StackNode, CB, CalledFunction, FS);
6025 }
6026 } else if (CB->isTailCall() && CalledFunction) {
6027 // Locate the synthesized callsite info for the callee VI, if any was
6028 // created, and use that for cloning.
6029 ValueInfo CalleeVI =
6030 findValueInfoForFunc(*CalledFunction, M, ImportSummary, &F);
6031 if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
6032 auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
6033 assert(Callsite != MapTailCallCalleeVIToCallsite.end());
6034 CloneCallsite(Callsite->second, CB, CalledFunction, FS);
6035 }
6036 }
6037 }
6038 }
6039
6040 // Now do any promotion required for cloning.
6041 performICP(M, FS->callsites(), VMaps, ICallAnalysisInfo, ORE);
6042 }
6043
6044 // We skip some of the functions and instructions above, so remove all the
6045 // metadata in a single sweep here.
6046 for (auto &F : M) {
6047 // We can skip memprof clones because createFunctionClones already strips
6048 // the metadata from the newly created clones.
6049 if (F.isDeclaration() || isMemProfClone(F))
6050 continue;
6051 for (auto &BB : F) {
6052 for (auto &I : BB) {
6053 if (!isa<CallBase>(I))
6054 continue;
6055 I.setMetadata(LLVMContext::MD_memprof, nullptr);
6056 I.setMetadata(LLVMContext::MD_callsite, nullptr);
6057 }
6058 }
6059 }
6060
6061 return Changed;
6062}
6063
6064unsigned MemProfContextDisambiguation::recordICPInfo(
6065 CallBase *CB, ArrayRef<CallsiteInfo> AllCallsites,
6066 ArrayRef<CallsiteInfo>::iterator &SI,
6067 SmallVector<ICallAnalysisData> &ICallAnalysisInfo) {
6068 // First see if we have profile information for this indirect call.
6069 uint32_t NumCandidates;
6070 uint64_t TotalCount;
6071 auto CandidateProfileData =
6072 ICallAnalysis->getPromotionCandidatesForInstruction(
6073 CB, TotalCount, NumCandidates, MaxSummaryIndirectEdges);
6074 if (CandidateProfileData.empty())
6075 return 0;
6076
6077 // Iterate through all of the candidate profiled targets along with the
6078 // CallsiteInfo summary records synthesized for them when building the index,
6079 // and see if any are cloned and/or refer to clones.
6080 bool ICPNeeded = false;
6081 unsigned NumClones = 0;
6082 size_t CallsiteInfoStartIndex = std::distance(AllCallsites.begin(), SI);
6083 for (const auto &Candidate : CandidateProfileData) {
6084#ifndef NDEBUG
6085 auto CalleeValueInfo =
6086#endif
6087 ImportSummary->getValueInfo(Candidate.Value);
6088 // We might not have a ValueInfo if this is a distributed
6089 // ThinLTO backend and decided not to import that function.
6090 assert(!CalleeValueInfo || SI->Callee == CalleeValueInfo);
6091 assert(SI != AllCallsites.end());
6092 auto &StackNode = *(SI++);
6093 // See if any of the clones of the indirect callsite for this
6094 // profiled target should call a cloned version of the profiled
6095 // target. We only need to do the ICP here if so.
6096 ICPNeeded |= llvm::any_of(StackNode.Clones,
6097 [](unsigned CloneNo) { return CloneNo != 0; });
6098 // Every callsite in the same function should have been cloned the same
6099 // number of times.
6100 assert(!NumClones || NumClones == StackNode.Clones.size());
6101 NumClones = StackNode.Clones.size();
6102 }
6103 if (!ICPNeeded)
6104 return NumClones;
6105 // Save information for ICP, which is performed later to avoid messing up the
6106 // current function traversal.
6107 ICallAnalysisInfo.push_back({CB, CandidateProfileData.vec(), NumCandidates,
6108 TotalCount, CallsiteInfoStartIndex});
6109 return NumClones;
6110}
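// For example (profile counts hypothetical): an indirect call with two
// profiled targets, one of which has summary Clones == {0, 1}, is recorded in
// ICallAnalysisInfo so that performICP below can promote it in each function
// clone and point the promoted direct call in clone 1 at that target's
// .memprof.1 version.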
6111
6112void MemProfContextDisambiguation::performICP(
6113 Module &M, ArrayRef<CallsiteInfo> AllCallsites,
6114 ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
6115 ArrayRef<ICallAnalysisData> ICallAnalysisInfo,
6116 OptimizationRemarkEmitter &ORE) {
6117 // Now do any promotion required for cloning. Specifically, for each
6118 // recorded ICP candidate (which was only recorded because one clone of that
6119 // candidate should call a cloned target), we perform ICP (speculative
6120 // devirtualization) for each clone of the callsite, and update its callee
6121 // to the appropriate clone. Note that the ICP compares against the original
6122 // version of the target, which is what is in the vtable.
6123 for (auto &Info : ICallAnalysisInfo) {
6124 auto *CB = Info.CB;
6125 auto CallsiteIndex = Info.CallsiteInfoStartIndex;
6126 auto TotalCount = Info.TotalCount;
6127 unsigned NumPromoted = 0;
6128 unsigned NumClones = 0;
6129
6130 for (auto &Candidate : Info.CandidateProfileData) {
6131 auto &StackNode = AllCallsites[CallsiteIndex++];
6132
6133 // All calls in the same function must have the same number of clones.
6134 assert(!NumClones || NumClones == StackNode.Clones.size());
6135 NumClones = StackNode.Clones.size();
6136
6137 // See if the target is in the module. If it wasn't imported, it is
6138 // possible that this profile could have been collected on a different
6139 // target (or version of the code), and we need to be conservative
6140 // (similar to what is done in the ICP pass).
6141 Function *TargetFunction = Symtab->getFunction(Candidate.Value);
6142 if (TargetFunction == nullptr ||
6143 // Any ThinLTO global dead symbol removal should have already
6144 // occurred, so it should be safe to promote when the target is a
6145 // declaration.
6146 // TODO: Remove internal option once more fully tested.
6148 TargetFunction->isDeclaration())) {
6149 ORE.emit([&]() {
6150 return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", CB)
6151 << "Memprof cannot promote indirect call: target with md5sum "
6152 << ore::NV("target md5sum", Candidate.Value) << " not found";
6153 });
6154 // FIXME: See if we can use the new declaration importing support to
6155 // at least get the declarations imported for this case. Hot indirect
6156 // targets should have been imported normally, however.
6157 continue;
6158 }
6159
6160 // Check if legal to promote
6161 const char *Reason = nullptr;
6162 if (!isLegalToPromote(*CB, TargetFunction, &Reason)) {
6163 ORE.emit([&]() {
6164 return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", CB)
6165 << "Memprof cannot promote indirect call to "
6166 << ore::NV("TargetFunction", TargetFunction)
6167 << " with count of " << ore::NV("TotalCount", TotalCount)
6168 << ": " << Reason;
6169 });
6170 continue;
6171 }
6172
6173 assert(!isMemProfClone(*TargetFunction));
6174
6175 // Handle each call clone, applying ICP so that each clone directly
6176 // calls the specified callee clone, guarded by the appropriate ICP
6177 // check.
6178 CallBase *CBClone = CB;
6179 for (unsigned J = 0; J < NumClones; J++) {
6180 // If the VMap is empty, this clone was a duplicate of another and was
6181 // created as an alias or a declaration.
6182 if (J > 0 && VMaps[J - 1]->empty())
6183 continue;
6184 // Copy 0 is the original function.
6185 if (J > 0)
6186 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
6187 // We do the promotion using the original name, so that the comparison
6188 // is against the name in the vtable. Then just below, change the new
6189 // direct call to call the cloned function.
6190 auto &DirectCall =
6191 pgo::promoteIndirectCall(*CBClone, TargetFunction, Candidate.Count,
6192 TotalCount, isSamplePGO, &ORE);
6193 auto *TargetToUse = TargetFunction;
6194 // Call original if this version calls the original version of its
6195 // callee.
6196 if (StackNode.Clones[J]) {
6197 TargetToUse =
6198 cast<Function>(M.getOrInsertFunction(
6199 getMemProfFuncName(TargetFunction->getName(),
6200 StackNode.Clones[J]),
6201 TargetFunction->getFunctionType())
6202 .getCallee());
6203 }
6204 DirectCall.setCalledFunction(TargetToUse);
6205 // During matching we generate synthetic VP metadata for indirect calls
6206 // not already having any, from the memprof profile's callee GUIDs. If
6207 // we subsequently promote and inline those callees, we currently lose
6208 // the ability to generate this synthetic VP metadata. Optionally apply
6209 // a noinline attribute to promoted direct calls, where the threshold is
6210 // set to capture synthetic VP metadata targets which get a count of 1.
6211 if (MemProfICPNoInlineThreshold &&
6212 Candidate.Count < MemProfICPNoInlineThreshold)
6213 DirectCall.setIsNoInline();
6214 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
6215 << ore::NV("Call", CBClone) << " in clone "
6216 << ore::NV("Caller", CBClone->getFunction())
6217 << " promoted and assigned to call function clone "
6218 << ore::NV("Callee", TargetToUse));
6219 }
6220
6221 // Update TotalCount (all clones should get same count above)
6222 TotalCount -= Candidate.Count;
6223 NumPromoted++;
6224 }
6225 // Adjust the MD.prof metadata for all clones, now that we have the new
6226 // TotalCount and the number promoted.
6227 CallBase *CBClone = CB;
6228 for (unsigned J = 0; J < NumClones; J++) {
6229 // If the VMap is empty, this clone was a duplicate of another and was
6230 // created as an alias or a declaration.
6231 if (J > 0 && VMaps[J - 1]->empty())
6232 continue;
6233 // Copy 0 is the original function.
6234 if (J > 0)
6235 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
6236 // First delete the old one.
6237 CBClone->setMetadata(LLVMContext::MD_prof, nullptr);
6238 // If all targets were promoted, we don't need the MD.prof metadata.
6239 // Otherwise we need to write the un-promoted records back.
6240 if (TotalCount != 0)
6241 annotateValueSite(
6242 M, *CBClone, ArrayRef(Info.CandidateProfileData).slice(NumPromoted),
6243 TotalCount, IPVK_IndirectCallTarget, Info.NumCandidates);
6244 }
6245 }
6246}
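// Sketch of the result for one promoted candidate (names hypothetical): in
// each function clone the indirect call is guarded by a comparison against
// the original target, the direct call on the matched path is retargeted to
// the chosen target.memprof.N clone, and the remaining indirect call keeps
// value profile metadata for the targets that were not promoted.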
6247
6248template <typename DerivedCCG, typename FuncTy, typename CallTy>
6249bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
6250 if (DumpCCG) {
6251 dbgs() << "CCG before cloning:\n";
6252 dbgs() << *this;
6253 }
6254 if (ExportToDot)
6255 exportToDot("postbuild");
6256
6257 if (VerifyCCG) {
6258 check();
6259 }
6260
6261 identifyClones();
6262
6263 if (VerifyCCG) {
6264 check();
6265 }
6266
6267 if (DumpCCG) {
6268 dbgs() << "CCG after cloning:\n";
6269 dbgs() << *this;
6270 }
6271 if (ExportToDot)
6272 exportToDot("cloned");
6273
6274 bool Changed = assignFunctions();
6275
6276 if (DumpCCG) {
6277 dbgs() << "CCG after assigning function clones:\n";
6278 dbgs() << *this;
6279 }
6280 if (ExportToDot)
6281 exportToDot("clonefuncassign");
6282
6284 printTotalSizes(errs());
6285
6286 return Changed;
6287}
6288
6289bool MemProfContextDisambiguation::processModule(
6290 Module &M,
6291 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
6292
6293 // If we have an import summary, then the cloning decisions were made during
6294 // the thin link on the index. Apply them and return.
6295 if (ImportSummary)
6296 return applyImport(M);
6297
6298 // TODO: If/when other types of memprof cloning are enabled beyond just for
6299 // hot and cold, we will need to change this to individually control the
6300 // AllocationType passed to addStackNodesForMIB during CCG construction.
6301 // Note that we specifically check this after applying imports above, so that
6302 // the option isn't needed to be passed to distributed ThinLTO backend
6303 // clang processes, which won't necessarily have visibility into the linker
6304 // dependences. Instead the information is communicated from the LTO link to
6305 // the backends via the combined summary index.
6306 if (!SupportsHotColdNew)
6307 return false;
6308
6309 ModuleCallsiteContextGraph CCG(M, OREGetter);
6310 return CCG.process();
6311}
6312
6313 MemProfContextDisambiguation::MemProfContextDisambiguation(
6314 const ModuleSummaryIndex *Summary, bool isSamplePGO)
6315 : ImportSummary(Summary), isSamplePGO(isSamplePGO) {
6316 // Check the dot graph printing options once here, to make sure we have valid
6317 // and expected combinations.
6318 if (DotGraphScope == DotScope::Alloc && !AllocIdForDot.getNumOccurrences())
6319 llvm::report_fatal_error(
6320 "-memprof-dot-scope=alloc requires -memprof-dot-alloc-id");
6321 if (DotGraphScope == DotScope::Context &&
6322 !ContextIdForDot.getNumOccurrences())
6323 llvm::report_fatal_error(
6324 "-memprof-dot-scope=context requires -memprof-dot-context-id");
6325 if (DotGraphScope == DotScope::All && AllocIdForDot.getNumOccurrences() &&
6326 ContextIdForDot.getNumOccurrences())
6327 llvm::report_fatal_error(
6328 "-memprof-dot-scope=all can't have both -memprof-dot-alloc-id and "
6329 "-memprof-dot-context-id");
6330 if (ImportSummary) {
6331 // The MemProfImportSummary should only be used for testing ThinLTO
6332 // distributed backend handling via opt, in which case we don't have a
6333 // summary from the pass pipeline.
6334 assert(MemProfImportSummary.empty());
6335 return;
6336 }
6337 if (MemProfImportSummary.empty())
6338 return;
6339
6340 auto ReadSummaryFile =
6341 errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
6342 if (!ReadSummaryFile) {
6343 logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
6344 "Error loading file '" + MemProfImportSummary +
6345 "': ");
6346 return;
6347 }
6348 auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
6349 if (!ImportSummaryForTestingOrErr) {
6350 logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
6351 "Error parsing file '" + MemProfImportSummary +
6352 "': ");
6353 return;
6354 }
6355 ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
6356 ImportSummary = ImportSummaryForTesting.get();
6357}
6358
6359 PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
6360 ModuleAnalysisManager &AM) {
6361 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
6362 auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
6363 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
6364 };
6365 if (!processModule(M, OREGetter))
6366 return PreservedAnalyses::all();
6367 return PreservedAnalyses::none();
6368}
6369
6370 void MemProfContextDisambiguation::run(
6371 ModuleSummaryIndex &Index,
6372 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
6373 isPrevailing) {
6374 // TODO: If/when other types of memprof cloning are enabled beyond just for
6375 // hot and cold, we will need to change this to individually control the
6376 // AllocationType passed to addStackNodesForMIB during CCG construction.
6377 // The index was set from the option, so these should be in sync.
6378 assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
6379 if (!SupportsHotColdNew)
6380 return;
6381
6382 IndexCallsiteContextGraph CCG(Index, isPrevailing);
6383 CCG.process();
6384}
6385
6386// Strips MemProf attributes and metadata. Can be invoked by the pass pipeline
6387// when we don't have an index that has recorded that we are linking with
6388// allocation libraries containing the necessary APIs for downstream
6389// transformations.
6391 // The profile matcher applies hotness attributes directly for allocations,
6392 // and those will cause us to generate calls to the hot/cold interfaces
6393 // unconditionally. If supports-hot-cold-new was not enabled in the LTO
6394 // link then assume we don't want these calls (e.g. not linking with
6395 // the appropriate library, or otherwise trying to disable this behavior).
6396 bool Changed = false;
6397 for (auto &F : M) {
6398 for (auto &BB : F) {
6399 for (auto &I : BB) {
6400 auto *CI = dyn_cast<CallBase>(&I);
6401 if (!CI)
6402 continue;
6403 if (CI->hasFnAttr("memprof")) {
6404 CI->removeFnAttr("memprof");
6405 Changed = true;
6406 }
6407 if (!CI->hasMetadata(LLVMContext::MD_callsite)) {
6408 assert(!CI->hasMetadata(LLVMContext::MD_memprof));
6409 continue;
6410 }
6411 // Strip off all memprof metadata as it is no longer needed.
6412 // Importantly, this avoids the addition of new memprof attributes
6413 // after inlining propagation.
6414 CI->setMetadata(LLVMContext::MD_memprof, nullptr);
6415 CI->setMetadata(LLVMContext::MD_callsite, nullptr);
6416 Changed = true;
6417 }
6418 }
6419 }
6420 if (!Changed)
6421 return PreservedAnalyses::all();
6422 return PreservedAnalyses::none();
6423}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
aarch64 promote const
AMDGPU Prepare AGPR Alloc
Unify divergent function exit nodes
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
Definition CSEInfo.cpp:27
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
#define DEBUG_TYPE
Module.h This file contains the declarations for the Module class.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define G(x, y, z)
Definition MD5.cpp:55
Machine Check Debug Module
This file implements a map that provides insertion order iteration.
static cl::opt< unsigned > TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5), cl::Hidden, cl::desc("Max depth to recursively search for missing " "frames through tail calls."))
uint64_t ComputeHash(const FunctionSummary *FS, unsigned I)
static cl::opt< DotScope > DotGraphScope("memprof-dot-scope", cl::desc("Scope of graph to export to dot"), cl::Hidden, cl::init(DotScope::All), cl::values(clEnumValN(DotScope::All, "all", "Export full callsite graph"), clEnumValN(DotScope::Alloc, "alloc", "Export only nodes with contexts feeding given " "-memprof-dot-alloc-id"), clEnumValN(DotScope::Context, "context", "Export only nodes with given -memprof-dot-context-id")))
static cl::opt< bool > DoMergeIteration("memprof-merge-iteration", cl::init(true), cl::Hidden, cl::desc("Iteratively apply merging on a node to catch new callers"))
static bool isMemProfClone(const Function &F)
static cl::opt< unsigned > AllocIdForDot("memprof-dot-alloc-id", cl::init(0), cl::Hidden, cl::desc("Id of alloc to export if -memprof-dot-scope=alloc " "or to highlight if -memprof-dot-scope=all"))
static cl::opt< unsigned > ContextIdForDot("memprof-dot-context-id", cl::init(0), cl::Hidden, cl::desc("Id of context to export if -memprof-dot-scope=context or to " "highlight otherwise"))
static cl::opt< bool > ExportToDot("memprof-export-to-dot", cl::init(false), cl::Hidden, cl::desc("Export graph to dot files."))
static void checkEdge(const std::shared_ptr< ContextEdge< DerivedCCG, FuncTy, CallTy > > &Edge)
static cl::opt< bool > AllowRecursiveCallsites("memprof-allow-recursive-callsites", cl::init(true), cl::Hidden, cl::desc("Allow cloning of callsites involved in recursive cycles"))
bool checkColdOrNotCold(uint8_t AllocType)
static ValueInfo findValueInfoForFunc(const Function &F, const Module &M, const ModuleSummaryIndex *ImportSummary, const Function *CallingFunc=nullptr)
static cl::opt< bool > CloneRecursiveContexts("memprof-clone-recursive-contexts", cl::init(true), cl::Hidden, cl::desc("Allow cloning of contexts through recursive cycles"))
static std::string getAllocTypeString(uint8_t AllocTypes)
static cl::opt< unsigned > MemProfICPNoInlineThreshold("memprof-icp-noinline-threshold", cl::init(2), cl::Hidden, cl::desc("Minimum absolute count for promoted target to be inlinable"))
bool DOTGraphTraits< constCallsiteContextGraph< DerivedCCG, FuncTy, CallTy > * >::DoHighlight
static unsigned getMemProfCloneNum(const Function &F)
static SmallVector< std::unique_ptr< ValueToValueMapTy >, 4 > createFunctionClones(Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE, std::map< const Function *, SmallPtrSet< const GlobalAlias *, 1 > > &FuncToAliasMap, FunctionSummary *FS)
static cl::opt< bool > VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden, cl::desc("Perform verification checks on CallingContextGraph."))
static void checkNode(const ContextNode< DerivedCCG, FuncTy, CallTy > *Node, bool CheckEdges=true)
static cl::opt< bool > MergeClones("memprof-merge-clones", cl::init(true), cl::Hidden, cl::desc("Merge clones before assigning functions"))
static std::string getMemProfFuncName(Twine Base, unsigned CloneNo)
static cl::opt< std::string > MemProfImportSummary("memprof-import-summary", cl::desc("Import summary to use for testing the ThinLTO backend via opt"), cl::Hidden)
static const std::string MemProfCloneSuffix
static void updateSubprogramLinkageName(Function *NewFunc, StringRef Name)
static cl::opt< bool > AllowRecursiveContexts("memprof-allow-recursive-contexts", cl::init(true), cl::Hidden, cl::desc("Allow cloning of contexts having recursive cycles"))
static cl::opt< std::string > DotFilePathPrefix("memprof-dot-file-path-prefix", cl::init(""), cl::Hidden, cl::value_desc("filename"), cl::desc("Specify the path prefix of the MemProf dot files."))
static cl::opt< bool > VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden, cl::desc("Perform frequent verification checks on nodes."))
static void checkAllocContextIds(const AllocInfo &AllocNode, const MDNode *MemProfMD, const CallStack< MDNode, MDNode::op_iterator > &CallsiteContext, const ModuleSummaryIndex *ImportSummary)
static cl::opt< bool > DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden, cl::desc("Dump CallingContextGraph to stdout after each stage."))
AllocType
This is the interface to build a ModuleSummaryIndex for a module.
ModuleSummaryIndex.h This file contains the declarations the classes that hold the module index and s...
#define P(N)
if(auto Err=PB.parsePassPipeline(MPM, Passes)) return wrap(std MPM run * Mod
FunctionAnalysisManager FAM
if(PassOpts->AAPipeline)
std::pair< BasicBlock *, BasicBlock * > Edge
This file defines generic set operations that may be used on set's of different types,...
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
This file contains some functions that are useful when dealing with strings.
#define LLVM_DEBUG(...)
Definition Debug.h:114
void print(OutputBuffer &OB) const
ValueInfo getAliaseeVI() const
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
iterator end() const
Definition ArrayRef.h:131
const_pointer iterator
Definition ArrayRef.h:47
iterator begin() const
Definition ArrayRef.h:130
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
static LLVM_ABI Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
void addFnAttr(Attribute::AttrKind Kind)
Adds the attribute to the function.
void setCalledOperand(Value *V)
Subprogram description. Uses SubclassData1.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
unsigned size() const
Definition DenseMap.h:110
bool empty() const
Definition DenseMap.h:109
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition DenseMap.h:174
iterator end()
Definition DenseMap.h:81
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:169
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
void reserve(size_type NumEntries)
Grow the densemap so that it can contain at least NumEntries items before resizing again.
Definition DenseMap.h:114
Implements a dense probed hash-table based set.
Definition DenseSet.h:279
Function summary information to aid decisions and implementation of importing.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
DISubprogram * getSubprogram() const
Get the attached subprogram.
const Function & getFunction() const
Definition Function.h:164
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
static LLVM_ABI GlobalAlias * create(Type *Ty, unsigned AddressSpace, LinkageTypes Linkage, const Twine &Name, Constant *Aliasee, Module *Parent)
If a parent module is specified, the alias is automatically inserted into the end of the specified mo...
Definition Globals.cpp:598
Function and variable summary information to aid decisions and implementation of importing.
static LLVM_ABI GUID getGUIDAssumingExternalLinkage(StringRef GlobalName)
Return a 64-bit global unique ID constructed from the name of a global symbol.
Definition Globals.cpp:77
static bool isLocalLinkage(LinkageTypes Linkage)
LLVM_ABI bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition Globals.cpp:328
uint64_t GUID
Declare a type to represent a global unique identifier for a global value.
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI void eraseFromParent()
This method unlinks 'this' from the containing module and deletes it.
Definition Globals.cpp:93
static LLVM_ABI std::string getGlobalIdentifier(StringRef Name, GlobalValue::LinkageTypes Linkage, StringRef FileName)
Return the modified name for a global value suitable to be used as the key for a global lookup (e....
Definition Globals.cpp:161
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Metadata node.
Definition Metadata.h:1078
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1442
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1440
unsigned getNumOperands() const
Return number of MDNode operands.
Definition Metadata.h:1448
LLVM_ABI TempMDNode clone() const
Create a (temporary) clone of this.
Definition Metadata.cpp:669
static std::enable_if_t< std::is_base_of< MDNode, T >::value, T * > replaceWithUniqued(std::unique_ptr< T, TempMDNodeDeleter > N)
Replace a temporary node with a uniqued one.
Definition Metadata.h:1317
static LLVM_ABI MDString * get(LLVMContext &Context, StringRef Str)
Definition Metadata.cpp:608
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
size_type count(const KeyT &Key) const
Definition MapVector.h:150
MemProfContextDisambiguation(const ModuleSummaryIndex *Summary=nullptr, bool isSamplePGO=false)
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
static ErrorOr< std::unique_ptr< MemoryBuffer > > getFile(const Twine &Filename, bool IsText=false, bool RequiresNullTerminator=true, bool IsVolatile=false, std::optional< Align > Alignment=std::nullopt)
Open the specified file as a MemoryBuffer, returning a new MemoryBuffer if successful,...
Class to hold module path string table and global value map, and encapsulate methods for operating on...
static StringRef getOriginalNameBeforePromote(StringRef Name)
Helper to obtain the unpromoted name for a global value (or the original name if not promoted).
ValueInfo getValueInfo(const GlobalValueSummaryMapTy::value_type &R) const
Return a ValueInfo for the index value_type (convenient when iterating index).
uint64_t getStackIdAtIndex(unsigned Index) const
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
LLVMContext & getContext() const
Get the global data context.
Definition Module.h:285
A NodeSet contains a set of SUnit DAG nodes with additional information that assigns a priority to th...
unsigned size() const
bool insert(SUnit *SU)
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
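A hedged sketch of emitting such a remark through OptimizationRemarkEmitter::emit; the pass name, remark name, and message are illustrative. NV (documented further down) is the usual shorthand for DiagnosticInfoOptimizationBase::Argument.

#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/InstrTypes.h"
using namespace llvm;
using NV = DiagnosticInfoOptimizationBase::Argument;

// Report that a call was given a cold hint (names/strings are examples).
void remarkColdHint(OptimizationRemarkEmitter &ORE, CallBase *CB) {
  ORE.emit(OptimizationRemark("memprof-context-disambiguation", "ColdHint", CB)
           << NV("Call", CB) << " annotated with cold attribute");
}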
A discriminated union of two or more pointer types, with the discriminator in the low bit of the pointer.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition Analysis.h:115
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
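The usual pattern for these two factories in a new-PM module pass, shown as a minimal hypothetical pass:

#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
using namespace llvm;

// Return all() when the IR was left untouched so cached analyses survive,
// and none() (conservatively) when anything was modified.
struct ExamplePass : PassInfoMixin<ExamplePass> {
  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM) {
    bool Changed = false;
    // ... transform M, setting Changed on any modification ...
    return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
  }
};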
A class that wrap the SHA1 algorithm.
Definition SHA1.h:27
LLVM_ABI void update(ArrayRef< uint8_t > Data)
Digest more data.
Definition SHA1.cpp:208
LLVM_ABI std::array< uint8_t, 20 > result()
Return the current raw 160-bit SHA1 for the digested data since the last call to init().
Definition SHA1.cpp:288
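A minimal sketch of the update/result pair: digest some bytes and render the 160-bit result as hex, e.g. to derive a stable suffix from a key string. The helper name is hypothetical.

#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/SHA1.h"
#include <array>
#include <string>
using namespace llvm;

// Hash Key with SHA1 and return the digest as a hex string.
std::string hashKey(StringRef Key) {
  SHA1 Hasher;
  Hasher.update(arrayRefFromStringRef(Key)); // feed the raw bytes
  std::array<uint8_t, 20> Digest = Hasher.result();
  return toHex(Digest);
}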
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition StringRef.h:55
Twine - A lightweight data structure for efficiently representing the concatenation of temporary values as strings.
Definition Twine.h:82
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
void reserve(size_t Size)
Grow the DenseSet so that it can contain at least NumEntries items before resizing again.
Definition DenseSet.h:96
void insert_range(Range &&R)
Definition DenseSet.h:228
size_type size() const
Definition DenseSet.h:87
void swap(DenseSetImpl &RHS)
Definition DenseSet.h:102
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
bool erase(const ValueT &V)
Definition DenseSet.h:100
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
An efficient, type-erasing, non-owning reference to a callable.
Helper class to iterate through stack ids in both metadata (memprof MIB and callsite) and the corresponding ThinLTO summary data structures (CallsiteInfo and MIBInfo).
CallStackIterator beginAfterSharedPrefix(const CallStack &Other)
CallStackIterator end() const
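A sketch of walking a CallStack over metadata while skipping the prefix it shares with the enclosing call's !callsite context, in the spirit of how the IR graph is built; variable names are illustrative.

#include "llvm/Analysis/MemoryProfileInfo.h"
#include "llvm/IR/Metadata.h"
using namespace llvm;
using namespace llvm::memprof;

// Visit the stack ids of an MIB stack node that are not already covered by
// the caller's !callsite metadata.
void visitNonSharedStackIds(MDNode *MIBStackNode, MDNode *CallsiteMD) {
  CallStack<MDNode, MDNode::op_iterator> StackContext(MIBStackNode);
  CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
  for (auto It = StackContext.beginAfterSharedPrefix(CallsiteContext);
       It != StackContext.end(); ++It) {
    uint64_t StackId = *It; // 64-bit stack frame id
    (void)StackId;          // ... use StackId ...
  }
}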
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
CallInst * Call
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ Entry
Definition COFF.h:862
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ CE
Windows NT (Windows on ARM)
Definition MCAsmInfo.h:48
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to the ValuesClass constructor.
initializer< Ty > init(const Ty &Val)
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > dyn_extract(Y &&MD)
Extract a Value from Metadata, if any.
Definition Metadata.h:695
LLVM_ABI AllocationType getMIBAllocType(const MDNode *MIB)
Returns the allocation type from an MIB metadata node.
LLVM_ABI bool metadataMayIncludeContextSizeInfo()
Whether the alloc memprof metadata may include context size info for some MIBs (but possibly not all)...
LLVM_ABI bool hasSingleAllocType(uint8_t AllocTypes)
True if the AllocTypes bitmask contains just a single type.
LLVM_ABI std::string getAllocTypeAttributeString(AllocationType Type)
Returns the string to use in attributes with the given type.
LLVM_ABI MDNode * getMIBStackNode(const MDNode *MIB)
Returns the stack node from an MIB metadata node.
LLVM_ABI void removeAnyExistingAmbiguousAttribute(CallBase *CB)
Removes any existing "ambiguous" memprof attribute.
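A sketch combining the MIB helpers above: OR together the allocation types of all MIB nodes on a call's !memprof metadata and, if a single type remains, print the attribute string that would be applied. It assumes the call actually carries !memprof metadata.

#include "llvm/Analysis/MemoryProfileInfo.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace llvm::memprof;

// Summarize the allocation types recorded on CB's !memprof metadata.
void summarizeAllocTypes(CallBase &CB) {
  MDNode *MemProfMD = CB.getMetadata(LLVMContext::MD_memprof);
  if (!MemProfMD)
    return;
  uint8_t AllocTypes = 0;
  for (const MDOperand &Op : MemProfMD->operands()) {
    auto *MIB = cast<MDNode>(Op);
    AllocTypes |= static_cast<uint8_t>(getMIBAllocType(MIB));
    MDNode *StackNode = getMIBStackNode(MIB); // leaf-first call stack node
    (void)StackNode;
  }
  if (hasSingleAllocType(AllocTypes))
    errs() << getAllocTypeAttributeString(
                  static_cast<AllocationType>(AllocTypes))
           << "\n";
}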
DiagnosticInfoOptimizationBase::Argument NV
LLVM_ABI CallBase & promoteIndirectCall(CallBase &CB, Function *F, uint64_t Count, uint64_t TotalCount, bool AttachProfToDirectCall, OptimizationRemarkEmitter *ORE)
uint32_t NodeId
Definition RDFGraph.h:262
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
NodeAddr< FuncNode * > Func
Definition RDFGraph.h:393
bool empty() const
Definition BasicBlock.h:101
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
uint64_t read64le(const void *P)
Definition Endian.h:435
void write32le(void *P, uint32_t V)
Definition Endian.h:475
This is an optimization pass for GlobalISel generic memory operations.
cl::opt< unsigned > MinClonedColdBytePercent("memprof-cloning-cold-threshold", cl::init(100), cl::Hidden, cl::desc("Min percent of cold bytes to hint alloc cold during cloning"))
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
LLVM_ABI void logAllUnhandledErrors(Error E, raw_ostream &OS, Twine ErrorBanner={})
Log all errors (if any) in E to OS.
Definition Error.cpp:65
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
void stable_sort(R &&Range)
Definition STLExtras.h:2079
cl::opt< bool > MemProfReportHintedSizes("memprof-report-hinted-sizes", cl::init(false), cl::Hidden, cl::desc("Report total allocation sizes of hinted allocations"))
LLVM_ABI bool isLegalToPromote(const CallBase &CB, Function *Callee, const char **FailureReason=nullptr)
Return true if the given indirect call site can be made to call Callee.
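A hedged sketch of guarding promotion with isLegalToPromote before calling pgo::promoteIndirectCall; the include for the latter and the surrounding helper are assumptions for illustration.

#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h" // pgo::promoteIndirectCall (assumed location)
#include "llvm/Transforms/Utils/CallPromotionUtils.h"           // isLegalToPromote
using namespace llvm;

// Promote the indirect call CB to call Callee directly, but only if legal.
// Count/TotalCount are the profiled target and total counts for the site.
void tryPromote(CallBase &CB, Function *Callee, uint64_t Count,
                uint64_t TotalCount, OptimizationRemarkEmitter &ORE) {
  const char *Reason = nullptr;
  if (!isLegalToPromote(CB, Callee, &Reason))
    return; // e.g. signature mismatch between the call site and Callee
  pgo::promoteIndirectCall(CB, Callee, Count, TotalCount,
                           /*AttachProfToDirectCall=*/true, &ORE);
}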
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
constexpr bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1759
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B, C, ...), such that A is the 0-based index of the item in the sequence, and B, C, ... are the values from the original input ranges.
Definition STLExtras.h:2503
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B. Identical to set_intersection, except that it works on set<>'s and is nicer to use.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI bool mayHaveMemprofSummary(const CallBase *CB)
Returns true if the instruction could have memprof metadata, used to ensure consistency between summary analysis and the ThinLTO backend.
constexpr from_range_t from_range
static cl::opt< bool > MemProfRequireDefinitionForPromotion("memprof-require-definition-for-promotion", cl::init(false), cl::Hidden, cl::desc("Require target function definition when promoting indirect calls"))
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case of optionals) value is accepted.
Definition Casting.h:732
cl::opt< unsigned > MemProfTopNImportant("memprof-top-n-important", cl::init(10), cl::Hidden, cl::desc("Number of largest cold contexts to consider important"))
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2157
void set_subtract(S1Ty &S1, const S2Ty &S2)
set_subtract(A, B) - Compute A := A - B
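A small sketch of these set helpers (together with set_union, set_is_subset, and set_difference listed further below) on DenseSet, the kind of container the pass uses for context-id sets; the values are arbitrary.

#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SetOperations.h"
using namespace llvm;

// Exercise the SetOperations.h helpers on small DenseSets.
void setOpsDemo() {
  DenseSet<uint32_t> A = {1, 2, 3, 4};
  DenseSet<uint32_t> B = {3, 4, 5};
  bool Changed = set_union(A, B);                 // A == {1,2,3,4,5}, returns true
  set_intersect(A, B);                            // A == {3,4,5}
  set_subtract(A, B);                             // A == {}
  bool Subset = set_is_subset(A, B);              // true: empty set is a subset
  DenseSet<uint32_t> Diff = set_difference(B, A); // B - A == {3,4,5}
  (void)Changed; (void)Subset; (void)Diff;
}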
InnerAnalysisManagerProxy< FunctionAnalysisManager, Module > FunctionAnalysisManagerModuleProxy
Provide the FunctionAnalysisManager to Module proxy.
raw_ostream & WriteGraph(raw_ostream &O, const GraphType &G, bool ShortNames=false, const Twine &Title="")
bool set_intersects(const S1Ty &S1, const S2Ty &S2)
set_intersects(A, B) - Return true iff A ^ B is non empty
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1150
LLVM_ABI Expected< std::unique_ptr< ModuleSummaryIndex > > getModuleSummaryIndex(MemoryBufferRef Buffer)
Parse the specified bitcode buffer, returning the module summary index.
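A sketch of reading a summary index from a bitcode file, combining MemoryBuffer::getFile, errorOrToExpected, and logAllUnhandledErrors (all listed on this page); the error banners and helper name are illustrative.

#include "llvm/ADT/Twine.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/raw_ostream.h"
#include <memory>
using namespace llvm;

// Load a ThinLTO module summary index from the bitcode file at Path,
// returning null (after logging) on any error.
std::unique_ptr<ModuleSummaryIndex> loadIndex(const Twine &Path) {
  auto BufOrErr = errorOrToExpected(MemoryBuffer::getFile(Path));
  if (!BufOrErr) {
    logAllUnhandledErrors(BufOrErr.takeError(), errs(), "Error reading file: ");
    return nullptr;
  }
  auto IndexOrErr = getModuleSummaryIndex(**BufOrErr);
  if (!IndexOrErr) {
    logAllUnhandledErrors(IndexOrErr.takeError(), errs(), "Error parsing index: ");
    return nullptr;
  }
  return std::move(*IndexOrErr);
}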
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
LLVM_ABI void annotateValueSite(Module &M, Instruction &Inst, const InstrProfRecord &InstrProfR, InstrProfValueKind ValueKind, uint32_t SiteIndx, uint32_t MaxMDCount=3)
Get the value profile data for value site SiteIdx from InstrProfR and annotate the instruction Inst with value profile metadata.
cl::opt< unsigned > MaxSummaryIndirectEdges("module-summary-max-indirect-edges", cl::init(0), cl::Hidden, cl::desc("Max number of summary edges added from " "indirect call profile metadata"))
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
bool set_union(S1Ty &S1, const S2Ty &S2)
set_union(A, B) - Compute A := A u B, return whether A changed.
cl::opt< bool > SupportsHotColdNew
Indicate we are linking with an allocator that supports hot/cold operator new interfaces.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference sizeof(SmallVector<T, 0>).
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
Definition Casting.h:547
S1Ty set_intersection(const S1Ty &S1, const S2Ty &S2)
set_intersection(A, B) - Return A ^ B
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
cl::opt< bool > EnableMemProfContextDisambiguation
Enable MemProf context disambiguation for thin link.
S1Ty set_difference(const S1Ty &S1, const S2Ty &S2)
set_difference(A, B) - Return A - B
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
Expected< T > errorOrToExpected(ErrorOr< T > &&EO)
Convert an ErrorOr<T> to an Expected<T>.
Definition Error.h:1245
ArrayRef(const T &OneElt) -> ArrayRef< T >
std::string toString(const APInt &I, unsigned Radix, bool Signed, bool formatAsCLiteral=false, bool UpperCase=true, bool InsertSeparators=false)
ValueMap< const Value *, WeakTrackingVH > ValueToValueMapTy
constexpr bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1748
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1779
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1918
LLVM_ABI Function * CloneFunction(Function *F, ValueToValueMapTy &VMap, ClonedCodeInfo *CodeInfo=nullptr)
Return a copy of the specified function and add it to that function's module.
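A minimal sketch of CloneFunction, renaming the copy with the ".memprof.<N>" suffix scheme used for memprof clones; the wrapper itself is hypothetical.

#include "llvm/ADT/Twine.h"
#include "llvm/IR/Function.h"
#include "llvm/Transforms/Utils/Cloning.h"
using namespace llvm;

// Clone F within its module and give the copy a numbered memprof suffix.
Function *cloneForMemProf(Function &F, unsigned CloneNo) {
  ValueToValueMapTy VMap; // maps original values to their counterparts in the clone
  Function *NewF = CloneFunction(&F, VMap);
  NewF->setName(F.getName() + ".memprof." + Twine(CloneNo));
  return NewF;
}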
AnalysisManager< Module > ModuleAnalysisManager
Convenience typedef for the Module analysis manager.
Definition MIRParser.h:39
cl::opt< bool > MemProfFixupImportant("memprof-fixup-important", cl::init(true), cl::Hidden, cl::desc("Enables edge fixup for important contexts"))
#define N
static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter, GraphType G)
static const ContextNode< DerivedCCG, FuncTy, CallTy > * GetCallee(const EdgePtrTy &P)
std::unique_ptr< ContextNode< DerivedCCG, FuncTy, CallTy > > NodePtrTy
mapped_iterator< typename std::vector< std::shared_ptr< ContextEdge< DerivedCCG, FuncTy, CallTy > > >::const_iterator, decltype(&GetCallee)> ChildIteratorType
mapped_iterator< typename std::vector< NodePtrTy >::const_iterator, decltype(&getNode)> nodes_iterator
std::shared_ptr< ContextEdge< DerivedCCG, FuncTy, CallTy > > EdgePtrTy
Summary of memprof metadata on allocations.
std::vector< MIBInfo > MIBs
SmallVector< unsigned > StackIdIndices
SmallVector< unsigned > Clones
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot' graphs.
DefaultDOTGraphTraits(bool simple=false)
An information struct used to provide DenseMap with the various necessary components for a given value type T.
typename GraphType::UnknownGraphTypeError NodeRef
Definition GraphTraits.h:95
Struct that holds a reference to a particular GUID in a global value summary.
ArrayRef< std::unique_ptr< GlobalValueSummary > > getSummaryList() const
GlobalValue::GUID getGUID() const
PointerUnion< CallsiteInfo *, AllocInfo * > SimpleType
static SimpleType getSimplifiedValue(IndexCall &Val)
const PointerUnion< CallsiteInfo *, AllocInfo * > SimpleType
static SimpleType getSimplifiedValue(const IndexCall &Val)
Define a template that can be specialized by smart pointers to reflect the fact that they are automatically convertible to another pointer type.
Definition Casting.h:34