//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements support for context disambiguation of allocation
// calls for profile guided heap optimization. Specifically, it uses Memprof
// profiles which indicate context specific allocation behavior (currently
// distinguishing cold vs hot memory allocations). Cloning is performed to
// expose the cold allocation call contexts, and the allocation calls are
// subsequently annotated with an attribute for later transformation.
//
// The transformations can be performed either directly on IR (regular LTO), or
// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
// Both types of LTO operate on the same base graph representation, which
// uses CRTP to support either IR or Index formats.
//
//===----------------------------------------------------------------------===//
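// For example (an illustrative sketch, not taken from an actual profile;
// function names here are hypothetical), consider one allocation reached
// through two different call contexts:
//
//   char *makeBuffer(size_t N) { return new char[N]; } // single allocation
//   void hotUser()  { consumeNow(makeBuffer(4096)); }  // memory used heavily
//   void coldUser() { archive(makeBuffer(4096)); }     // memory rarely touched
//
// The one `new` in makeBuffer is not cold when reached from hotUser but cold
// when reached from coldUser. Cloning makeBuffer gives each context its own
// copy of the allocation call, and the copy reached only from coldUser can
// then be annotated with a cold hint for the allocator to act on later.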
#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/MemoryProfileInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/SHA1.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/IPO.h"
#include <deque>
#include <sstream>
#include <unordered_map>
#include <vector>
using namespace llvm;
using namespace llvm::memprof;
56
57#define DEBUG_TYPE "memprof-context-disambiguation"
58
59STATISTIC(FunctionClonesAnalysis,
60 "Number of function clones created during whole program analysis");
61STATISTIC(FunctionClonesThinBackend,
62 "Number of function clones created during ThinLTO backend");
63STATISTIC(FunctionsClonedThinBackend,
64 "Number of functions that had clones created during ThinLTO backend");
STATISTIC(
    FunctionCloneDuplicatesThinBackend,
    "Number of function clone duplicates detected during ThinLTO backend");
68STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
69 "cloned) during whole program analysis");
70STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
71 "during whole program analysis");
72STATISTIC(AllocTypeNotColdThinBackend,
73 "Number of not cold static allocations (possibly cloned) during "
74 "ThinLTO backend");
75STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
76 "(possibly cloned) during ThinLTO backend");
77STATISTIC(OrigAllocsThinBackend,
78 "Number of original (not cloned) allocations with memprof profiles "
79 "during ThinLTO backend");
STATISTIC(
    AllocVersionsThinBackend,
    "Number of allocation versions (including clones) during ThinLTO backend");
83STATISTIC(MaxAllocVersionsThinBackend,
84 "Maximum number of allocation versions created for an original "
85 "allocation during ThinLTO backend");
STATISTIC(UnclonableAllocsThinBackend,
          "Number of unclonable ambiguous allocations during ThinLTO backend");
88STATISTIC(RemovedEdgesWithMismatchedCallees,
89 "Number of edges removed due to mismatched callees (profiled vs IR)");
90STATISTIC(FoundProfiledCalleeCount,
91 "Number of profiled callees found via tail calls");
92STATISTIC(FoundProfiledCalleeDepth,
93 "Aggregate depth of profiled callees found via tail calls");
94STATISTIC(FoundProfiledCalleeMaxDepth,
95 "Maximum depth of profiled callees found via tail calls");
96STATISTIC(FoundProfiledCalleeNonUniquelyCount,
97 "Number of profiled callees found via multiple tail call chains");
98STATISTIC(DeferredBackedges, "Number of backedges with deferred cloning");
99STATISTIC(NewMergedNodes, "Number of new nodes created during merging");
STATISTIC(NonNewMergedNodes, "Number of non-new nodes used during merging");
101STATISTIC(MissingAllocForContextId,
102 "Number of missing alloc nodes for context ids");
103STATISTIC(SkippedCallsCloning,
104 "Number of calls skipped during cloning due to unexpected operand");
105STATISTIC(MismatchedCloneAssignments,
106 "Number of callsites assigned to call multiple non-matching clones");
107STATISTIC(TotalMergeInvokes, "Number of merge invocations for nodes");
108STATISTIC(TotalMergeIters, "Number of merge iterations for nodes");
109STATISTIC(MaxMergeIters, "Max merge iterations for nodes");
110STATISTIC(NumImportantContextIds, "Number of important context ids");
111STATISTIC(NumFixupEdgeIdsInserted, "Number of fixup edge ids inserted");
112STATISTIC(NumFixupEdgesAdded, "Number of fixup edges added");
113STATISTIC(NumFixedContexts, "Number of contexts with fixed edges");
114
116 "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
117 cl::value_desc("filename"),
118 cl::desc("Specify the path prefix of the MemProf dot files."));
119
120static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
122 cl::desc("Export graph to dot files."));
123
124// TODO: Remove this option once new handling is validated more widely.
126 "memprof-merge-iteration", cl::init(true), cl::Hidden,
127 cl::desc("Iteratively apply merging on a node to catch new callers"));
128
129// How much of the graph to export to dot.
131 All, // The full CCG graph.
132 Alloc, // Only contexts for the specified allocation.
133 Context, // Only the specified context.
134};
135
137 "memprof-dot-scope", cl::desc("Scope of graph to export to dot"),
140 clEnumValN(DotScope::All, "all", "Export full callsite graph"),
142 "Export only nodes with contexts feeding given "
143 "-memprof-dot-alloc-id"),
144 clEnumValN(DotScope::Context, "context",
145 "Export only nodes with given -memprof-dot-context-id")));
146
148 AllocIdForDot("memprof-dot-alloc-id", cl::init(0), cl::Hidden,
149 cl::desc("Id of alloc to export if -memprof-dot-scope=alloc "
150 "or to highlight if -memprof-dot-scope=all"));
151
153 "memprof-dot-context-id", cl::init(0), cl::Hidden,
154 cl::desc("Id of context to export if -memprof-dot-scope=context or to "
155 "highlight otherwise"));
156
157static cl::opt<bool>
158 DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
159 cl::desc("Dump CallingContextGraph to stdout after each stage."));
160
161static cl::opt<bool>
162 VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
163 cl::desc("Perform verification checks on CallingContextGraph."));
164
165static cl::opt<bool>
166 VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
167 cl::desc("Perform frequent verification checks on nodes."));
168
170 "memprof-import-summary",
171 cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
172 cl::Hidden);
173
175 TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
177 cl::desc("Max depth to recursively search for missing "
178 "frames through tail calls."));
179
// Optionally enable cloning of callsites involved with recursive cycles.
static cl::opt<bool> AllowRecursiveCallsites(
    "memprof-allow-recursive-callsites", cl::init(true), cl::Hidden,
    cl::desc("Allow cloning of callsites involved in recursive cycles"));

static cl::opt<bool> CloneRecursiveContexts(
    "memprof-clone-recursive-contexts", cl::init(true), cl::Hidden,
    cl::desc("Allow cloning of contexts through recursive cycles"));
188
// Generally this is needed for correct assignment of allocation clones to
// function clones; however, allow it to be disabled for debugging while the
// functionality is new and being tested more widely.
192static cl::opt<bool>
193 MergeClones("memprof-merge-clones", cl::init(true), cl::Hidden,
194 cl::desc("Merge clones before assigning functions"));
195
// When disabled, try to detect and prevent cloning of recursive contexts.
// This is only necessary until we support cloning through recursive cycles.
// Leave on by default for now, as disabling requires a little bit of compile
// time overhead; it doesn't affect correctness, and will just inflate the
// cold hinted bytes reporting a bit when -memprof-report-hinted-sizes is
// enabled.
static cl::opt<bool> AllowRecursiveContexts(
    "memprof-allow-recursive-contexts", cl::init(true), cl::Hidden,
    cl::desc("Allow cloning of contexts having recursive cycles"));
204
205// Set the minimum absolute count threshold for allowing inlining of indirect
206// calls promoted during cloning.
208 "memprof-icp-noinline-threshold", cl::init(2), cl::Hidden,
209 cl::desc("Minimum absolute count for promoted target to be inlinable"));
210
namespace llvm {
cl::opt<bool> EnableMemProfContextDisambiguation(
    "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
    cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));

// Indicate we are linking with an allocator that supports hot/cold operator
// new interfaces.
cl::opt<bool> SupportsHotColdNew(
    "supports-hot-cold-new", cl::init(false), cl::Hidden,
    cl::desc("Linking with hot/cold operator new interfaces"));

static cl::opt<bool> MemProfRequireDefinitionForPromotion(
    "memprof-require-definition-for-promotion", cl::init(false), cl::Hidden,
    cl::desc(
        "Require target function definition when promoting indirect calls"));

extern cl::opt<bool> MemProfReportHintedSizes;
extern cl::opt<unsigned> MinClonedColdBytePercent;

static cl::opt<unsigned> MemProfTopNImportant(
    "memprof-top-n-important", cl::init(10), cl::Hidden,
    cl::desc("Number of largest cold contexts to consider important"));

static cl::opt<bool> MemProfFixupImportant(
    "memprof-fixup-important", cl::init(true), cl::Hidden,
    cl::desc("Enables edge fixup for important contexts"));

} // namespace llvm
239
240namespace {
241
242/// CRTP base for graphs built from either IR or ThinLTO summary index.
243///
244/// The graph represents the call contexts in all memprof metadata on allocation
245/// calls, with nodes for the allocations themselves, as well as for the calls
246/// in each context. The graph is initially built from the allocation memprof
247/// metadata (or summary) MIBs. It is then updated to match calls with callsite
248/// metadata onto the nodes, updating it to reflect any inlining performed on
249/// those calls.
250///
251/// Each MIB (representing an allocation's call context with allocation
252/// behavior) is assigned a unique context id during the graph build. The edges
253/// and nodes in the graph are decorated with the context ids they carry. This
254/// is used to correctly update the graph when cloning is performed so that we
255/// can uniquify the context for a single (possibly cloned) allocation.
256template <typename DerivedCCG, typename FuncTy, typename CallTy>
257class CallsiteContextGraph {
258public:
259 CallsiteContextGraph() = default;
260 CallsiteContextGraph(const CallsiteContextGraph &) = default;
261 CallsiteContextGraph(CallsiteContextGraph &&) = default;
262
263 /// Main entry point to perform analysis and transformations on graph.
264 bool process();
265
266 /// Perform cloning on the graph necessary to uniquely identify the allocation
267 /// behavior of an allocation based on its context.
268 void identifyClones();
269
270 /// Assign callsite clones to functions, cloning functions as needed to
271 /// accommodate the combinations of their callsite clones reached by callers.
272 /// For regular LTO this clones functions and callsites in the IR, but for
273 /// ThinLTO the cloning decisions are noted in the summaries and later applied
274 /// in applyImport.
275 bool assignFunctions();
276
277 void dump() const;
278 void print(raw_ostream &OS) const;
279 void printTotalSizes(raw_ostream &OS) const;
280
282 const CallsiteContextGraph &CCG) {
283 CCG.print(OS);
284 return OS;
285 }
286
287 friend struct GraphTraits<
288 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
289 friend struct DOTGraphTraits<
290 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
291
292 void exportToDot(std::string Label) const;
293
294 /// Represents a function clone via FuncTy pointer and clone number pair.
295 struct FuncInfo final
296 : public std::pair<FuncTy *, unsigned /*Clone number*/> {
297 using Base = std::pair<FuncTy *, unsigned>;
298 FuncInfo(const Base &B) : Base(B) {}
299 FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
300 explicit operator bool() const { return this->first != nullptr; }
301 FuncTy *func() const { return this->first; }
302 unsigned cloneNo() const { return this->second; }
303 };
304
305 /// Represents a callsite clone via CallTy and clone number pair.
306 struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
307 using Base = std::pair<CallTy, unsigned>;
308 CallInfo(const Base &B) : Base(B) {}
309 CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
310 : Base(Call, CloneNo) {}
311 explicit operator bool() const { return (bool)this->first; }
312 CallTy call() const { return this->first; }
313 unsigned cloneNo() const { return this->second; }
314 void setCloneNo(unsigned N) { this->second = N; }
315 void print(raw_ostream &OS) const {
316 if (!operator bool()) {
317 assert(!cloneNo());
318 OS << "null Call";
319 return;
320 }
321 call()->print(OS);
322 OS << "\t(clone " << cloneNo() << ")";
323 }
324 void dump() const {
325 print(dbgs());
326 dbgs() << "\n";
327 }
328 friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
329 Call.print(OS);
330 return OS;
331 }
332 };
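  // For example (illustrative only): the original function F is represented
  // as FuncInfo(F, 0); if assignFunctions later creates two clones of F they
  // get clone numbers 1 and 2. A CallInfo similarly pairs a call with the
  // clone number of the function copy that will ultimately contain it:
  //
  //   FuncInfo Orig(&F);                // (F, clone 0), the original
  //   CallInfo CI(Call, /*CloneNo=*/1); // the copy of Call in clone 1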
333
334 struct ContextEdge;
335
336 /// Node in the Callsite Context Graph
337 struct ContextNode {
338 // Assigned to nodes as they are created, useful for debugging.
339 unsigned NodeId = 0;
340
341 // Keep this for now since in the IR case where we have an Instruction* it
342 // is not as immediately discoverable. Used for printing richer information
343 // when dumping graph.
344 bool IsAllocation;
345
346 // Keeps track of when the Call was reset to null because there was
347 // recursion.
348 bool Recursive = false;
349
350 // This will be formed by ORing together the AllocationType enum values
351 // for contexts including this node.
352 uint8_t AllocTypes = 0;
353
354 // The corresponding allocation or interior call. This is the primary call
355 // for which we have created this node.
356 CallInfo Call;
357
358 // List of other calls that can be treated the same as the primary call
359 // through cloning. I.e. located in the same function and have the same
360 // (possibly pruned) stack ids. They will be updated the same way as the
361 // primary call when assigning to function clones.
362 SmallVector<CallInfo, 0> MatchingCalls;
363
364 // For alloc nodes this is a unique id assigned when constructed, and for
365 // callsite stack nodes it is the original stack id when the node is
366 // constructed from the memprof MIB metadata on the alloc nodes. Note that
367 // this is only used when matching callsite metadata onto the stack nodes
368 // created when processing the allocation memprof MIBs, and for labeling
369 // nodes in the dot graph. Therefore we don't bother to assign a value for
370 // clones.
371 uint64_t OrigStackOrAllocId = 0;
372
373 // Edges to all callees in the profiled call stacks.
374 // TODO: Should this be a map (from Callee node) for more efficient lookup?
375 std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;
376
377 // Edges to all callers in the profiled call stacks.
378 // TODO: Should this be a map (from Caller node) for more efficient lookup?
379 std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
380
    // Returns true if we need to look at the caller edges, in addition to the
    // callee edges, when determining the node context ids and allocation type.
    bool useCallerEdgesForContextInfo() const {
      // Typically if the callee edges are empty either the caller edges are
      // also empty, or this is an allocation (leaf node). However, if we are
      // allowing recursive callsites and contexts this will be violated for
      // incompletely cloned recursive cycles.
      assert(!CalleeEdges.empty() || CallerEdges.empty() || IsAllocation ||
             (AllowRecursiveCallsites && AllowRecursiveContexts));
      // When cloning for a recursive context, during cloning we might be in the
      // midst of cloning for a recurrence and have moved context ids off of a
      // caller edge onto the clone but not yet off of the incoming caller
      // (back) edge. If we don't look at those we miss the fact that this node
      // still has context ids of interest.
      return IsAllocation || CloneRecursiveContexts;
    }
397
398 // Compute the context ids for this node from the union of its edge context
399 // ids.
400 DenseSet<uint32_t> getContextIds() const {
401 unsigned Count = 0;
402 // Compute the number of ids for reserve below. In general we only need to
403 // look at one set of edges, typically the callee edges, since other than
404 // allocations and in some cases during recursion cloning, all the context
405 // ids on the callers should also flow out via callee edges.
406 for (auto &Edge : CalleeEdges.empty() ? CallerEdges : CalleeEdges)
407 Count += Edge->getContextIds().size();
408 DenseSet<uint32_t> ContextIds;
      ContextIds.reserve(Count);
      auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
          CalleeEdges, useCallerEdgesForContextInfo()
                           ? CallerEdges
                           : std::vector<std::shared_ptr<ContextEdge>>());
414 for (const auto &Edge : Edges)
415 ContextIds.insert_range(Edge->getContextIds());
416 return ContextIds;
417 }
418
419 // Compute the allocation type for this node from the OR of its edge
420 // allocation types.
421 uint8_t computeAllocType() const {
422 uint8_t BothTypes =
423 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
      uint8_t AllocType = (uint8_t)AllocationType::None;
      auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
          CalleeEdges, useCallerEdgesForContextInfo()
                           ? CallerEdges
                           : std::vector<std::shared_ptr<ContextEdge>>());
429 for (const auto &Edge : Edges) {
430 AllocType |= Edge->AllocTypes;
431 // Bail early if alloc type reached both, no further refinement.
432 if (AllocType == BothTypes)
433 return AllocType;
434 }
435 return AllocType;
436 }
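    // For example, starting from AllocationType::None, visiting a cold edge
    // and then a not-cold edge accumulates both bits, at which point the loop
    // above stops since no further edge can refine the answer:
    //
    //   uint8_t AT = (uint8_t)AllocationType::None;
    //   AT |= (uint8_t)AllocationType::Cold;    // first edge: cold contexts
    //   AT |= (uint8_t)AllocationType::NotCold; // later edge: not-cold too
    //   // AT now has both bits set, so computeAllocType() bails out early.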
437
438 // The context ids set for this node is empty if its edge context ids are
439 // also all empty.
    bool emptyContextIds() const {
      auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
          CalleeEdges, useCallerEdgesForContextInfo()
                           ? CallerEdges
                           : std::vector<std::shared_ptr<ContextEdge>>());
445 for (const auto &Edge : Edges) {
446 if (!Edge->getContextIds().empty())
447 return false;
448 }
449 return true;
450 }
451
452 // List of clones of this ContextNode, initially empty.
453 std::vector<ContextNode *> Clones;
454
455 // If a clone, points to the original uncloned node.
456 ContextNode *CloneOf = nullptr;
457
458 ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}
459
460 ContextNode(bool IsAllocation, CallInfo C)
461 : IsAllocation(IsAllocation), Call(C) {}
462
463 void addClone(ContextNode *Clone) {
464 if (CloneOf) {
465 CloneOf->Clones.push_back(Clone);
466 Clone->CloneOf = CloneOf;
467 } else {
468 Clones.push_back(Clone);
469 assert(!Clone->CloneOf);
470 Clone->CloneOf = this;
471 }
472 }
473
474 ContextNode *getOrigNode() {
475 if (!CloneOf)
476 return this;
477 return CloneOf;
478 }
479
480 void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
481 unsigned int ContextId);
482
483 ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
484 ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
485 void eraseCalleeEdge(const ContextEdge *Edge);
486 void eraseCallerEdge(const ContextEdge *Edge);
487
488 void setCall(CallInfo C) { Call = C; }
489
490 bool hasCall() const { return (bool)Call.call(); }
491
492 void printCall(raw_ostream &OS) const { Call.print(OS); }
493
494 // True if this node was effectively removed from the graph, in which case
495 // it should have an allocation type of None and empty context ids.
496 bool isRemoved() const {
497 // Typically if the callee edges are empty either the caller edges are
498 // also empty, or this is an allocation (leaf node). However, if we are
499 // allowing recursive callsites and contexts this will be violated for
500 // incompletely cloned recursive cycles.
      assert((AllocTypes == (uint8_t)AllocationType::None) ==
             emptyContextIds());
504 return AllocTypes == (uint8_t)AllocationType::None;
505 }
506
507 void dump() const;
508 void print(raw_ostream &OS) const;
509
510 friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
511 Node.print(OS);
512 return OS;
513 }
514 };
515
516 /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
517 /// callee.
518 struct ContextEdge {
519 ContextNode *Callee;
520 ContextNode *Caller;
521
522 // This will be formed by ORing together the AllocationType enum values
523 // for contexts including this edge.
524 uint8_t AllocTypes = 0;
525
526 // Set just before initiating cloning when cloning of recursive contexts is
527 // enabled. Used to defer cloning of backedges until we have done cloning of
528 // the callee node for non-backedge caller edges. This exposes cloning
529 // opportunities through the backedge of the cycle.
530 // TODO: Note that this is not updated during cloning, and it is unclear
531 // whether that would be needed.
532 bool IsBackedge = false;
533
534 // The set of IDs for contexts including this edge.
535 DenseSet<uint32_t> ContextIds;
536
537 ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
538 DenseSet<uint32_t> ContextIds)
539 : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
540 ContextIds(std::move(ContextIds)) {}
541
542 DenseSet<uint32_t> &getContextIds() { return ContextIds; }
543
544 // Helper to clear the fields of this edge when we are removing it from the
545 // graph.
546 inline void clear() {
547 ContextIds.clear();
548 AllocTypes = (uint8_t)AllocationType::None;
549 Caller = nullptr;
550 Callee = nullptr;
551 }
552
553 // Check if edge was removed from the graph. This is useful while iterating
554 // over a copy of edge lists when performing operations that mutate the
555 // graph in ways that might remove one of the edges.
556 inline bool isRemoved() const {
557 if (Callee || Caller)
558 return false;
559 // Any edges that have been removed from the graph but are still in a
560 // shared_ptr somewhere should have all fields null'ed out by clear()
561 // above.
562 assert(AllocTypes == (uint8_t)AllocationType::None);
563 assert(ContextIds.empty());
564 return true;
565 }
566
567 void dump() const;
568 void print(raw_ostream &OS) const;
569
570 friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
571 Edge.print(OS);
572 return OS;
573 }
574 };
575
576 /// Helpers to remove edges that have allocation type None (due to not
577 /// carrying any context ids) after transformations.
578 void removeNoneTypeCalleeEdges(ContextNode *Node);
579 void removeNoneTypeCallerEdges(ContextNode *Node);
580 void
581 recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
582 DenseSet<const ContextNode *> &Visited);
583
584protected:
585 /// Get a list of nodes corresponding to the stack ids in the given callsite
586 /// context.
587 template <class NodeT, class IteratorT>
588 std::vector<uint64_t>
589 getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);
590
591 /// Adds nodes for the given allocation and any stack ids on its memprof MIB
592 /// metadata (or summary).
593 ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);
594
595 /// Adds nodes for the given MIB stack ids.
596 template <class NodeT, class IteratorT>
597 void addStackNodesForMIB(
598 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
599 CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
600 ArrayRef<ContextTotalSize> ContextSizeInfo,
601 std::map<uint64_t, uint32_t> &TotalSizeToContextIdTopNCold);
602
603 /// Matches all callsite metadata (or summary) to the nodes created for
604 /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
605 /// inlining performed on those callsite instructions.
606 void updateStackNodes();
607
608 /// Optionally fixup edges for the N largest cold contexts to better enable
609 /// cloning. This is particularly helpful if the context includes recursion
610 /// as well as inlining, resulting in a single stack node for multiple stack
611 /// ids in the context. With recursion it is particularly difficult to get the
612 /// edge updates correct as in the general case we have lost the original
613 /// stack id ordering for the context. Do more expensive fixup for the largest
614 /// contexts, controlled by MemProfTopNImportant and MemProfFixupImportant.
615 void fixupImportantContexts();
616
617 /// Update graph to conservatively handle any callsite stack nodes that target
618 /// multiple different callee target functions.
619 void handleCallsitesWithMultipleTargets();
620
621 /// Mark backedges via the standard DFS based backedge algorithm.
622 void markBackedges();
623
624 /// Merge clones generated during cloning for different allocations but that
625 /// are called by the same caller node, to ensure proper function assignment.
626 void mergeClones();
627
628 // Try to partition calls on the given node (already placed into the AllCalls
629 // array) by callee function, creating new copies of Node as needed to hold
630 // calls with different callees, and moving the callee edges appropriately.
631 // Returns true if partitioning was successful.
632 bool partitionCallsByCallee(
633 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
634 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode);
635
636 /// Save lists of calls with MemProf metadata in each function, for faster
637 /// iteration.
638 MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
639
640 /// Map from callsite node to the enclosing caller function.
641 std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
642
643 // When exporting to dot, and an allocation id is specified, contains the
644 // context ids on that allocation.
645 DenseSet<uint32_t> DotAllocContextIds;
646
647private:
648 using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;
649
650 // Structure to keep track of information for each call as we are matching
651 // non-allocation callsites onto context nodes created from the allocation
652 // call metadata / summary contexts.
653 struct CallContextInfo {
654 // The callsite we're trying to match.
655 CallTy Call;
656 // The callsites stack ids that have a context node in the graph.
657 std::vector<uint64_t> StackIds;
658 // The function containing this callsite.
659 const FuncTy *Func;
660 // Initially empty, if needed this will be updated to contain the context
661 // ids for use in a new context node created for this callsite.
662 DenseSet<uint32_t> ContextIds;
663 };
664
665 /// Helper to remove edge from graph, updating edge iterator if it is provided
666 /// (in which case CalleeIter indicates which edge list is being iterated).
667 /// This will also perform the necessary clearing of the ContextEdge members
668 /// to enable later checking if the edge has been removed (since we may have
669 /// other copies of the shared_ptr in existence, and in fact rely on this to
670 /// enable removal while iterating over a copy of a node's edge list).
671 void removeEdgeFromGraph(ContextEdge *Edge, EdgeIter *EI = nullptr,
672 bool CalleeIter = true);
673
674 /// Assigns the given Node to calls at or inlined into the location with
675 /// the Node's stack id, after post order traversing and processing its
676 /// caller nodes. Uses the call information recorded in the given
677 /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
678 /// as needed. Called by updateStackNodes which sets up the given
679 /// StackIdToMatchingCalls map.
680 void assignStackNodesPostOrder(
681 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
682 DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls,
683 DenseMap<CallInfo, CallInfo> &CallToMatchingCall,
684 const DenseSet<uint32_t> &ImportantContextIds);
685
686 /// Duplicates the given set of context ids, updating the provided
687 /// map from each original id with the newly generated context ids,
688 /// and returning the new duplicated id set.
689 DenseSet<uint32_t> duplicateContextIds(
690 const DenseSet<uint32_t> &StackSequenceContextIds,
691 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
692
693 /// Propagates all duplicated context ids across the graph.
694 void propagateDuplicateContextIds(
695 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
696
697 /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
698 /// else to its callers. Also updates OrigNode's edges to remove any context
699 /// ids moved to the newly created edge.
700 void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
701 bool TowardsCallee,
702 DenseSet<uint32_t> RemainingContextIds);
703
704 /// Get the stack id corresponding to the given Id or Index (for IR this will
705 /// return itself, for a summary index this will return the id recorded in the
706 /// index for that stack id index value).
707 uint64_t getStackId(uint64_t IdOrIndex) const {
708 return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
709 }
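  // The idiom above, used throughout this class, is CRTP: the base class
  // forwards to a same-named method on the derived class via static_cast, so
  // the IR and index implementations can differ without virtual dispatch. A
  // minimal standalone sketch of the pattern (names are illustrative only):
  //
  //   #include <cstdint>
  //   #include <vector>
  //   template <typename Derived> struct GraphBase {
  //     uint64_t stackId(uint64_t Id) const {
  //       return static_cast<const Derived *>(this)->stackId(Id);
  //     }
  //   };
  //   struct IRGraph : GraphBase<IRGraph> {
  //     // IR metadata records full stack ids directly.
  //     uint64_t stackId(uint64_t Id) const { return Id; }
  //   };
  //   struct IndexGraph : GraphBase<IndexGraph> {
  //     // The summary records an index into a table of stack ids.
  //     std::vector<uint64_t> Table;
  //     uint64_t stackId(uint64_t Idx) const { return Table[Idx]; }
  //   };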
710
711 /// Returns true if the given call targets the callee of the given edge, or if
712 /// we were able to identify the call chain through intermediate tail calls.
713 /// In the latter case new context nodes are added to the graph for the
714 /// identified tail calls, and their synthesized nodes are added to
715 /// TailCallToContextNodeMap. The EdgeIter is updated in the latter case for
716 /// the updated edges and to prepare it for an increment in the caller.
717 bool
718 calleesMatch(CallTy Call, EdgeIter &EI,
719 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);
720
721 // Return the callee function of the given call, or nullptr if it can't be
722 // determined
723 const FuncTy *getCalleeFunc(CallTy Call) {
724 return static_cast<DerivedCCG *>(this)->getCalleeFunc(Call);
725 }
726
727 /// Returns true if the given call targets the given function, or if we were
728 /// able to identify the call chain through intermediate tail calls (in which
729 /// case FoundCalleeChain will be populated).
730 bool calleeMatchesFunc(
731 CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
732 std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
733 return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
734 Call, Func, CallerFunc, FoundCalleeChain);
735 }
736
737 /// Returns true if both call instructions have the same callee.
738 bool sameCallee(CallTy Call1, CallTy Call2) {
739 return static_cast<DerivedCCG *>(this)->sameCallee(Call1, Call2);
740 }
741
742 /// Get a list of nodes corresponding to the stack ids in the given
743 /// callsite's context.
744 std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
745 return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
746 Call);
747 }
748
749 /// Get the last stack id in the context for callsite.
750 uint64_t getLastStackId(CallTy Call) {
751 return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
752 }
753
754 /// Update the allocation call to record type of allocated memory.
755 void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
756 AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
757 static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
758 }
759
760 /// Get the AllocationType assigned to the given allocation instruction clone.
761 AllocationType getAllocationCallType(const CallInfo &Call) const {
762 return static_cast<const DerivedCCG *>(this)->getAllocationCallType(Call);
763 }
764
765 /// Update non-allocation call to invoke (possibly cloned) function
766 /// CalleeFunc.
767 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
768 static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
769 }
770
771 /// Clone the given function for the given callsite, recording mapping of all
772 /// of the functions tracked calls to their new versions in the CallMap.
773 /// Assigns new clones to clone number CloneNo.
774 FuncInfo cloneFunctionForCallsite(
775 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
776 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
777 return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
778 Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
779 }
780
781 /// Gets a label to use in the dot graph for the given call clone in the given
782 /// function.
783 std::string getLabel(const FuncTy *Func, const CallTy Call,
784 unsigned CloneNo) const {
785 return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
786 }
787
788 // Create and return a new ContextNode.
789 ContextNode *createNewNode(bool IsAllocation, const FuncTy *F = nullptr,
790 CallInfo C = CallInfo()) {
791 NodeOwner.push_back(std::make_unique<ContextNode>(IsAllocation, C));
792 auto *NewNode = NodeOwner.back().get();
793 if (F)
794 NodeToCallingFunc[NewNode] = F;
795 NewNode->NodeId = NodeOwner.size();
796 return NewNode;
797 }
798
799 /// Helpers to find the node corresponding to the given call or stackid.
800 ContextNode *getNodeForInst(const CallInfo &C);
801 ContextNode *getNodeForAlloc(const CallInfo &C);
802 ContextNode *getNodeForStackId(uint64_t StackId);
803
804 /// Computes the alloc type corresponding to the given context ids, by
805 /// unioning their recorded alloc types.
806 uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds) const;
807
808 /// Returns the allocation type of the intersection of the contexts of two
809 /// nodes (based on their provided context id sets), optimized for the case
810 /// when Node1Ids is smaller than Node2Ids.
811 uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
812 const DenseSet<uint32_t> &Node2Ids) const;
813
814 /// Returns the allocation type of the intersection of the contexts of two
815 /// nodes (based on their provided context id sets).
816 uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
817 const DenseSet<uint32_t> &Node2Ids) const;
818
819 /// Create a clone of Edge's callee and move Edge to that new callee node,
820 /// performing the necessary context id and allocation type updates.
821 /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
822 /// moved to an edge to the new callee.
823 ContextNode *
824 moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
825 DenseSet<uint32_t> ContextIdsToMove = {});
826
827 /// Change the callee of Edge to existing callee clone NewCallee, performing
828 /// the necessary context id and allocation type updates.
829 /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
830 /// moved to an edge to the new callee.
831 void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
832 ContextNode *NewCallee,
833 bool NewClone = false,
834 DenseSet<uint32_t> ContextIdsToMove = {});
835
836 /// Change the caller of the edge at the given callee edge iterator to be
837 /// NewCaller, performing the necessary context id and allocation type
838 /// updates. This is similar to the above moveEdgeToExistingCalleeClone, but
839 /// a simplified version of it as we always move the given edge and all of its
840 /// context ids.
841 void moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
842 ContextNode *NewCaller);
843
844 /// Recursive helper for marking backedges via DFS.
845 void markBackedges(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
846 DenseSet<const ContextNode *> &CurrentStack);
847
848 /// Recursive helper for merging clones.
849 void
850 mergeClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
851 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode);
852 /// Main worker for merging callee clones for a given node.
853 void mergeNodeCalleeClones(
854 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
855 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode);
856 /// Helper to find other callers of the given set of callee edges that can
857 /// share the same callee merge node.
858 void findOtherCallersToShareMerge(
859 ContextNode *Node, std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges,
860 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode,
861 DenseSet<ContextNode *> &OtherCallersToShareMerge);
862
863 /// Recursively perform cloning on the graph for the given Node and its
864 /// callers, in order to uniquely identify the allocation behavior of an
865 /// allocation given its context. The context ids of the allocation being
866 /// processed are given in AllocContextIds.
867 void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
868 const DenseSet<uint32_t> &AllocContextIds);
869
870 /// Map from each context ID to the AllocationType assigned to that context.
871 DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;
872
873 /// Map from each contextID to the profiled full contexts and their total
874 /// sizes (there may be more than one due to context trimming),
875 /// optionally populated when requested (via MemProfReportHintedSizes or
876 /// MinClonedColdBytePercent).
877 DenseMap<uint32_t, std::vector<ContextTotalSize>> ContextIdToContextSizeInfos;
878
879 /// Identifies the context node created for a stack id when adding the MIB
880 /// contexts to the graph. This is used to locate the context nodes when
881 /// trying to assign the corresponding callsites with those stack ids to these
882 /// nodes.
883 DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;
884
885 /// Saves information for the contexts identified as important (the largest
886 /// cold contexts up to MemProfTopNImportant).
887 struct ImportantContextInfo {
888 // The original list of leaf first stack ids corresponding to this context.
889 std::vector<uint64_t> StackIds;
890 // Max length of stack ids corresponding to a single stack ContextNode for
891 // this context (i.e. the max length of a key in StackIdsToNode below).
892 unsigned MaxLength = 0;
893 // Mapping of slices of the stack ids to the corresponding ContextNode
894 // (there can be multiple stack ids due to inlining). Populated when
895 // updating stack nodes while matching them to the IR or summary.
896 std::map<std::vector<uint64_t>, ContextNode *> StackIdsToNode;
897 };
898
899 // Map of important full context ids to information about each.
900 DenseMap<uint32_t, ImportantContextInfo> ImportantContextIdInfo;
901
902 // For each important context id found in Node (if any), records the list of
903 // stack ids that corresponded to the given callsite Node. There can be more
904 // than one in the case of inlining.
905 void recordStackNode(std::vector<uint64_t> &StackIds, ContextNode *Node,
906 // We pass in the Node's context ids to avoid the
907 // overhead of computing them as the caller already has
908 // them in some cases.
909 const DenseSet<uint32_t> &NodeContextIds,
910 const DenseSet<uint32_t> &ImportantContextIds) {
    if (!MemProfFixupImportant) {
      assert(ImportantContextIds.empty());
913 return;
914 }
915 DenseSet<uint32_t> Ids =
916 set_intersection(NodeContextIds, ImportantContextIds);
917 if (Ids.empty())
918 return;
919 auto Size = StackIds.size();
920 for (auto Id : Ids) {
921 auto &Entry = ImportantContextIdInfo[Id];
922 Entry.StackIdsToNode[StackIds] = Node;
923 // Keep track of the max to simplify later analysis.
924 if (Size > Entry.MaxLength)
925 Entry.MaxLength = Size;
926 }
927 }
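  // For example (hypothetical ids): if an important context's leaf-first
  // stack ids are {A, B, C, D} and the frames for B and C were matched to a
  // single inlined callsite node N, then StackIdsToNode[{B, C}] == N and
  // MaxLength for that context becomes 2.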
928
929 /// Maps to track the calls to their corresponding nodes in the graph.
930 MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
931 MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;
932
933 /// Owner of all ContextNode unique_ptrs.
934 std::vector<std::unique_ptr<ContextNode>> NodeOwner;
935
936 /// Perform sanity checks on graph when requested.
937 void check() const;
938
939 /// Keeps track of the last unique context id assigned.
940 unsigned int LastContextId = 0;
941};
942
943template <typename DerivedCCG, typename FuncTy, typename CallTy>
944using ContextNode =
945 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
946template <typename DerivedCCG, typename FuncTy, typename CallTy>
947using ContextEdge =
948 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
949template <typename DerivedCCG, typename FuncTy, typename CallTy>
950using FuncInfo =
951 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
952template <typename DerivedCCG, typename FuncTy, typename CallTy>
953using CallInfo =
954 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;
955
956/// CRTP derived class for graphs built from IR (regular LTO).
957class ModuleCallsiteContextGraph
958 : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
959 Instruction *> {
960public:
961 ModuleCallsiteContextGraph(
962 Module &M,
963 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
964
965private:
966 friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
967 Instruction *>;
968
969 uint64_t getStackId(uint64_t IdOrIndex) const;
970 const Function *getCalleeFunc(Instruction *Call);
971 bool calleeMatchesFunc(
972 Instruction *Call, const Function *Func, const Function *CallerFunc,
973 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
974 bool sameCallee(Instruction *Call1, Instruction *Call2);
975 bool findProfiledCalleeThroughTailCalls(
976 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
977 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
978 bool &FoundMultipleCalleeChains);
979 uint64_t getLastStackId(Instruction *Call);
980 std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
981 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
982 AllocationType getAllocationCallType(const CallInfo &Call) const;
983 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
984 CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
985 Instruction *>::FuncInfo
986 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
987 DenseMap<CallInfo, CallInfo> &CallMap,
988 std::vector<CallInfo> &CallsWithMetadataInFunc,
989 unsigned CloneNo);
990 std::string getLabel(const Function *Func, const Instruction *Call,
991 unsigned CloneNo) const;
992
993 const Module &Mod;
994 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
995};
996
997/// Represents a call in the summary index graph, which can either be an
998/// allocation or an interior callsite node in an allocation's context.
999/// Holds a pointer to the corresponding data structure in the index.
1000struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
1001 IndexCall() : PointerUnion() {}
1002 IndexCall(std::nullptr_t) : IndexCall() {}
1003 IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
1004 IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
1005 IndexCall(PointerUnion PT) : PointerUnion(PT) {}
1006
1007 IndexCall *operator->() { return this; }
1008
  void print(raw_ostream &OS) const {
    PointerUnion<CallsiteInfo *, AllocInfo *> Base = *this;
    if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(Base)) {
      OS << *AI;
    } else {
      auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(Base);
      assert(CI);
      OS << *CI;
    }
  }
1019};
1020} // namespace
1021
1022namespace llvm {
template <> struct simplify_type<IndexCall> {
  using SimpleType = PointerUnion<CallsiteInfo *, AllocInfo *>;
  static SimpleType getSimplifiedValue(IndexCall &Val) { return Val; }
};
template <> struct simplify_type<const IndexCall> {
  using SimpleType = const PointerUnion<CallsiteInfo *, AllocInfo *>;
  static SimpleType getSimplifiedValue(const IndexCall &Val) { return Val; }
};
1031} // namespace llvm
1032
1033namespace {
1034/// CRTP derived class for graphs built from summary index (ThinLTO).
1035class IndexCallsiteContextGraph
1036 : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
1037 IndexCall> {
1038public:
1039 IndexCallsiteContextGraph(
1040 ModuleSummaryIndex &Index,
1041 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
1042 isPrevailing);
1043
1044 ~IndexCallsiteContextGraph() {
1045 // Now that we are done with the graph it is safe to add the new
1046 // CallsiteInfo structs to the function summary vectors. The graph nodes
1047 // point into locations within these vectors, so we don't want to add them
1048 // any earlier.
1049 for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
1050 auto *FS = I.first;
1051 for (auto &Callsite : I.second)
1052 FS->addCallsite(*Callsite.second);
1053 }
1054 }
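  // For example, appending to a function summary's callsite vector while the
  // graph still holds CallsiteInfo pointers into it could trigger a
  // reallocation that invalidates every such pointer; deferring addCallsite()
  // until the graph is destroyed avoids that hazard.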
1055
1056private:
1057 friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
1058 IndexCall>;
1059
1060 uint64_t getStackId(uint64_t IdOrIndex) const;
1061 const FunctionSummary *getCalleeFunc(IndexCall &Call);
1062 bool calleeMatchesFunc(
1063 IndexCall &Call, const FunctionSummary *Func,
1064 const FunctionSummary *CallerFunc,
1065 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
1066 bool sameCallee(IndexCall &Call1, IndexCall &Call2);
1067 bool findProfiledCalleeThroughTailCalls(
1068 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
1069 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
1070 bool &FoundMultipleCalleeChains);
1071 uint64_t getLastStackId(IndexCall &Call);
1072 std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
1073 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
1074 AllocationType getAllocationCallType(const CallInfo &Call) const;
1075 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
1076 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
1077 IndexCall>::FuncInfo
1078 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
1079 DenseMap<CallInfo, CallInfo> &CallMap,
1080 std::vector<CallInfo> &CallsWithMetadataInFunc,
1081 unsigned CloneNo);
1082 std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
1083 unsigned CloneNo) const;
1084
  // Saves the mapping from each function summary containing memprof records
  // back to its VI, for use in checking and debugging.
1087 std::map<const FunctionSummary *, ValueInfo> FSToVIMap;
1088
1089 const ModuleSummaryIndex &Index;
1090 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
1091 isPrevailing;
1092
1093 // Saves/owns the callsite info structures synthesized for missing tail call
1094 // frames that we discover while building the graph.
1095 // It maps from the summary of the function making the tail call, to a map
1096 // of callee ValueInfo to corresponding synthesized callsite info.
1097 std::unordered_map<FunctionSummary *,
1098 std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
1099 FunctionCalleesToSynthesizedCallsiteInfos;
1100};
1101} // namespace
1102
1103template <>
1104struct llvm::DenseMapInfo<CallsiteContextGraph<
    ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
    : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
1107template <>
1108struct llvm::DenseMapInfo<CallsiteContextGraph<
1109 IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
1110 : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
1111template <>
1112struct llvm::DenseMapInfo<IndexCall>
1113 : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
1114
1115namespace {
1116
1117// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
1118// type we should actually use on the corresponding allocation.
1119// If we can't clone a node that has NotCold+Cold alloc type, we will fall
1120// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
1121// from NotCold.
AllocationType allocTypeToUse(uint8_t AllocTypes) {
  assert(AllocTypes != (uint8_t)AllocationType::None);
  if (AllocTypes ==
      ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
    return AllocationType::NotCold;
  else
    return (AllocationType)AllocTypes;
}
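// For example, a purely cold context set maps to a Cold hint, while a mixed
// NotCold|Cold set degrades to NotCold, since such an allocation was not
// successfully disambiguated:
//
//   assert(allocTypeToUse((uint8_t)AllocationType::Cold) ==
//          AllocationType::Cold);
//   assert(allocTypeToUse((uint8_t)AllocationType::Cold |
//                         (uint8_t)AllocationType::NotCold) ==
//          AllocationType::NotCold);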
1130
1131// Helper to check if the alloc types for all edges recorded in the
1132// InAllocTypes vector match the alloc types for all edges in the Edges
1133// vector.
1134template <typename DerivedCCG, typename FuncTy, typename CallTy>
1135bool allocTypesMatch(
1136 const std::vector<uint8_t> &InAllocTypes,
1137 const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
1138 &Edges) {
1139 // This should be called only when the InAllocTypes vector was computed for
1140 // this set of Edges. Make sure the sizes are the same.
1141 assert(InAllocTypes.size() == Edges.size());
1142 return std::equal(
1143 InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(), Edges.end(),
1144 [](const uint8_t &l,
1145 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
1146 // Can share if one of the edges is None type - don't
1147 // care about the type along that edge as it doesn't
1148 // exist for those context ids.
1149 if (l == (uint8_t)AllocationType::None ||
1150 r->AllocTypes == (uint8_t)AllocationType::None)
1151 return true;
1152 return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
1153 });
1154}
1155
1156// Helper to check if the alloc types for all edges recorded in the
1157// InAllocTypes vector match the alloc types for callee edges in the given
1158// clone. Because the InAllocTypes were computed from the original node's callee
1159// edges, and other cloning could have happened after this clone was created, we
1160// need to find the matching clone callee edge, which may or may not exist.
1161template <typename DerivedCCG, typename FuncTy, typename CallTy>
1162bool allocTypesMatchClone(
1163 const std::vector<uint8_t> &InAllocTypes,
1164 const ContextNode<DerivedCCG, FuncTy, CallTy> *Clone) {
1165 const ContextNode<DerivedCCG, FuncTy, CallTy> *Node = Clone->CloneOf;
1166 assert(Node);
1167 // InAllocTypes should have been computed for the original node's callee
1168 // edges.
1169 assert(InAllocTypes.size() == Node->CalleeEdges.size());
  // First create a map of the clone callee edge callees to the edge alloc type.
  DenseMap<const ContextNode<DerivedCCG, FuncTy, CallTy> *, uint8_t>
      EdgeCalleeMap;
1173 for (const auto &E : Clone->CalleeEdges) {
1174 assert(!EdgeCalleeMap.contains(E->Callee));
1175 EdgeCalleeMap[E->Callee] = E->AllocTypes;
1176 }
1177 // Next, walk the original node's callees, and look for the corresponding
1178 // clone edge to that callee.
1179 for (unsigned I = 0; I < Node->CalleeEdges.size(); I++) {
1180 auto Iter = EdgeCalleeMap.find(Node->CalleeEdges[I]->Callee);
1181 // Not found is ok, we will simply add an edge if we use this clone.
1182 if (Iter == EdgeCalleeMap.end())
1183 continue;
1184 // Can share if one of the edges is None type - don't
1185 // care about the type along that edge as it doesn't
1186 // exist for those context ids.
1187 if (InAllocTypes[I] == (uint8_t)AllocationType::None ||
1188 Iter->second == (uint8_t)AllocationType::None)
1189 continue;
1190 if (allocTypeToUse(Iter->second) != allocTypeToUse(InAllocTypes[I]))
1191 return false;
1192 }
1193 return true;
1194}
1195
1196} // end anonymous namespace
1197
1198template <typename DerivedCCG, typename FuncTy, typename CallTy>
1199typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1200CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
1201 const CallInfo &C) {
1202 ContextNode *Node = getNodeForAlloc(C);
1203 if (Node)
1204 return Node;
1205
1206 return NonAllocationCallToContextNodeMap.lookup(C);
1207}
1208
1209template <typename DerivedCCG, typename FuncTy, typename CallTy>
1210typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1211CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
1212 const CallInfo &C) {
1213 return AllocationCallToContextNodeMap.lookup(C);
1214}
1215
1216template <typename DerivedCCG, typename FuncTy, typename CallTy>
1217typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1218CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
1219 uint64_t StackId) {
1220 auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
1221 if (StackEntryNode != StackEntryIdToContextNodeMap.end())
1222 return StackEntryNode->second;
1223 return nullptr;
1224}
1225
1226template <typename DerivedCCG, typename FuncTy, typename CallTy>
1227void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1228 addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
1229 unsigned int ContextId) {
1230 for (auto &Edge : CallerEdges) {
1231 if (Edge->Caller == Caller) {
1232 Edge->AllocTypes |= (uint8_t)AllocType;
1233 Edge->getContextIds().insert(ContextId);
1234 return;
1235 }
1236 }
1237 std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
1238 this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
1239 CallerEdges.push_back(Edge);
1240 Caller->CalleeEdges.push_back(Edge);
1241}
1242
1243template <typename DerivedCCG, typename FuncTy, typename CallTy>
1244void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::removeEdgeFromGraph(
1245 ContextEdge *Edge, EdgeIter *EI, bool CalleeIter) {
1246 assert(!EI || (*EI)->get() == Edge);
1247 assert(!Edge->isRemoved());
1248 // Save the Caller and Callee pointers so we can erase Edge from their edge
1249 // lists after clearing Edge below. We do the clearing first in case it is
1250 // destructed after removing from the edge lists (if those were the last
1251 // shared_ptr references to Edge).
1252 auto *Callee = Edge->Callee;
1253 auto *Caller = Edge->Caller;
1254
1255 // Make sure the edge fields are cleared out so we can properly detect
1256 // removed edges if Edge is not destructed because there is still a shared_ptr
1257 // reference.
1258 Edge->clear();
1259
1260#ifndef NDEBUG
1261 auto CalleeCallerCount = Callee->CallerEdges.size();
1262 auto CallerCalleeCount = Caller->CalleeEdges.size();
1263#endif
1264 if (!EI) {
1265 Callee->eraseCallerEdge(Edge);
1266 Caller->eraseCalleeEdge(Edge);
1267 } else if (CalleeIter) {
1268 Callee->eraseCallerEdge(Edge);
1269 *EI = Caller->CalleeEdges.erase(*EI);
1270 } else {
1271 Caller->eraseCalleeEdge(Edge);
1272 *EI = Callee->CallerEdges.erase(*EI);
1273 }
1274 assert(Callee->CallerEdges.size() < CalleeCallerCount);
1275 assert(Caller->CalleeEdges.size() < CallerCalleeCount);
1276}
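// A sketch of the usage pattern this enables (variable names illustrative):
// callers that mutate the graph while walking a node's edges iterate over a
// copy of the shared_ptr list and skip entries already removed:
//
//   auto CalleeEdgesCopy = Node->CalleeEdges; // shared_ptrs keep edges alive
//   for (auto &Edge : CalleeEdgesCopy) {
//     if (Edge->isRemoved())
//       continue; // cleared by an earlier removal during this walk
//     // ... examine or transform Edge ...
//   }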
1277
1278template <typename DerivedCCG, typename FuncTy, typename CallTy>
1279void CallsiteContextGraph<
1280 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
1281 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
1282 auto Edge = *EI;
1283 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1284 assert(Edge->ContextIds.empty());
1285 removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
1286 } else
1287 ++EI;
1288 }
1289}
1290
1291template <typename DerivedCCG, typename FuncTy, typename CallTy>
1292void CallsiteContextGraph<
1293 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCallerEdges(ContextNode *Node) {
1294 for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
1295 auto Edge = *EI;
1296 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1297 assert(Edge->ContextIds.empty());
1298 Edge->Caller->eraseCalleeEdge(Edge.get());
1299 EI = Node->CallerEdges.erase(EI);
1300 } else
1301 ++EI;
1302 }
1303}
1304
1305template <typename DerivedCCG, typename FuncTy, typename CallTy>
1306typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1307CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1308 findEdgeFromCallee(const ContextNode *Callee) {
1309 for (const auto &Edge : CalleeEdges)
1310 if (Edge->Callee == Callee)
1311 return Edge.get();
1312 return nullptr;
1313}
1314
1315template <typename DerivedCCG, typename FuncTy, typename CallTy>
1316typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1317CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1318 findEdgeFromCaller(const ContextNode *Caller) {
1319 for (const auto &Edge : CallerEdges)
1320 if (Edge->Caller == Caller)
1321 return Edge.get();
1322 return nullptr;
1323}
1324
1325template <typename DerivedCCG, typename FuncTy, typename CallTy>
1326void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1327 eraseCalleeEdge(const ContextEdge *Edge) {
1328 auto EI = llvm::find_if(
1329 CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
1330 return CalleeEdge.get() == Edge;
1331 });
1332 assert(EI != CalleeEdges.end());
1333 CalleeEdges.erase(EI);
1334}
1335
1336template <typename DerivedCCG, typename FuncTy, typename CallTy>
1337void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1338 eraseCallerEdge(const ContextEdge *Edge) {
1339 auto EI = llvm::find_if(
1340 CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
1341 return CallerEdge.get() == Edge;
1342 });
1343 assert(EI != CallerEdges.end());
1344 CallerEdges.erase(EI);
1345}
1346
1347template <typename DerivedCCG, typename FuncTy, typename CallTy>
1348uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
1349 DenseSet<uint32_t> &ContextIds) const {
1350 uint8_t BothTypes =
1351 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1352 uint8_t AllocType = (uint8_t)AllocationType::None;
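  // OR together the alloc type recorded for each context id; the result may
  // end up with both the Cold and NotCold bits set.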
1353 for (auto Id : ContextIds) {
1354 AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
1355    // Bail out early if the alloc type has reached both; no further refinement is possible.
1356 if (AllocType == BothTypes)
1357 return AllocType;
1358 }
1359 return AllocType;
1360}
1361
1362template <typename DerivedCCG, typename FuncTy, typename CallTy>
1363uint8_t
1364CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
1365 const DenseSet<uint32_t> &Node1Ids,
1366 const DenseSet<uint32_t> &Node2Ids) const {
1367 uint8_t BothTypes =
1368 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1369 uint8_t AllocType = (uint8_t)AllocationType::None;
1370 for (auto Id : Node1Ids) {
1371 if (!Node2Ids.count(Id))
1372 continue;
1373 AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
1374    // Bail out early if the alloc type has reached both; no further refinement is possible.
1375 if (AllocType == BothTypes)
1376 return AllocType;
1377 }
1378 return AllocType;
1379}
1380
1381template <typename DerivedCCG, typename FuncTy, typename CallTy>
1382uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
1383 const DenseSet<uint32_t> &Node1Ids,
1384 const DenseSet<uint32_t> &Node2Ids) const {
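  // Pass the smaller set first, since intersectAllocTypesImpl iterates over
  // its first argument and probes the second.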
1385 if (Node1Ids.size() < Node2Ids.size())
1386 return intersectAllocTypesImpl(Node1Ids, Node2Ids);
1387 else
1388 return intersectAllocTypesImpl(Node2Ids, Node1Ids);
1389}
1390
1391template <typename DerivedCCG, typename FuncTy, typename CallTy>
1392typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1393CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
1394 CallInfo Call, const FuncTy *F) {
1395 assert(!getNodeForAlloc(Call));
1396 ContextNode *AllocNode = createNewNode(/*IsAllocation=*/true, F, Call);
1397 AllocationCallToContextNodeMap[Call] = AllocNode;
1398  // Use LastContextId as a unique id for MIB allocation nodes.
1399 AllocNode->OrigStackOrAllocId = LastContextId;
1400 // Alloc type should be updated as we add in the MIBs. We should assert
1401 // afterwards that it is not still None.
1402 AllocNode->AllocTypes = (uint8_t)AllocationType::None;
1403
1404 return AllocNode;
1405}
1406
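// Returns a human readable string for the given alloc type bit mask: "None",
// "NotCold", "Cold", or "NotColdCold" when both bits are set.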
1407static std::string getAllocTypeString(uint8_t AllocTypes) {
1408 if (!AllocTypes)
1409 return "None";
1410 std::string Str;
1411 if (AllocTypes & (uint8_t)AllocationType::NotCold)
1412 Str += "NotCold";
1413 if (AllocTypes & (uint8_t)AllocationType::Cold)
1414 Str += "Cold";
1415 return Str;
1416}
1417
1418template <typename DerivedCCG, typename FuncTy, typename CallTy>
1419template <class NodeT, class IteratorT>
1420void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
1421 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
1422 CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
1423 ArrayRef<ContextTotalSize> ContextSizeInfo,
1424 std::map<uint64_t, uint32_t> &TotalSizeToContextIdTopNCold) {
1425  // Treat the hot alloc type as NotCold until the disambiguation for "hot"
1426  // is done.
1427 if (AllocType == AllocationType::Hot)
1428 AllocType = AllocationType::NotCold;
1429
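  // Each MIB (profiled allocation context) gets the next sequential context id.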
1430 ContextIdToAllocationType[++LastContextId] = AllocType;
1431
1432 bool IsImportant = false;
1433 if (!ContextSizeInfo.empty()) {
1434 auto &Entry = ContextIdToContextSizeInfos[LastContextId];
1435    // If this is a cold allocation, and we are collecting the top-N largest
1436    // cold contexts (MemProfTopNImportant > 0), see if this context is a candidate.
1437 if (AllocType == AllocationType::Cold && MemProfTopNImportant > 0) {
1438 uint64_t TotalCold = 0;
1439 for (auto &CSI : ContextSizeInfo)
1440 TotalCold += CSI.TotalSize;
1441      // Record this context if we haven't yet recorded N contexts, or if it
1442      // is larger than the smallest already recorded.
1443 if (TotalSizeToContextIdTopNCold.size() < MemProfTopNImportant ||
1444          // Since TotalSizeToContextIdTopNCold is a std::map, it is implicitly
1445          // sorted in ascending order of its key, which is the total size.
1446 TotalCold > TotalSizeToContextIdTopNCold.begin()->first) {
1447 if (TotalSizeToContextIdTopNCold.size() == MemProfTopNImportant) {
1448 // Remove old one and its associated entries.
1449 auto IdToRemove = TotalSizeToContextIdTopNCold.begin()->second;
1450 TotalSizeToContextIdTopNCold.erase(
1451 TotalSizeToContextIdTopNCold.begin());
1452 assert(ImportantContextIdInfo.count(IdToRemove));
1453 ImportantContextIdInfo.erase(IdToRemove);
1454 }
1455 TotalSizeToContextIdTopNCold[TotalCold] = LastContextId;
1456 IsImportant = true;
1457 }
1458 }
1459 Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end());
1460 }
1461
1462 // Update alloc type and context ids for this MIB.
1463 AllocNode->AllocTypes |= (uint8_t)AllocType;
1464
1465 // Now add or update nodes for each stack id in alloc's context.
1466 // Later when processing the stack ids on non-alloc callsites we will adjust
1467 // for any inlining in the context.
1468 ContextNode *PrevNode = AllocNode;
1469  // Look for recursion (direct recursion should have been collapsed by
1470  // module summary analysis; here we should just be detecting mutual
1471  // recursion). Mark these nodes so we don't try to clone them.
1472 SmallSet<uint64_t, 8> StackIdSet;
1473  // Skip any stack ids already on the allocation call itself (due to inlining).
1474 for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
1475 ContextIter != StackContext.end(); ++ContextIter) {
1476 auto StackId = getStackId(*ContextIter);
1477 if (IsImportant)
1478 ImportantContextIdInfo[LastContextId].StackIds.push_back(StackId);
1479 ContextNode *StackNode = getNodeForStackId(StackId);
1480 if (!StackNode) {
1481 StackNode = createNewNode(/*IsAllocation=*/false);
1482 StackEntryIdToContextNodeMap[StackId] = StackNode;
1483 StackNode->OrigStackOrAllocId = StackId;
1484 }
1485 // Marking a node recursive will prevent its cloning completely, even for
1486 // non-recursive contexts flowing through it.
1487    if (!AllowRecursiveCallsites) {
1488      auto Ins = StackIdSet.insert(StackId);
1489 if (!Ins.second)
1490 StackNode->Recursive = true;
1491 }
1492 StackNode->AllocTypes |= (uint8_t)AllocType;
1493 PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
1494 PrevNode = StackNode;
1495 }
1496}
1497
1498template <typename DerivedCCG, typename FuncTy, typename CallTy>
1499DenseSet<uint32_t>
1500CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
1501 const DenseSet<uint32_t> &StackSequenceContextIds,
1502 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1503 DenseSet<uint32_t> NewContextIds;
1504 for (auto OldId : StackSequenceContextIds) {
1505 NewContextIds.insert(++LastContextId);
1506 OldToNewContextIds[OldId].insert(LastContextId);
1507 assert(ContextIdToAllocationType.count(OldId));
1508 // The new context has the same allocation type as original.
1509 ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
1510 if (DotAllocContextIds.contains(OldId))
1511 DotAllocContextIds.insert(LastContextId);
1512 }
1513 return NewContextIds;
1514}
1515
1516template <typename DerivedCCG, typename FuncTy, typename CallTy>
1517void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1518 propagateDuplicateContextIds(
1519 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1520 // Build a set of duplicated context ids corresponding to the input id set.
1521 auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
1522 DenseSet<uint32_t> NewIds;
1523 for (auto Id : ContextIds)
1524 if (auto NewId = OldToNewContextIds.find(Id);
1525 NewId != OldToNewContextIds.end())
1526 NewIds.insert_range(NewId->second);
1527 return NewIds;
1528 };
1529
1530 // Recursively update context ids sets along caller edges.
1531 auto UpdateCallers = [&](ContextNode *Node,
1532 DenseSet<const ContextEdge *> &Visited,
1533 auto &&UpdateCallers) -> void {
1534 for (const auto &Edge : Node->CallerEdges) {
1535 auto Inserted = Visited.insert(Edge.get());
1536 if (!Inserted.second)
1537 continue;
1538 ContextNode *NextNode = Edge->Caller;
1539 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
1540 // Only need to recursively iterate to NextNode via this caller edge if
1541 // it resulted in any added ids to NextNode.
1542 if (!NewIdsToAdd.empty()) {
1543 Edge->getContextIds().insert_range(NewIdsToAdd);
1544 UpdateCallers(NextNode, Visited, UpdateCallers);
1545 }
1546 }
1547 };
1548
1549 DenseSet<const ContextEdge *> Visited;
1550 for (auto &Entry : AllocationCallToContextNodeMap) {
1551 auto *Node = Entry.second;
1552 UpdateCallers(Node, Visited, UpdateCallers);
1553 }
1554}
1555
1556template <typename DerivedCCG, typename FuncTy, typename CallTy>
1557void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
1558 ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee,
1559 // This must be passed by value to make a copy since it will be adjusted
1560 // as ids are moved.
1561 DenseSet<uint32_t> RemainingContextIds) {
1562 auto &OrigEdges =
1563 TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
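  // When TowardsCallee is true we are moving the given context ids from
  // OrigNode's callee edges onto new edges attached to NewNode; otherwise we
  // do the same for its caller edges.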
1564 DenseSet<uint32_t> RecursiveContextIds;
1565 DenseSet<uint32_t> AllCallerContextIds;
1566  if (AllowRecursiveCallsites) {
1567    // Identify which context ids are recursive, which is needed to properly
1568    // update the RemainingContextIds set. The relevant recursive context ids
1569    // are those that appear in multiple edges.
1570 for (auto &CE : OrigEdges) {
1571 AllCallerContextIds.reserve(CE->getContextIds().size());
1572 for (auto Id : CE->getContextIds())
1573 if (!AllCallerContextIds.insert(Id).second)
1574 RecursiveContextIds.insert(Id);
1575 }
1576 }
1577 // Increment iterator in loop so that we can remove edges as needed.
1578 for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
1579 auto Edge = *EI;
1580 DenseSet<uint32_t> NewEdgeContextIds;
1581 DenseSet<uint32_t> NotFoundContextIds;
1582    // Remove any matching context ids from Edge and return the set that were
1583    // found and removed; these become the new edge's context ids. Also update
1584    // the remaining (not found) ids.
1585 set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
1586 NotFoundContextIds);
1587 // Update the remaining context ids set for the later edges. This is a
1588 // compile time optimization.
1589 if (RecursiveContextIds.empty()) {
1590 // No recursive ids, so all of the previously remaining context ids that
1591 // were not seen on this edge are the new remaining set.
1592 RemainingContextIds.swap(NotFoundContextIds);
1593 } else {
1594 // Keep the recursive ids in the remaining set as we expect to see those
1595 // on another edge. We can remove the non-recursive remaining ids that
1596 // were seen on this edge, however. We already have the set of remaining
1597 // ids that were on this edge (in NewEdgeContextIds). Figure out which are
1598 // non-recursive and only remove those. Note that despite the higher
1599 // overhead of updating the remaining context ids set when recursion
1600 // handling is enabled, it was found to be at worst performance neutral
1601 // and in one case a clear win.
1602 DenseSet<uint32_t> NonRecursiveRemainingCurEdgeIds =
1603 set_difference(NewEdgeContextIds, RecursiveContextIds);
1604 set_subtract(RemainingContextIds, NonRecursiveRemainingCurEdgeIds);
1605 }
1606 // If no matching context ids for this edge, skip it.
1607 if (NewEdgeContextIds.empty()) {
1608 ++EI;
1609 continue;
1610 }
1611 if (TowardsCallee) {
1612 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1613 auto NewEdge = std::make_shared<ContextEdge>(
1614 Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds));
1615 NewNode->CalleeEdges.push_back(NewEdge);
1616 NewEdge->Callee->CallerEdges.push_back(NewEdge);
1617 } else {
1618 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1619 auto NewEdge = std::make_shared<ContextEdge>(
1620 NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds));
1621 NewNode->CallerEdges.push_back(NewEdge);
1622 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
1623 }
1624 // Remove old edge if context ids empty.
1625 if (Edge->getContextIds().empty()) {
1626 removeEdgeFromGraph(Edge.get(), &EI, TowardsCallee);
1627 continue;
1628 }
1629 ++EI;
1630 }
1631}
1632
1633template <typename DerivedCCG, typename FuncTy, typename CallTy>
1634static void checkEdge(
1635 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
1636 // Confirm that alloc type is not None and that we have at least one context
1637 // id.
1638 assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
1639 assert(!Edge->ContextIds.empty());
1640}
1641
1642template <typename DerivedCCG, typename FuncTy, typename CallTy>
1643static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
1644 bool CheckEdges = true) {
1645 if (Node->isRemoved())
1646 return;
1647#ifndef NDEBUG
1648 // Compute node's context ids once for use in asserts.
1649 auto NodeContextIds = Node->getContextIds();
1650#endif
1651 // Node's context ids should be the union of both its callee and caller edge
1652 // context ids.
1653 if (Node->CallerEdges.size()) {
1654 DenseSet<uint32_t> CallerEdgeContextIds(
1655 Node->CallerEdges.front()->ContextIds);
1656 for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) {
1657 if (CheckEdges)
1658        checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1659      set_union(CallerEdgeContextIds, Edge->ContextIds);
1660 }
1661 // Node can have more context ids than callers if some contexts terminate at
1662 // node and some are longer. If we are allowing recursive callsites and
1663 // contexts this will be violated for incompletely cloned recursive cycles,
1664 // so skip the checking in that case.
1665    assert(AllowRecursiveCallsites || AllowRecursiveContexts ||
1666           NodeContextIds == CallerEdgeContextIds ||
1667 set_is_subset(CallerEdgeContextIds, NodeContextIds));
1668 }
1669 if (Node->CalleeEdges.size()) {
1670 DenseSet<uint32_t> CalleeEdgeContextIds(
1671 Node->CalleeEdges.front()->ContextIds);
1672 for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) {
1673 if (CheckEdges)
1674        checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1675      set_union(CalleeEdgeContextIds, Edge->getContextIds());
1676 }
1677 // If we are allowing recursive callsites and contexts this will be violated
1678 // for incompletely cloned recursive cycles, so skip the checking in that
1679 // case.
1680    assert(AllowRecursiveCallsites || AllowRecursiveContexts ||
1681           NodeContextIds == CalleeEdgeContextIds);
1682 }
1683 // FIXME: Since this checking is only invoked under an option, we should
1684 // change the error checking from using assert to something that will trigger
1685 // an error on a release build.
1686#ifndef NDEBUG
1687 // Make sure we don't end up with duplicate edges between the same caller and
1688 // callee.
1689  DenseSet<ContextNode<DerivedCCG, FuncTy, CallTy> *> NodeSet;
1690  for (const auto &E : Node->CalleeEdges)
1691 NodeSet.insert(E->Callee);
1692 assert(NodeSet.size() == Node->CalleeEdges.size());
1693#endif
1694}
1695
1696template <typename DerivedCCG, typename FuncTy, typename CallTy>
1697void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1698 assignStackNodesPostOrder(ContextNode *Node,
1699 DenseSet<const ContextNode *> &Visited,
1700 DenseMap<uint64_t, std::vector<CallContextInfo>>
1701 &StackIdToMatchingCalls,
1702 DenseMap<CallInfo, CallInfo> &CallToMatchingCall,
1703 const DenseSet<uint32_t> &ImportantContextIds) {
1704 auto Inserted = Visited.insert(Node);
1705 if (!Inserted.second)
1706 return;
1707 // Post order traversal. Iterate over a copy since we may add nodes and
1708 // therefore new callers during the recursive call, invalidating any
1709 // iterator over the original edge vector. We don't need to process these
1710 // new nodes as they were already processed on creation.
1711 auto CallerEdges = Node->CallerEdges;
1712 for (auto &Edge : CallerEdges) {
1713 // Skip any that have been removed during the recursion.
1714 if (Edge->isRemoved()) {
1715 assert(!is_contained(Node->CallerEdges, Edge));
1716 continue;
1717 }
1718 assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls,
1719 CallToMatchingCall, ImportantContextIds);
1720 }
1721
1722 // If this node's stack id is in the map, update the graph to contain new
1723 // nodes representing any inlining at interior callsites. Note we move the
1724 // associated context ids over to the new nodes.
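  // For example (illustrative): if a call's callsite metadata carries stack
  // ids {A, B} because frame A was inlined into frame B, a single new node is
  // created for the {A, B} sequence and the matching context ids are moved to
  // it by the handling below.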
1725
1726 // Ignore this node if it is for an allocation or we didn't record any
1727 // stack id lists ending at it.
1728 if (Node->IsAllocation ||
1729 !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
1730 return;
1731
1732 auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
1733 // Handle the simple case first. A single call with a single stack id.
1734 // In this case there is no need to create any new context nodes, simply
1735 // assign the context node for stack id to this Call.
1736 if (Calls.size() == 1) {
1737 auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
1738 if (Ids.size() == 1) {
1739 assert(SavedContextIds.empty());
1740 // It should be this Node
1741 assert(Node == getNodeForStackId(Ids[0]));
1742 if (Node->Recursive)
1743 return;
1744 Node->setCall(Call);
1745 NonAllocationCallToContextNodeMap[Call] = Node;
1746 NodeToCallingFunc[Node] = Func;
1747 recordStackNode(Ids, Node, Node->getContextIds(), ImportantContextIds);
1748 return;
1749 }
1750 }
1751
1752#ifndef NDEBUG
1753 // Find the node for the last stack id, which should be the same
1754 // across all calls recorded for this id, and is this node's id.
1755 uint64_t LastId = Node->OrigStackOrAllocId;
1756 ContextNode *LastNode = getNodeForStackId(LastId);
1757 // We should only have kept stack ids that had nodes.
1758 assert(LastNode);
1759 assert(LastNode == Node);
1760#else
1761 ContextNode *LastNode = Node;
1762#endif
1763
1764 // Compute the last node's context ids once, as it is shared by all calls in
1765 // this entry.
1766 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1767
1768 [[maybe_unused]] bool PrevIterCreatedNode = false;
1769 bool CreatedNode = false;
1770 for (unsigned I = 0; I < Calls.size();
1771 I++, PrevIterCreatedNode = CreatedNode) {
1772 CreatedNode = false;
1773 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1774 // Skip any for which we didn't assign any ids, these don't get a node in
1775 // the graph.
1776 if (SavedContextIds.empty()) {
1777 // If this call has a matching call (located in the same function and
1778 // having the same stack ids), simply add it to the context node created
1779 // for its matching call earlier. These can be treated the same through
1780 // cloning and get updated at the same time.
1781 if (!CallToMatchingCall.contains(Call))
1782 continue;
1783 auto MatchingCall = CallToMatchingCall[Call];
1784 if (!NonAllocationCallToContextNodeMap.contains(MatchingCall)) {
1785 // This should only happen if we had a prior iteration, and it didn't
1786 // create a node because of the below recomputation of context ids
1787 // finding none remaining and continuing early.
1788 assert(I > 0 && !PrevIterCreatedNode);
1789 continue;
1790 }
1791 NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back(
1792 Call);
1793 continue;
1794 }
1795
1796 assert(LastId == Ids.back());
1797
1798 // Recompute the context ids for this stack id sequence (the
1799 // intersection of the context ids of the corresponding nodes).
1800 // Start with the ids we saved in the map for this call, which could be
1801    // duplicated context ids. We have to recompute as we might have overlap
1802    // between the saved context ids for different last nodes, and
1803 // removed them already during the post order traversal.
1804 set_intersect(SavedContextIds, LastNodeContextIds);
1805 ContextNode *PrevNode = LastNode;
1806 bool Skip = false;
1807 // Iterate backwards through the stack Ids, starting after the last Id
1808 // in the list, which was handled once outside for all Calls.
1809 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1810 auto Id = *IdIter;
1811 ContextNode *CurNode = getNodeForStackId(Id);
1812 // We should only have kept stack ids that had nodes and weren't
1813 // recursive.
1814 assert(CurNode);
1815 assert(!CurNode->Recursive);
1816
1817 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1818 if (!Edge) {
1819 Skip = true;
1820 break;
1821 }
1822 PrevNode = CurNode;
1823
1824 // Update the context ids, which is the intersection of the ids along
1825 // all edges in the sequence.
1826 set_intersect(SavedContextIds, Edge->getContextIds());
1827
1828 // If we now have no context ids for clone, skip this call.
1829 if (SavedContextIds.empty()) {
1830 Skip = true;
1831 break;
1832 }
1833 }
1834 if (Skip)
1835 continue;
1836
1837 // Create new context node.
1838 ContextNode *NewNode = createNewNode(/*IsAllocation=*/false, Func, Call);
1839 NonAllocationCallToContextNodeMap[Call] = NewNode;
1840 CreatedNode = true;
1841 NewNode->AllocTypes = computeAllocType(SavedContextIds);
1842
1843 ContextNode *FirstNode = getNodeForStackId(Ids[0]);
1844 assert(FirstNode);
1845
1846 // Connect to callees of innermost stack frame in inlined call chain.
1847    // This updates context ids for FirstNode's callees to reflect those
1848 // moved to NewNode.
1849 connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true, SavedContextIds);
1850
1851 // Connect to callers of outermost stack frame in inlined call chain.
1852    // This updates context ids for LastNode's callers to reflect those
1853 // moved to NewNode.
1854 connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false, SavedContextIds);
1855
1856 // Now we need to remove context ids from edges/nodes between First and
1857 // Last Node.
1858 PrevNode = nullptr;
1859 for (auto Id : Ids) {
1860 ContextNode *CurNode = getNodeForStackId(Id);
1861 // We should only have kept stack ids that had nodes.
1862 assert(CurNode);
1863
1864 // Remove the context ids moved to NewNode from CurNode, and the
1865 // edge from the prior node.
1866 if (PrevNode) {
1867 auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
1868 // If the sequence contained recursion, we might have already removed
1869 // some edges during the connectNewNode calls above.
1870 if (!PrevEdge) {
1871 PrevNode = CurNode;
1872 continue;
1873 }
1874 set_subtract(PrevEdge->getContextIds(), SavedContextIds);
1875 if (PrevEdge->getContextIds().empty())
1876 removeEdgeFromGraph(PrevEdge);
1877 }
1878 // Since we update the edges from leaf to tail, only look at the callee
1879 // edges. This isn't an alloc node, so if there are no callee edges, the
1880 // alloc type is None.
1881 CurNode->AllocTypes = CurNode->CalleeEdges.empty()
1882 ? (uint8_t)AllocationType::None
1883 : CurNode->computeAllocType();
1884 PrevNode = CurNode;
1885 }
1886
1887 recordStackNode(Ids, NewNode, SavedContextIds, ImportantContextIds);
1888
1889 if (VerifyNodes) {
1890 checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true);
1891 for (auto Id : Ids) {
1892 ContextNode *CurNode = getNodeForStackId(Id);
1893 // We should only have kept stack ids that had nodes.
1894 assert(CurNode);
1895 checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true);
1896 }
1897 }
1898 }
1899}
1900
1901template <typename DerivedCCG, typename FuncTy, typename CallTy>
1902void CallsiteContextGraph<DerivedCCG, FuncTy,
1903 CallTy>::fixupImportantContexts() {
1904 if (ImportantContextIdInfo.empty())
1905 return;
1906
1907 // Update statistics as we are done building this map at this point.
1908 NumImportantContextIds = ImportantContextIdInfo.size();
1909
1911 return;
1912
1913 if (ExportToDot)
1914 exportToDot("beforestackfixup");
1915
1916 // For each context we identified as important, walk through the saved context
1917 // stack ids in order from leaf upwards, and make sure all edges are correct.
1918 // These can be difficult to get right when updating the graph while mapping
1919 // nodes onto summary or IR, especially when there is recursion. In
1920 // particular, when we have created new nodes to reflect inlining, it is
1921 // sometimes impossible to know exactly how to update the edges in the face of
1922 // recursion, as we have lost the original ordering of the stack ids in the
1923 // contexts.
1924 // TODO: Consider only doing this if we detect the context has recursive
1925 // cycles.
1926 //
1927 // I.e. assume we have a context with stack ids like: {A B A C A D E}
1928 // and let's say A was inlined into B, C, and D. The original graph will have
1929 // multiple recursive cycles through A. When we match the original context
1930 // nodes onto the IR or summary, we will merge {A B} into one context node,
1931 // {A C} onto another, and {A D} onto another. Looking at the stack sequence
1932 // above, we should end up with a non-cyclic set of edges like:
1933 // {AB} <- {AC} <- {AD} <- E. However, because we normally have lost the
1934 // original ordering, we won't get the edges correct initially (it's
1935  // impossible without the original ordering). Here we do the fixup (adding
1936  // and removing edges where necessary) for this context. In the
1937 // ImportantContextInfo struct in this case we should have a MaxLength = 2,
1938 // and map entries for {A B}, {A C}, {A D}, and {E}.
1939 for (auto &[CurContextId, Info] : ImportantContextIdInfo) {
1940 if (Info.StackIdsToNode.empty())
1941 continue;
1942 bool Changed = false;
1943 ContextNode *PrevNode = nullptr;
1944 ContextNode *CurNode = nullptr;
1945 DenseSet<const ContextEdge *> VisitedEdges;
1946 ArrayRef<uint64_t> AllStackIds(Info.StackIds);
1947 // Try to identify what callsite ContextNode maps to which slice of the
1948 // context's ordered stack ids.
1949 for (unsigned I = 0; I < AllStackIds.size(); I++, PrevNode = CurNode) {
1950 // We will do this greedily, trying up to MaxLength stack ids in a row, to
1951 // see if we recorded a context node for that sequence.
1952 auto Len = Info.MaxLength;
1953 auto LenToEnd = AllStackIds.size() - I;
1954 if (Len > LenToEnd)
1955 Len = LenToEnd;
1956 CurNode = nullptr;
1957 // Try to find a recorded context node starting with the longest length
1958 // recorded, and on down until we check for just a single stack node.
1959 for (; Len > 0; Len--) {
1960 // Get the slice of the original stack id sequence to check.
1961 auto CheckStackIds = AllStackIds.slice(I, Len);
1962 auto EntryIt = Info.StackIdsToNode.find(CheckStackIds);
1963 if (EntryIt == Info.StackIdsToNode.end())
1964 continue;
1965 CurNode = EntryIt->second;
1966 // Skip forward so we don't try to look for the ones we just matched.
1967 // We increment by Len - 1, because the outer for loop will increment I.
1968 I += Len - 1;
1969 break;
1970 }
1971 // Give up if we couldn't find a node. Since we need to clone from the
1972      // leaf allocation upwards, no sense in doing any more fixup further up
1973 // the context if we couldn't match part of the original stack context
1974 // onto a callsite node.
1975 if (!CurNode)
1976 break;
1977 // No edges to fix up until we have a pair of nodes that should be
1978 // adjacent in the graph.
1979 if (!PrevNode)
1980 continue;
1981 // See if we already have a call edge from CurNode to PrevNode.
1982 auto *CurEdge = PrevNode->findEdgeFromCaller(CurNode);
1983 if (CurEdge) {
1984 // We already have an edge. Make sure it contains this context id.
1985 if (CurEdge->getContextIds().insert(CurContextId).second) {
1986 NumFixupEdgeIdsInserted++;
1987 Changed = true;
1988 }
1989 } else {
1990 // No edge exists - add one.
1991 NumFixupEdgesAdded++;
1992 DenseSet<uint32_t> ContextIds({CurContextId});
1993 auto AllocType = computeAllocType(ContextIds);
1994 auto NewEdge = std::make_shared<ContextEdge>(
1995 PrevNode, CurNode, AllocType, std::move(ContextIds));
1996 PrevNode->CallerEdges.push_back(NewEdge);
1997 CurNode->CalleeEdges.push_back(NewEdge);
1998 // Save the new edge for the below handling.
1999 CurEdge = NewEdge.get();
2000 Changed = true;
2001 }
2002 VisitedEdges.insert(CurEdge);
2003 // Now remove this context id from any other caller edges calling
2004 // PrevNode.
2005 for (auto &Edge : PrevNode->CallerEdges) {
2006 // Skip the edge updating/created above and edges we have already
2007 // visited (due to recursion).
2008 if (Edge.get() != CurEdge && !VisitedEdges.contains(Edge.get()))
2009 Edge->getContextIds().erase(CurContextId);
2010 }
2011 }
2012 if (Changed)
2013 NumFixedContexts++;
2014 }
2015}
2016
2017template <typename DerivedCCG, typename FuncTy, typename CallTy>
2018void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
2019 // Map of stack id to all calls with that as the last (outermost caller)
2020 // callsite id that has a context node (some might not due to pruning
2021 // performed during matching of the allocation profile contexts).
2022 // The CallContextInfo contains the Call and a list of its stack ids with
2023 // ContextNodes, the function containing Call, and the set of context ids
2024 // the analysis will eventually identify for use in any new node created
2025 // for that callsite.
2026 DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
2027 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
2028 for (auto &Call : CallsWithMetadata) {
2029 // Ignore allocations, already handled.
2030 if (AllocationCallToContextNodeMap.count(Call))
2031 continue;
2032 auto StackIdsWithContextNodes =
2033 getStackIdsWithContextNodesForCall(Call.call());
2034 // If there were no nodes created for MIBs on allocs (maybe this was in
2035 // the unambiguous part of the MIB stack that was pruned), ignore.
2036 if (StackIdsWithContextNodes.empty())
2037 continue;
2038 // Otherwise, record this Call along with the list of ids for the last
2039 // (outermost caller) stack id with a node.
2040 StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
2041 {Call.call(), StackIdsWithContextNodes, Func, {}});
2042 }
2043 }
2044
2045 // First make a pass through all stack ids that correspond to a call,
2046 // as identified in the above loop. Compute the context ids corresponding to
2047  // each of these calls when they correspond to multiple stack ids due to
2048  // inlining. Perform any duplication of context ids required when
2049 // there is more than one call with the same stack ids. Their (possibly newly
2050 // duplicated) context ids are saved in the StackIdToMatchingCalls map.
2051 DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
2052 // Save a map from each call to any that are found to match it. I.e. located
2053 // in the same function and have the same (possibly pruned) stack ids. We use
2054 // this to avoid creating extra graph nodes as they can be treated the same.
2055 DenseMap<CallInfo, CallInfo> CallToMatchingCall;
2056 for (auto &It : StackIdToMatchingCalls) {
2057 auto &Calls = It.getSecond();
2058 // Skip single calls with a single stack id. These don't need a new node.
2059 if (Calls.size() == 1) {
2060 auto &Ids = Calls[0].StackIds;
2061 if (Ids.size() == 1)
2062 continue;
2063 }
2064 // In order to do the best and maximal matching of inlined calls to context
2065 // node sequences we will sort the vectors of stack ids in descending order
2066 // of length, and within each length, lexicographically by stack id. The
2067 // latter is so that we can specially handle calls that have identical stack
2068 // id sequences (either due to cloning or artificially because of the MIB
2069 // context pruning). Those with the same Ids are then sorted by function to
2070 // facilitate efficiently mapping them to the same context node.
2071 // Because the functions are pointers, to ensure a stable sort first assign
2072 // each function pointer to its first index in the Calls array, and then use
2073 // that to sort by.
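    // E.g. (illustrative) the stack id lists {5,6,7}, {6,7} and {5,6} would be
    // ordered {5,6,7}, {5,6}, {6,7}: longest first, then lexicographically,
    // with ties broken by the calls' first function index assigned just below.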
2074 DenseMap<const FuncTy *, unsigned> FuncToIndex;
2075 for (const auto &[Idx, CallCtxInfo] : enumerate(Calls))
2076 FuncToIndex.insert({CallCtxInfo.Func, Idx});
2077    llvm::stable_sort(
2078        Calls,
2079 [&FuncToIndex](const CallContextInfo &A, const CallContextInfo &B) {
2080 return A.StackIds.size() > B.StackIds.size() ||
2081 (A.StackIds.size() == B.StackIds.size() &&
2082 (A.StackIds < B.StackIds ||
2083 (A.StackIds == B.StackIds &&
2084 FuncToIndex[A.Func] < FuncToIndex[B.Func])));
2085 });
2086
2087 // Find the node for the last stack id, which should be the same
2088 // across all calls recorded for this id, and is the id for this
2089 // entry in the StackIdToMatchingCalls map.
2090 uint64_t LastId = It.getFirst();
2091 ContextNode *LastNode = getNodeForStackId(LastId);
2092 // We should only have kept stack ids that had nodes.
2093 assert(LastNode);
2094
2095 if (LastNode->Recursive)
2096 continue;
2097
2098 // Initialize the context ids with the last node's. We will subsequently
2099 // refine the context ids by computing the intersection along all edges.
2100 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
2101 assert(!LastNodeContextIds.empty());
2102
2103#ifndef NDEBUG
2104 // Save the set of functions seen for a particular set of the same stack
2105 // ids. This is used to ensure that they have been correctly sorted to be
2106 // adjacent in the Calls list, since we rely on that to efficiently place
2107 // all such matching calls onto the same context node.
2108 DenseSet<const FuncTy *> MatchingIdsFuncSet;
2109#endif
2110
2111 for (unsigned I = 0; I < Calls.size(); I++) {
2112 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
2113 assert(SavedContextIds.empty());
2114 assert(LastId == Ids.back());
2115
2116#ifndef NDEBUG
2117 // If this call has a different set of ids than the last one, clear the
2118 // set used to ensure they are sorted properly.
2119 if (I > 0 && Ids != Calls[I - 1].StackIds)
2120 MatchingIdsFuncSet.clear();
2121#endif
2122
2123 // First compute the context ids for this stack id sequence (the
2124 // intersection of the context ids of the corresponding nodes).
2125 // Start with the remaining saved ids for the last node.
2126 assert(!LastNodeContextIds.empty());
2127 DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
2128
2129 ContextNode *PrevNode = LastNode;
2130 ContextNode *CurNode = LastNode;
2131 bool Skip = false;
2132
2133 // Iterate backwards through the stack Ids, starting after the last Id
2134 // in the list, which was handled once outside for all Calls.
2135 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
2136 auto Id = *IdIter;
2137 CurNode = getNodeForStackId(Id);
2138 // We should only have kept stack ids that had nodes.
2139 assert(CurNode);
2140
2141 if (CurNode->Recursive) {
2142 Skip = true;
2143 break;
2144 }
2145
2146 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
2147 // If there is no edge then the nodes belong to different MIB contexts,
2148 // and we should skip this inlined context sequence. For example, this
2149 // particular inlined context may include stack ids A->B, and we may
2150 // indeed have nodes for both A and B, but it is possible that they were
2151 // never profiled in sequence in a single MIB for any allocation (i.e.
2152 // we might have profiled an allocation that involves the callsite A,
2153 // but through a different one of its callee callsites, and we might
2154 // have profiled an allocation that involves callsite B, but reached
2155 // from a different caller callsite).
2156 if (!Edge) {
2157 Skip = true;
2158 break;
2159 }
2160 PrevNode = CurNode;
2161
2162 // Update the context ids, which is the intersection of the ids along
2163 // all edges in the sequence.
2164 set_intersect(StackSequenceContextIds, Edge->getContextIds());
2165
2166 // If we now have no context ids for clone, skip this call.
2167 if (StackSequenceContextIds.empty()) {
2168 Skip = true;
2169 break;
2170 }
2171 }
2172 if (Skip)
2173 continue;
2174
2175 // If some of this call's stack ids did not have corresponding nodes (due
2176 // to pruning), don't include any context ids for contexts that extend
2177 // beyond these nodes. Otherwise we would be matching part of unrelated /
2178 // not fully matching stack contexts. To do this, subtract any context ids
2179 // found in caller nodes of the last node found above.
2180 if (Ids.back() != getLastStackId(Call)) {
2181 for (const auto &PE : LastNode->CallerEdges) {
2182 set_subtract(StackSequenceContextIds, PE->getContextIds());
2183 if (StackSequenceContextIds.empty())
2184 break;
2185 }
2186 // If we now have no context ids for clone, skip this call.
2187 if (StackSequenceContextIds.empty())
2188 continue;
2189 }
2190
2191#ifndef NDEBUG
2192 // If the prior call had the same stack ids this set would not be empty.
2193 // Check if we already have a call that "matches" because it is located
2194 // in the same function. If the Calls list was sorted properly we should
2195 // not encounter this situation as all such entries should be adjacent
2196 // and processed in bulk further below.
2197 assert(!MatchingIdsFuncSet.contains(Func));
2198
2199 MatchingIdsFuncSet.insert(Func);
2200#endif
2201
2202 // Check if the next set of stack ids is the same (since the Calls vector
2203 // of tuples is sorted by the stack ids we can just look at the next one).
2204 // If so, save them in the CallToMatchingCall map so that they get
2205 // assigned to the same context node, and skip them.
2206 bool DuplicateContextIds = false;
2207 for (unsigned J = I + 1; J < Calls.size(); J++) {
2208 auto &CallCtxInfo = Calls[J];
2209 auto &NextIds = CallCtxInfo.StackIds;
2210 if (NextIds != Ids)
2211 break;
2212 auto *NextFunc = CallCtxInfo.Func;
2213 if (NextFunc != Func) {
2214 // We have another Call with the same ids but that cannot share this
2215 // node, must duplicate ids for it.
2216 DuplicateContextIds = true;
2217 break;
2218 }
2219 auto &NextCall = CallCtxInfo.Call;
2220 CallToMatchingCall[NextCall] = Call;
2221 // Update I so that it gets incremented correctly to skip this call.
2222 I = J;
2223 }
2224
2225 // If we don't have duplicate context ids, then we can assign all the
2226 // context ids computed for the original node sequence to this call.
2227 // If there are duplicate calls with the same stack ids then we synthesize
2228 // new context ids that are duplicates of the originals. These are
2229 // assigned to SavedContextIds, which is a reference into the map entry
2230 // for this call, allowing us to access these ids later on.
2231 OldToNewContextIds.reserve(OldToNewContextIds.size() +
2232 StackSequenceContextIds.size());
2233 SavedContextIds =
2234 DuplicateContextIds
2235 ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
2236 : StackSequenceContextIds;
2237 assert(!SavedContextIds.empty());
2238
2239 if (!DuplicateContextIds) {
2240 // Update saved last node's context ids to remove those that are
2241 // assigned to other calls, so that it is ready for the next call at
2242 // this stack id.
2243 set_subtract(LastNodeContextIds, StackSequenceContextIds);
2244 if (LastNodeContextIds.empty())
2245 break;
2246 }
2247 }
2248 }
2249
2250 // Propagate the duplicate context ids over the graph.
2251 propagateDuplicateContextIds(OldToNewContextIds);
2252
2253 if (VerifyCCG)
2254 check();
2255
2256 // Now perform a post-order traversal over the graph, starting with the
2257 // allocation nodes, essentially processing nodes from callers to callees.
2258 // For any that contains an id in the map, update the graph to contain new
2259 // nodes representing any inlining at interior callsites. Note we move the
2260 // associated context ids over to the new nodes.
2261 DenseSet<const ContextNode *> Visited;
2262 DenseSet<uint32_t> ImportantContextIds(llvm::from_range,
2263 ImportantContextIdInfo.keys());
2264 for (auto &Entry : AllocationCallToContextNodeMap)
2265 assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls,
2266 CallToMatchingCall, ImportantContextIds);
2267
2268 fixupImportantContexts();
2269
2270 if (VerifyCCG)
2271 check();
2272}
2273
2274uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
2275 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
2276 Call->getMetadata(LLVMContext::MD_callsite));
2277 return CallsiteContext.back();
2278}
2279
2280uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
2281  assert(isa<CallsiteInfo *>(Call));
2282  CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
2283 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
2284 // Need to convert index into stack id.
2285 return Index.getStackIdAtIndex(CallsiteContext.back());
2286}
2287
2288static const std::string MemProfCloneSuffix = ".memprof.";
2289
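// E.g. getMemProfFuncName("foo", 2) produces "foo.memprof.2", while clone
// number 0 simply returns "foo".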
2290static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
2291 // We use CloneNo == 0 to refer to the original version, which doesn't get
2292 // renamed with a suffix.
2293 if (!CloneNo)
2294 return Base.str();
2295 return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
2296}
2297
2298static bool isMemProfClone(const Function &F) {
2299 return F.getName().contains(MemProfCloneSuffix);
2300}
2301
2302// Return the clone number of the given function by extracting it from the
2303// memprof suffix. Assumes the caller has already confirmed it is a memprof
2304// clone.
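// E.g. for a clone named "foo.memprof.3" this returns 3.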
2305static unsigned getMemProfCloneNum(const Function &F) {
2306  assert(isMemProfClone(F));
2307  auto Pos = F.getName().find_last_of('.');
2308 assert(Pos > 0);
2309 unsigned CloneNo;
2310 bool Err = F.getName().drop_front(Pos + 1).getAsInteger(10, CloneNo);
2311 assert(!Err);
2312 (void)Err;
2313 return CloneNo;
2314}
2315
2316std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
2317 const Instruction *Call,
2318 unsigned CloneNo) const {
2319 return (Twine(Call->getFunction()->getName()) + " -> " +
2320 cast<CallBase>(Call)->getCalledFunction()->getName())
2321 .str();
2322}
2323
2324std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
2325 const IndexCall &Call,
2326 unsigned CloneNo) const {
2327 auto VI = FSToVIMap.find(Func);
2328 assert(VI != FSToVIMap.end());
2329 std::string CallerName = getMemProfFuncName(VI->second.name(), CloneNo);
2330  if (isa<AllocInfo *>(Call))
2331    return CallerName + " -> alloc";
2332 else {
2333 auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call);
2334 return CallerName + " -> " +
2335 getMemProfFuncName(Callsite->Callee.name(),
2336 Callsite->Clones[CloneNo]);
2337 }
2338}
2339
2340std::vector<uint64_t>
2341ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
2342 Instruction *Call) {
2343 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
2344 Call->getMetadata(LLVMContext::MD_callsite));
2345 return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
2346 CallsiteContext);
2347}
2348
2349std::vector<uint64_t>
2350IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
2352 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
2353 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
2354 return getStackIdsWithContextNodes<CallsiteInfo,
2355 SmallVector<unsigned>::const_iterator>(
2356 CallsiteContext);
2357}
2358
2359template <typename DerivedCCG, typename FuncTy, typename CallTy>
2360template <class NodeT, class IteratorT>
2361std::vector<uint64_t>
2362CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
2363 CallStack<NodeT, IteratorT> &CallsiteContext) {
2364 std::vector<uint64_t> StackIds;
2365 for (auto IdOrIndex : CallsiteContext) {
2366 auto StackId = getStackId(IdOrIndex);
2367 ContextNode *Node = getNodeForStackId(StackId);
2368 if (!Node)
2369 break;
2370 StackIds.push_back(StackId);
2371 }
2372 return StackIds;
2373}
2374
2375ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
2376 Module &M,
2377 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
2378 : Mod(M), OREGetter(OREGetter) {
2379 // Map for keeping track of the largest cold contexts up to the number given
2380 // by MemProfTopNImportant. Must be a std::map (not DenseMap) because keys
2381 // must be sorted.
2382 std::map<uint64_t, uint32_t> TotalSizeToContextIdTopNCold;
2383 for (auto &F : M) {
2384 std::vector<CallInfo> CallsWithMetadata;
2385 for (auto &BB : F) {
2386 for (auto &I : BB) {
2387 if (!isa<CallBase>(I))
2388 continue;
2389 if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
2390 CallsWithMetadata.push_back(&I);
2391 auto *AllocNode = addAllocNode(&I, &F);
2392 auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
2393 assert(CallsiteMD);
2394 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
2395 // Add all of the MIBs and their stack nodes.
2396 for (auto &MDOp : MemProfMD->operands()) {
2397 auto *MIBMD = cast<const MDNode>(MDOp);
2398 std::vector<ContextTotalSize> ContextSizeInfo;
2399 // Collect the context size information if it exists.
2400 if (MIBMD->getNumOperands() > 2) {
2401 for (unsigned I = 2; I < MIBMD->getNumOperands(); I++) {
2402 MDNode *ContextSizePair =
2403 dyn_cast<MDNode>(MIBMD->getOperand(I));
2404 assert(ContextSizePair->getNumOperands() == 2);
2405            uint64_t FullStackId = mdconst::dyn_extract<ConstantInt>(
2406                                       ContextSizePair->getOperand(0))
2407                                       ->getZExtValue();
2408            uint64_t TotalSize = mdconst::dyn_extract<ConstantInt>(
2409                                     ContextSizePair->getOperand(1))
2410                                     ->getZExtValue();
2411 ContextSizeInfo.push_back({FullStackId, TotalSize});
2412 }
2413 }
2414          MDNode *StackNode = getMIBStackNode(MIBMD);
2415          assert(StackNode);
2416          CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
2417          addStackNodesForMIB<MDNode, MDNode::op_iterator>(
2418 AllocNode, StackContext, CallsiteContext,
2419 getMIBAllocType(MIBMD), ContextSizeInfo,
2420 TotalSizeToContextIdTopNCold);
2421 }
2422 // If exporting the graph to dot and an allocation id of interest was
2423 // specified, record all the context ids for this allocation node.
2424 if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
2425 DotAllocContextIds = AllocNode->getContextIds();
2426 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2427 // Memprof and callsite metadata on memory allocations no longer
2428 // needed.
2429 I.setMetadata(LLVMContext::MD_memprof, nullptr);
2430 I.setMetadata(LLVMContext::MD_callsite, nullptr);
2431 }
2432 // For callsite metadata, add to list for this function for later use.
2433 else if (I.getMetadata(LLVMContext::MD_callsite)) {
2434 CallsWithMetadata.push_back(&I);
2435 }
2436 }
2437 }
2438 if (!CallsWithMetadata.empty())
2439 FuncToCallsWithMetadata[&F] = CallsWithMetadata;
2440 }
2441
2442 if (DumpCCG) {
2443 dbgs() << "CCG before updating call stack chains:\n";
2444 dbgs() << *this;
2445 }
2446
2447 if (ExportToDot)
2448 exportToDot("prestackupdate");
2449
2450 updateStackNodes();
2451
2452 if (ExportToDot)
2453 exportToDot("poststackupdate");
2454
2455 handleCallsitesWithMultipleTargets();
2456
2457 markBackedges();
2458
2459 // Strip off remaining callsite metadata, no longer needed.
2460 for (auto &FuncEntry : FuncToCallsWithMetadata)
2461 for (auto &Call : FuncEntry.second)
2462 Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
2463}
2464
2465IndexCallsiteContextGraph::IndexCallsiteContextGraph(
2466 ModuleSummaryIndex &Index,
2467    llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
2468        isPrevailing)
2469 : Index(Index), isPrevailing(isPrevailing) {
2470 // Map for keeping track of the largest cold contexts up to the number given
2471 // by MemProfTopNImportant. Must be a std::map (not DenseMap) because keys
2472 // must be sorted.
2473 std::map<uint64_t, uint32_t> TotalSizeToContextIdTopNCold;
2474 for (auto &I : Index) {
2475 auto VI = Index.getValueInfo(I);
2476 for (auto &S : VI.getSummaryList()) {
2477 // We should only add the prevailing nodes. Otherwise we may try to clone
2478 // in a weak copy that won't be linked (and may be different than the
2479 // prevailing version).
2480 // We only keep the memprof summary on the prevailing copy now when
2481        // building the combined index, as a space optimization; however, don't
2482 // rely on this optimization. The linker doesn't resolve local linkage
2483 // values so don't check whether those are prevailing.
2484 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2485 !isPrevailing(VI.getGUID(), S.get()))
2486 continue;
2487 auto *FS = dyn_cast<FunctionSummary>(S.get());
2488 if (!FS)
2489 continue;
2490 std::vector<CallInfo> CallsWithMetadata;
2491 if (!FS->allocs().empty()) {
2492 for (auto &AN : FS->mutableAllocs()) {
2493 // This can happen because of recursion elimination handling that
2494 // currently exists in ModuleSummaryAnalysis. Skip these for now.
2495 // We still added them to the summary because we need to be able to
2496 // correlate properly in applyImport in the backends.
2497 if (AN.MIBs.empty())
2498 continue;
2499 IndexCall AllocCall(&AN);
2500 CallsWithMetadata.push_back(AllocCall);
2501 auto *AllocNode = addAllocNode(AllocCall, FS);
2502 // Pass an empty CallStack to the CallsiteContext (second)
2503 // parameter, since for ThinLTO we already collapsed out the inlined
2504 // stack ids on the allocation call during ModuleSummaryAnalysis.
2505          CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
2506              EmptyContext;
2507 unsigned I = 0;
2508          assert(AN.ContextSizeInfos.empty() ||
2509                 AN.ContextSizeInfos.size() == AN.MIBs.size());
2510 // Now add all of the MIBs and their stack nodes.
2511 for (auto &MIB : AN.MIBs) {
2512            CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
2513                StackContext(&MIB);
2514 std::vector<ContextTotalSize> ContextSizeInfo;
2515 if (!AN.ContextSizeInfos.empty()) {
2516 for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I])
2517 ContextSizeInfo.push_back({FullStackId, TotalSize});
2518 }
2519 addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
2520 AllocNode, StackContext, EmptyContext, MIB.AllocType,
2521 ContextSizeInfo, TotalSizeToContextIdTopNCold);
2522 I++;
2523 }
2524 // If exporting the graph to dot and an allocation id of interest was
2525 // specified, record all the context ids for this allocation node.
2526 if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
2527 DotAllocContextIds = AllocNode->getContextIds();
2528 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2529          // Initialize version 0 on the summary alloc node to the current alloc
2530          // type, unless it has both types, in which case make it default, so
2531          // that in the case where we aren't able to clone, the original version
2532          // always ends up with the default allocation behavior.
2533 AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
2534 }
2535 }
2536 // For callsite metadata, add to list for this function for later use.
2537 if (!FS->callsites().empty())
2538 for (auto &SN : FS->mutableCallsites()) {
2539 IndexCall StackNodeCall(&SN);
2540 CallsWithMetadata.push_back(StackNodeCall);
2541 }
2542
2543 if (!CallsWithMetadata.empty())
2544 FuncToCallsWithMetadata[FS] = CallsWithMetadata;
2545
2546 if (!FS->allocs().empty() || !FS->callsites().empty())
2547 FSToVIMap[FS] = VI;
2548 }
2549 }
2550
2551 if (DumpCCG) {
2552 dbgs() << "CCG before updating call stack chains:\n";
2553 dbgs() << *this;
2554 }
2555
2556 if (ExportToDot)
2557 exportToDot("prestackupdate");
2558
2559 updateStackNodes();
2560
2561 if (ExportToDot)
2562 exportToDot("poststackupdate");
2563
2564 handleCallsitesWithMultipleTargets();
2565
2566 markBackedges();
2567}
2568
2569template <typename DerivedCCG, typename FuncTy, typename CallTy>
2570void CallsiteContextGraph<DerivedCCG, FuncTy,
2571 CallTy>::handleCallsitesWithMultipleTargets() {
2572 // Look for and workaround callsites that call multiple functions.
2573  // This can happen for indirect calls, which need better handling, and in
2574  // rarer cases (e.g. macro expansion).
2575 // TODO: To fix this for indirect calls we will want to perform speculative
2576 // devirtualization using either the normal PGO info with ICP, or using the
2577 // information in the profiled MemProf contexts. We can do this prior to
2578 // this transformation for regular LTO, and for ThinLTO we can simulate that
2579 // effect in the summary and perform the actual speculative devirtualization
2580 // while cloning in the ThinLTO backend.
2581
2582 // Keep track of the new nodes synthesized for discovered tail calls missing
2583 // from the profiled contexts.
2584 MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;
2585
2586 std::vector<std::pair<CallInfo, ContextNode *>> NewCallToNode;
2587 for (auto &Entry : NonAllocationCallToContextNodeMap) {
2588 auto *Node = Entry.second;
2589 assert(Node->Clones.empty());
2590 // Check all node callees and see if in the same function.
2591 // We need to check all of the calls recorded in this Node, because in some
2592 // cases we may have had multiple calls with the same debug info calling
2593 // different callees. This can happen, for example, when an object is
2594    // constructed in the parameter list - the destructor call of the object has
2595 // the same debug info (line/col) as the call the object was passed to.
2596 // Here we will prune any that don't match all callee nodes.
2597 std::vector<CallInfo> AllCalls;
2598 AllCalls.reserve(Node->MatchingCalls.size() + 1);
2599 AllCalls.push_back(Node->Call);
2600 llvm::append_range(AllCalls, Node->MatchingCalls);
2601
2602 // First see if we can partition the calls by callee function, creating new
2603 // nodes to host each set of calls calling the same callees. This is
2604    // necessary to support indirect calls with ThinLTO, for which we
2605 // synthesized CallsiteInfo records for each target. They will all have the
2606 // same callsite stack ids and would be sharing a context node at this
2607 // point. We need to perform separate cloning for each, which will be
2608 // applied along with speculative devirtualization in the ThinLTO backends
2609 // as needed. Note this does not currently support looking through tail
2610 // calls, it is unclear if we need that for indirect call targets.
2611 // First partition calls by callee func. Map indexed by func, value is
2612 // struct with list of matching calls, assigned node.
2613 if (partitionCallsByCallee(Node, AllCalls, NewCallToNode))
2614 continue;
2615
2616 auto It = AllCalls.begin();
2617 // Iterate through the calls until we find the first that matches.
2618 for (; It != AllCalls.end(); ++It) {
2619 auto ThisCall = *It;
2620 bool Match = true;
2621 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();
2622 ++EI) {
2623 auto Edge = *EI;
2624 if (!Edge->Callee->hasCall())
2625 continue;
2626 assert(NodeToCallingFunc.count(Edge->Callee));
2627 // Check if the called function matches that of the callee node.
2628 if (!calleesMatch(ThisCall.call(), EI, TailCallToContextNodeMap)) {
2629 Match = false;
2630 break;
2631 }
2632 }
2633 // Found a call that matches the callee nodes, we can quit now.
2634 if (Match) {
2635 // If the first match is not the primary call on the Node, update it
2636 // now. We will update the list of matching calls further below.
2637 if (Node->Call != ThisCall) {
2638 Node->setCall(ThisCall);
2639 // We need to update the NonAllocationCallToContextNodeMap, but don't
2640 // want to do this during iteration over that map, so save the calls
2641 // that need updated entries.
2642 NewCallToNode.push_back({ThisCall, Node});
2643 }
2644 break;
2645 }
2646 }
2647 // We will update this list below (or leave it cleared if there was no
2648 // match found above).
2649 Node->MatchingCalls.clear();
2650 // If we hit the end of the AllCalls vector, no call matching the callee
2651 // nodes was found, clear the call information in the node.
2652 if (It == AllCalls.end()) {
2653 RemovedEdgesWithMismatchedCallees++;
2654 // Work around by setting Node to have a null call, so it gets
2655 // skipped during cloning. Otherwise assignFunctions will assert
2656 // because its data structures are not designed to handle this case.
2657 Node->setCall(CallInfo());
2658 continue;
2659 }
2660 // Now add back any matching calls that call the same function as the
2661 // matching primary call on Node.
2662 for (++It; It != AllCalls.end(); ++It) {
2663 auto ThisCall = *It;
2664 if (!sameCallee(Node->Call.call(), ThisCall.call()))
2665 continue;
2666 Node->MatchingCalls.push_back(ThisCall);
2667 }
2668 }
2669
2670 // Remove all mismatched nodes identified in the above loop from the node map
2671 // (checking whether they have a null call which is set above). For a
2672 // MapVector like NonAllocationCallToContextNodeMap it is much more efficient
2673 // to do the removal via remove_if than by individually erasing entries above.
2674 // Also remove any entries if we updated the node's primary call above.
2675 NonAllocationCallToContextNodeMap.remove_if([](const auto &it) {
2676 return !it.second->hasCall() || it.second->Call != it.first;
2677 });
2678
2679 // Add entries for any new primary calls recorded above.
2680 for (auto &[Call, Node] : NewCallToNode)
2681 NonAllocationCallToContextNodeMap[Call] = Node;
2682
2683 // Add the new nodes after the above loop so that the iteration is not
2684 // invalidated.
2685 for (auto &[Call, Node] : TailCallToContextNodeMap)
2686 NonAllocationCallToContextNodeMap[Call] = Node;
2687}
2688
2689template <typename DerivedCCG, typename FuncTy, typename CallTy>
2690bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::partitionCallsByCallee(
2691 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
2692 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode) {
2693  // Struct to keep track of all the calls having the same callee function.
2694  // Eventually we will also record here the context node assigned to this
2695  // group of calls.
2696 struct CallsWithSameCallee {
2697 std::vector<CallInfo> Calls;
2698 ContextNode *Node = nullptr;
2699 };
2700
2701 // First partition calls by callee function. Build map from each function
2702 // to the list of matching calls.
2703 DenseMap<const FuncTy *, CallsWithSameCallee> CalleeFuncToCallInfo;
2704 for (auto ThisCall : AllCalls) {
2705 auto *F = getCalleeFunc(ThisCall.call());
2706 if (F)
2707 CalleeFuncToCallInfo[F].Calls.push_back(ThisCall);
2708 }
2709
2710 // Next, walk through all callee edges. For each callee node, get its
2711 // containing function and see if it was recorded in the above map (meaning we
2712 // have at least one matching call). Build another map from each callee node
2713 // with a matching call to the structure instance created above containing all
2714 // the calls.
2715 DenseMap<const ContextNode *, CallsWithSameCallee *> CalleeNodeToCallInfo;
2716 for (const auto &Edge : Node->CalleeEdges) {
2717 if (!Edge->Callee->hasCall())
2718 continue;
2719 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2720 if (CalleeFuncToCallInfo.contains(ProfiledCalleeFunc))
2721 CalleeNodeToCallInfo[Edge->Callee] =
2722 &CalleeFuncToCallInfo[ProfiledCalleeFunc];
2723 }
2724
2725 // If there are no entries in the second map, then there were no matching
2726 // calls/callees; nothing to do here. Return so we can go to the handling that
2727 // looks through tail calls.
2728 if (CalleeNodeToCallInfo.empty())
2729 return false;
2730
2731 // Walk through all callee edges again. Any and all callee edges that didn't
2732 // match any calls (callee not in the CalleeNodeToCallInfo map) are moved to a
2733 // new caller node (UnmatchedCalleesNode) which gets a null call so that it is
2734 // ignored during cloning. If it is in the map, then we use the node recorded
2735 // in that entry (creating it if needed), and move the callee edge to it.
2736 // The first callee will use the original node instead of creating a new one.
2737 // Note that any of the original calls on this node (in AllCalls) that didn't
2738 // have a callee function automatically get dropped from the node as part of
2739 // this process.
2740 ContextNode *UnmatchedCalleesNode = nullptr;
2741 // Track whether we already assigned original node to a callee.
2742 bool UsedOrigNode = false;
2743 assert(NodeToCallingFunc[Node]);
2744 // Iterate over a copy of Node's callee edges, since we may need to remove
2745 // edges in moveCalleeEdgeToNewCaller, and this simplifies the handling and
2746 // makes it less error-prone.
2747 auto CalleeEdges = Node->CalleeEdges;
2748 for (auto &Edge : CalleeEdges) {
2749 if (!Edge->Callee->hasCall())
2750 continue;
2751
2752 // Will be updated below to point to whatever (caller) node this callee edge
2753 // should be moved to.
2754 ContextNode *CallerNodeToUse = nullptr;
2755
2756 // Handle the case where there were no matching calls first. Move this
2757 // callee edge to the UnmatchedCalleesNode, creating it if needed.
2758 if (!CalleeNodeToCallInfo.contains(Edge->Callee)) {
2759 if (!UnmatchedCalleesNode)
2760 UnmatchedCalleesNode =
2761 createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2762 CallerNodeToUse = UnmatchedCalleesNode;
2763 } else {
2764 // Look up the information recorded for this callee node, and use the
2765 // recorded caller node (creating it if needed).
2766 auto *Info = CalleeNodeToCallInfo[Edge->Callee];
2767 if (!Info->Node) {
2768 // If we haven't assigned any callees to the original node, use it.
2769 if (!UsedOrigNode) {
2770 Info->Node = Node;
2771 // Clear the set of matching calls which will be updated below.
2772 Node->MatchingCalls.clear();
2773 UsedOrigNode = true;
2774 } else
2775 Info->Node =
2776 createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2777 assert(!Info->Calls.empty());
2778 // The first call becomes the primary call for this caller node, and the
2779 // rest go in the matching calls list.
2780 Info->Node->setCall(Info->Calls.front());
2781 llvm::append_range(Info->Node->MatchingCalls,
2782 llvm::drop_begin(Info->Calls));
2783 // Save the primary call to node correspondence so that we can update
2784 // the NonAllocationCallToContextNodeMap, which is being iterated in the
2785 // caller of this function.
2786 NewCallToNode.push_back({Info->Node->Call, Info->Node});
2787 }
2788 CallerNodeToUse = Info->Node;
2789 }
2790
2791 // Don't need to move the edge if we are using the original node.
2792 if (CallerNodeToUse == Node)
2793 continue;
2794
2795 moveCalleeEdgeToNewCaller(Edge, CallerNodeToUse);
2796 }
2797 // Now that we are done moving edges, clean up any caller edges that ended
2798 // up with no type or context ids. During moveCalleeEdgeToNewCaller all
2799 // caller edges from Node are replicated onto the new callers, and it
2800 // simplifies the handling to leave them until we have moved all
2801 // edges/context ids.
2802 for (auto &I : CalleeNodeToCallInfo)
2803 removeNoneTypeCallerEdges(I.second->Node);
2804 if (UnmatchedCalleesNode)
2805 removeNoneTypeCallerEdges(UnmatchedCalleesNode);
2806 removeNoneTypeCallerEdges(Node);
2807
2808 return true;
2809}
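// Illustrative walk-through of partitionCallsByCallee (function names below
// are hypothetical, not from any real module): suppose AllCalls is
// {c1 -> foo, c2 -> bar, c3 -> foo, c4 -> <indirect>} and Node has callee
// edges into foo, bar and baz. Assuming the foo callee edge is processed
// first, calls c1/c3 are grouped under foo and keep the original Node (c1
// becomes the primary call, c3 a matching call); c2 gets a newly created node
// that the bar callee edge is moved to; the baz callee edge, which matched no
// call, is moved to UnmatchedCalleesNode so it is ignored during cloning; and
// c4, with no statically known callee, is dropped from the node. The new
// primary calls are recorded in NewCallToNode so the caller can update
// NonAllocationCallToContextNodeMap.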
2810
2811uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2812 // In the Module (IR) case this is already the Id.
2813 return IdOrIndex;
2814}
2815
2816uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2817 // In the Index case this is an index into the stack id list in the summary
2818 // index, convert it to an Id.
2819 return Index.getStackIdAtIndex(IdOrIndex);
2820}
2821
2822template <typename DerivedCCG, typename FuncTy, typename CallTy>
2823bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
2824 CallTy Call, EdgeIter &EI,
2825 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {
2826 auto Edge = *EI;
2827 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2828 const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];
2829 // Will be populated in order of callee to caller if we find a chain of tail
2830 // calls between the profiled caller and callee.
2831 std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;
2832 if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc,
2833 FoundCalleeChain))
2834 return false;
2835
2836 // The usual case where the profiled callee matches that of the IR/summary.
2837 if (FoundCalleeChain.empty())
2838 return true;
2839
2840 auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {
2841 auto *CurEdge = Callee->findEdgeFromCaller(Caller);
2842 // If there is already an edge between these nodes, simply update it and
2843 // return.
2844 if (CurEdge) {
2845 CurEdge->ContextIds.insert_range(Edge->ContextIds);
2846 CurEdge->AllocTypes |= Edge->AllocTypes;
2847 return;
2848 }
2849 // Otherwise, create a new edge and insert it into the caller and callee
2850 // lists.
2851 auto NewEdge = std::make_shared<ContextEdge>(
2852 Callee, Caller, Edge->AllocTypes, Edge->ContextIds);
2853 Callee->CallerEdges.push_back(NewEdge);
2854 if (Caller == Edge->Caller) {
2855 // If we are inserting the new edge into the current edge's caller, insert
2856 // the new edge before the current iterator position, and then increment
2857 // back to the current edge.
2858 EI = Caller->CalleeEdges.insert(EI, NewEdge);
2859 ++EI;
2860 assert(*EI == Edge &&
2861 "Iterator position not restored after insert and increment");
2862 } else
2863 Caller->CalleeEdges.push_back(NewEdge);
2864 };
2865
2866 // Create new nodes for each found callee and connect in between the profiled
2867 // caller and callee.
2868 auto *CurCalleeNode = Edge->Callee;
2869 for (auto &[NewCall, Func] : FoundCalleeChain) {
2870 ContextNode *NewNode = nullptr;
2871 // First check if we have already synthesized a node for this tail call.
2872 if (TailCallToContextNodeMap.count(NewCall)) {
2873 NewNode = TailCallToContextNodeMap[NewCall];
2874 NewNode->AllocTypes |= Edge->AllocTypes;
2875 } else {
2876 FuncToCallsWithMetadata[Func].push_back({NewCall});
2877 // Create Node and record node info.
2878 NewNode = createNewNode(/*IsAllocation=*/false, Func, NewCall);
2879 TailCallToContextNodeMap[NewCall] = NewNode;
2880 NewNode->AllocTypes = Edge->AllocTypes;
2881 }
2882
2883 // Hook up node to its callee node
2884 AddEdge(NewNode, CurCalleeNode);
2885
2886 CurCalleeNode = NewNode;
2887 }
2888
2889 // Hook up edge's original caller to new callee node.
2890 AddEdge(Edge->Caller, CurCalleeNode);
2891
2892#ifndef NDEBUG
2893 // Save this because Edge's fields get cleared below when removed.
2894 auto *Caller = Edge->Caller;
2895#endif
2896
2897 // Remove old edge
2898 removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
2899
2900 // To simplify the increment of EI in the caller, subtract one from EI.
2901 // In the final AddEdge call we would have either added a new callee edge,
2902 // to Edge->Caller, or found an existing one. Either way we are guaranteed
2903 // that there is at least one callee edge.
2904 assert(!Caller->CalleeEdges.empty());
2905 --EI;
2906
2907 return true;
2908}
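// Illustrative example of the tail call fix-up performed by calleesMatch
// (function names are hypothetical): the profile records a frame in A calling
// into C, but in the IR/summary A actually calls B and B tail calls C, so the
// direct A -> C edge looks mismatched. calleeMatchesFunc discovers the chain
// through B's tail call, a ContextNode is synthesized for that call (and
// recorded in TailCallToContextNodeMap so it can be reused), AddEdge connects
// A -> B' -> C carrying the original edge's context ids and alloc types, and
// the stale A -> C edge is removed.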
2909
2910bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2911 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
2912 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
2913 bool &FoundMultipleCalleeChains) {
2914 // Stop recursive search if we have already explored the maximum specified
2915 // depth.
2916 if (Depth > TailCallSearchDepth)
2917 return false;
2918
2919 auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {
2920 FoundCalleeChain.push_back({Callsite, F});
2921 };
2922
2923 auto *CalleeFunc = dyn_cast<Function>(CurCallee);
2924 if (!CalleeFunc) {
2925 auto *Alias = dyn_cast<GlobalAlias>(CurCallee);
2926 assert(Alias);
2927 CalleeFunc = dyn_cast<Function>(Alias->getAliasee());
2928 assert(CalleeFunc);
2929 }
2930
2931 // Look for tail calls in this function, and check if they either call the
2932 // profiled callee directly, or indirectly (via a recursive search).
2933 // Only succeed if there is a single unique tail call chain found between the
2934 // profiled caller and callee, otherwise we could perform incorrect cloning.
2935 bool FoundSingleCalleeChain = false;
2936 for (auto &BB : *CalleeFunc) {
2937 for (auto &I : BB) {
2938 auto *CB = dyn_cast<CallBase>(&I);
2939 if (!CB || !CB->isTailCall())
2940 continue;
2941 auto *CalledValue = CB->getCalledOperand();
2942 auto *CalledFunction = CB->getCalledFunction();
2943 if (CalledValue && !CalledFunction) {
2944 CalledValue = CalledValue->stripPointerCasts();
2945 // Stripping pointer casts can reveal a called function.
2946 CalledFunction = dyn_cast<Function>(CalledValue);
2947 }
2948 // Check if this is an alias to a function. If so, get the
2949 // called aliasee for the checks below.
2950 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
2951 assert(!CalledFunction &&
2952 "Expected null called function in callsite for alias");
2953 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
2954 }
2955 if (!CalledFunction)
2956 continue;
2957 if (CalledFunction == ProfiledCallee) {
2958 if (FoundSingleCalleeChain) {
2959 FoundMultipleCalleeChains = true;
2960 return false;
2961 }
2962 FoundSingleCalleeChain = true;
2963 FoundProfiledCalleeCount++;
2964 FoundProfiledCalleeDepth += Depth;
2965 if (Depth > FoundProfiledCalleeMaxDepth)
2966 FoundProfiledCalleeMaxDepth = Depth;
2967 SaveCallsiteInfo(&I, CalleeFunc);
2968 } else if (findProfiledCalleeThroughTailCalls(
2969 ProfiledCallee, CalledFunction, Depth + 1,
2970 FoundCalleeChain, FoundMultipleCalleeChains)) {
2971 // findProfiledCalleeThroughTailCalls should not have returned
2972 // true if FoundMultipleCalleeChains.
2973 assert(!FoundMultipleCalleeChains);
2974 if (FoundSingleCalleeChain) {
2975 FoundMultipleCalleeChains = true;
2976 return false;
2977 }
2978 FoundSingleCalleeChain = true;
2979 SaveCallsiteInfo(&I, CalleeFunc);
2980 } else if (FoundMultipleCalleeChains)
2981 return false;
2982 }
2983 }
2984
2985 return FoundSingleCalleeChain;
2986}
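// Note on the uniqueness requirement above (functions hypothetical): if B has
// two tail-call paths that both eventually reach the profiled callee C, e.g.
// B -> D -> C and B -> E -> C, FoundMultipleCalleeChains is set and the
// search fails, because we cannot tell which chain the profiled context
// actually took and cloning along the wrong one would be incorrect.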
2987
2988const Function *ModuleCallsiteContextGraph::getCalleeFunc(Instruction *Call) {
2989 auto *CB = dyn_cast<CallBase>(Call);
2990 if (!CB->getCalledOperand() || CB->isIndirectCall())
2991 return nullptr;
2992 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2993 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2994 if (Alias)
2995 return dyn_cast<Function>(Alias->getAliasee());
2996 return dyn_cast<Function>(CalleeVal);
2997}
2998
2999bool ModuleCallsiteContextGraph::calleeMatchesFunc(
3000 Instruction *Call, const Function *Func, const Function *CallerFunc,
3001 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
3002 auto *CB = dyn_cast<CallBase>(Call);
3003 if (!CB->getCalledOperand() || CB->isIndirectCall())
3004 return false;
3005 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
3006 auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
3007 if (CalleeFunc == Func)
3008 return true;
3009 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
3010 if (Alias && Alias->getAliasee() == Func)
3011 return true;
3012
3013 // Recursively search for the profiled callee through tail calls starting with
3014 // the actual Callee. The discovered tail call chain is saved in
3015 // FoundCalleeChain, and we will fixup the graph to include these callsites
3016 // after returning.
3017 // FIXME: We will currently redo the same recursive walk if we find the same
3018 // mismatched callee from another callsite. We can improve this with more
3019 // bookkeeping of the created chain of new nodes for each mismatch.
3020 unsigned Depth = 1;
3021 bool FoundMultipleCalleeChains = false;
3022 if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,
3023 FoundCalleeChain,
3024 FoundMultipleCalleeChains)) {
3025 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "
3026 << Func->getName() << " from " << CallerFunc->getName()
3027 << " that actually called " << CalleeVal->getName()
3028 << (FoundMultipleCalleeChains
3029 ? " (found multiple possible chains)"
3030 : "")
3031 << "\n");
3032 if (FoundMultipleCalleeChains)
3033 FoundProfiledCalleeNonUniquelyCount++;
3034 return false;
3035 }
3036
3037 return true;
3038}
3039
3040bool ModuleCallsiteContextGraph::sameCallee(Instruction *Call1,
3041 Instruction *Call2) {
3042 auto *CB1 = cast<CallBase>(Call1);
3043 if (!CB1->getCalledOperand() || CB1->isIndirectCall())
3044 return false;
3045 auto *CalleeVal1 = CB1->getCalledOperand()->stripPointerCasts();
3046 auto *CalleeFunc1 = dyn_cast<Function>(CalleeVal1);
3047 auto *CB2 = cast<CallBase>(Call2);
3048 if (!CB2->getCalledOperand() || CB2->isIndirectCall())
3049 return false;
3050 auto *CalleeVal2 = CB2->getCalledOperand()->stripPointerCasts();
3051 auto *CalleeFunc2 = dyn_cast<Function>(CalleeVal2);
3052 return CalleeFunc1 == CalleeFunc2;
3053}
3054
3055bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
3056 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
3057 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
3058 bool &FoundMultipleCalleeChains) {
3059 // Stop recursive search if we have already explored the maximum specified
3060 // depth.
3061 if (Depth > TailCallSearchDepth)
3062 return false;
3063
3064 auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {
3065 // Make a CallsiteInfo for each discovered callee, if one hasn't already
3066 // been synthesized.
3067 if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||
3068 !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))
3069 // StackIds is empty (we don't have debug info available in the index for
3070 // these callsites)
3071 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
3072 std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());
3073 CallsiteInfo *NewCallsiteInfo =
3074 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
3075 FoundCalleeChain.push_back({NewCallsiteInfo, FS});
3076 };
3077
3078 // Look for tail calls in this function, and check if they either call the
3079 // profiled callee directly, or indirectly (via a recursive search).
3080 // Only succeed if there is a single unique tail call chain found between the
3081 // profiled caller and callee, otherwise we could perform incorrect cloning.
3082 bool FoundSingleCalleeChain = false;
3083 for (auto &S : CurCallee.getSummaryList()) {
3084 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
3085 !isPrevailing(CurCallee.getGUID(), S.get()))
3086 continue;
3087 auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
3088 if (!FS)
3089 continue;
3090 auto FSVI = CurCallee;
3091 auto *AS = dyn_cast<AliasSummary>(S.get());
3092 if (AS)
3093 FSVI = AS->getAliaseeVI();
3094 for (auto &CallEdge : FS->calls()) {
3095 if (!CallEdge.second.hasTailCall())
3096 continue;
3097 if (CallEdge.first == ProfiledCallee) {
3098 if (FoundSingleCalleeChain) {
3099 FoundMultipleCalleeChains = true;
3100 return false;
3101 }
3102 FoundSingleCalleeChain = true;
3103 FoundProfiledCalleeCount++;
3104 FoundProfiledCalleeDepth += Depth;
3105 if (Depth > FoundProfiledCalleeMaxDepth)
3106 FoundProfiledCalleeMaxDepth = Depth;
3107 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
3108 // Add FS to FSToVIMap in case it isn't already there.
3109 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
3110 FSToVIMap[FS] = FSVI;
3111 } else if (findProfiledCalleeThroughTailCalls(
3112 ProfiledCallee, CallEdge.first, Depth + 1,
3113 FoundCalleeChain, FoundMultipleCalleeChains)) {
3114 // findProfiledCalleeThroughTailCalls should not have returned
3115 // true if FoundMultipleCalleeChains.
3116 assert(!FoundMultipleCalleeChains);
3117 if (FoundSingleCalleeChain) {
3118 FoundMultipleCalleeChains = true;
3119 return false;
3120 }
3121 FoundSingleCalleeChain = true;
3122 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
3123 // Add FS to FSToVIMap in case it isn't already there.
3124 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
3125 FSToVIMap[FS] = FSVI;
3126 } else if (FoundMultipleCalleeChains)
3127 return false;
3128 }
3129 }
3130
3131 return FoundSingleCalleeChain;
3132}
3133
3134const FunctionSummary *
3135IndexCallsiteContextGraph::getCalleeFunc(IndexCall &Call) {
3136 ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
3137 if (Callee.getSummaryList().empty())
3138 return nullptr;
3139 return dyn_cast<FunctionSummary>(Callee.getSummaryList()[0]->getBaseObject());
3140}
3141
3142bool IndexCallsiteContextGraph::calleeMatchesFunc(
3143 IndexCall &Call, const FunctionSummary *Func,
3144 const FunctionSummary *CallerFunc,
3145 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {
3146 ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
3147 // If there is no summary list then this is a call to an externally defined
3148 // symbol.
3149 AliasSummary *Alias =
3150 Callee.getSummaryList().empty()
3151 ? nullptr
3152 : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
3153 assert(FSToVIMap.count(Func));
3154 auto FuncVI = FSToVIMap[Func];
3155 if (Callee == FuncVI ||
3156 // If callee is an alias, check the aliasee, since only function
3157 // summary base objects will contain the stack node summaries and thus
3158 // get a context node.
3159 (Alias && Alias->getAliaseeVI() == FuncVI))
3160 return true;
3161
3162 // Recursively search for the profiled callee through tail calls starting with
3163 // the actual Callee. The discovered tail call chain is saved in
3164 // FoundCalleeChain, and we will fixup the graph to include these callsites
3165 // after returning.
3166 // FIXME: We will currently redo the same recursive walk if we find the same
3167 // mismatched callee from another callsite. We can improve this with more
3168 // bookkeeping of the created chain of new nodes for each mismatch.
3169 unsigned Depth = 1;
3170 bool FoundMultipleCalleeChains = false;
3171 if (!findProfiledCalleeThroughTailCalls(
3172 FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
3173 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI
3174 << " from " << FSToVIMap[CallerFunc]
3175 << " that actually called " << Callee
3176 << (FoundMultipleCalleeChains
3177 ? " (found multiple possible chains)"
3178 : "")
3179 << "\n");
3180 if (FoundMultipleCalleeChains)
3181 FoundProfiledCalleeNonUniquelyCount++;
3182 return false;
3183 }
3184
3185 return true;
3186}
3187
3188bool IndexCallsiteContextGraph::sameCallee(IndexCall &Call1, IndexCall &Call2) {
3189 ValueInfo Callee1 = dyn_cast_if_present<CallsiteInfo *>(Call1)->Callee;
3190 ValueInfo Callee2 = dyn_cast_if_present<CallsiteInfo *>(Call2)->Callee;
3191 return Callee1 == Callee2;
3192}
3193
3194template <typename DerivedCCG, typename FuncTy, typename CallTy>
3195void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
3196 const {
3197 print(dbgs());
3198 dbgs() << "\n";
3199}
3200
3201template <typename DerivedCCG, typename FuncTy, typename CallTy>
3202void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
3203 raw_ostream &OS) const {
3204 OS << "Node " << this << "\n";
3205 OS << "\t";
3206 printCall(OS);
3207 if (Recursive)
3208 OS << " (recursive)";
3209 OS << "\n";
3210 if (!MatchingCalls.empty()) {
3211 OS << "\tMatchingCalls:\n";
3212 for (auto &MatchingCall : MatchingCalls) {
3213 OS << "\t";
3214 MatchingCall.print(OS);
3215 OS << "\n";
3216 }
3217 }
3218 OS << "\tNodeId: " << NodeId << "\n";
3219 OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
3220 OS << "\tContextIds:";
3221 // Make a copy of the computed context ids that we can sort for stability.
3222 auto ContextIds = getContextIds();
3223 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3224 std::sort(SortedIds.begin(), SortedIds.end());
3225 for (auto Id : SortedIds)
3226 OS << " " << Id;
3227 OS << "\n";
3228 OS << "\tCalleeEdges:\n";
3229 for (auto &Edge : CalleeEdges)
3230 OS << "\t\t" << *Edge << " (Callee NodeId: " << Edge->Callee->NodeId
3231 << ")\n";
3232 OS << "\tCallerEdges:\n";
3233 for (auto &Edge : CallerEdges)
3234 OS << "\t\t" << *Edge << " (Caller NodeId: " << Edge->Caller->NodeId
3235 << ")\n";
3236 if (!Clones.empty()) {
3237 OS << "\tClones: ";
3238 bool First = true;
3239 for (auto *C : Clones) {
3240 if (!First)
3241 OS << ", ";
3242 First = false;
3243 OS << C << " NodeId: " << C->NodeId;
3244 }
3245 OS << "\n";
3246 } else if (CloneOf) {
3247 OS << "\tClone of " << CloneOf << " NodeId: " << CloneOf->NodeId << "\n";
3248 }
3249}
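// For reference, the output of print() for a node has roughly the following
// shape (values invented for illustration; the second line comes from
// printCall, whose exact format depends on the IR vs. index graph):
//   Node 0x5555e1a2b3c0
//       <printed call>
//   NodeId: 7
//   AllocTypes: Cold
//   ContextIds: 2 5
//   CalleeEdges:
//       Edge from Callee 0x... to Caller: 0x... AllocTypes: Cold ContextIds: 2 5 (Callee NodeId: 3)
//   CallerEdges:
//       ...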
3250
3251template <typename DerivedCCG, typename FuncTy, typename CallTy>
3252void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
3253 const {
3254 print(dbgs());
3255 dbgs() << "\n";
3256}
3257
3258template <typename DerivedCCG, typename FuncTy, typename CallTy>
3259void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
3260 raw_ostream &OS) const {
3261 OS << "Edge from Callee " << Callee << " to Caller: " << Caller
3262 << (IsBackedge ? " (BE)" : "")
3263 << " AllocTypes: " << getAllocTypeString(AllocTypes);
3264 OS << " ContextIds:";
3265 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3266 std::sort(SortedIds.begin(), SortedIds.end());
3267 for (auto Id : SortedIds)
3268 OS << " " << Id;
3269}
3270
3271template <typename DerivedCCG, typename FuncTy, typename CallTy>
3272void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
3273 print(dbgs());
3274}
3275
3276template <typename DerivedCCG, typename FuncTy, typename CallTy>
3277void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
3278 raw_ostream &OS) const {
3279 OS << "Callsite Context Graph:\n";
3280 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3281 for (const auto Node : nodes<GraphType>(this)) {
3282 if (Node->isRemoved())
3283 continue;
3284 Node->print(OS);
3285 OS << "\n";
3286 }
3287}
3288
3289template <typename DerivedCCG, typename FuncTy, typename CallTy>
3290void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
3291 raw_ostream &OS) const {
3292 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3293 for (const auto Node : nodes<GraphType>(this)) {
3294 if (Node->isRemoved())
3295 continue;
3296 if (!Node->IsAllocation)
3297 continue;
3298 DenseSet<uint32_t> ContextIds = Node->getContextIds();
3299 auto AllocTypeFromCall = getAllocationCallType(Node->Call);
3300 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3301 std::sort(SortedIds.begin(), SortedIds.end());
3302 for (auto Id : SortedIds) {
3303 auto TypeI = ContextIdToAllocationType.find(Id);
3304 assert(TypeI != ContextIdToAllocationType.end());
3305 auto CSI = ContextIdToContextSizeInfos.find(Id);
3306 if (CSI != ContextIdToContextSizeInfos.end()) {
3307 for (auto &Info : CSI->second) {
3308 OS << "MemProf hinting: "
3309 << getAllocTypeString((uint8_t)TypeI->second)
3310 << " full allocation context " << Info.FullStackId
3311 << " with total size " << Info.TotalSize << " is "
3312 << getAllocTypeString(Node->AllocTypes) << " after cloning";
3313 if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall)
3314 OS << " marked " << getAllocTypeString((uint8_t)AllocTypeFromCall)
3315 << " due to cold byte percent";
3316 // Print the internal context id to aid debugging and visualization.
3317 OS << " (context id " << Id << ")";
3318 OS << "\n";
3319 }
3320 }
3321 }
3322 }
3323}
3324
3325template <typename DerivedCCG, typename FuncTy, typename CallTy>
3326void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
3327 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3328 for (const auto Node : nodes<GraphType>(this)) {
3329 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3330 for (auto &Edge : Node->CallerEdges)
3331 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
3332 }
3333}
3334
3335template <typename DerivedCCG, typename FuncTy, typename CallTy>
3336struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
3337 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3338 using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;
3339
3340 using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
3341 static NodeRef getNode(const NodePtrTy &P) { return P.get(); }
3342
3343 using nodes_iterator =
3344 mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,
3345 decltype(&getNode)>;
3346
3347 static nodes_iterator nodes_begin(GraphType G) {
3348 return nodes_iterator(G->NodeOwner.begin(), &getNode);
3349 }
3350
3351 static nodes_iterator nodes_end(GraphType G) {
3352 return nodes_iterator(G->NodeOwner.end(), &getNode);
3353 }
3354
3355 static NodeRef getEntryNode(GraphType G) {
3356 return G->NodeOwner.begin()->get();
3357 }
3358
3359 using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
3360 static const ContextNode<DerivedCCG, FuncTy, CallTy> *
3361 GetCallee(const EdgePtrTy &P) {
3362 return P->Callee;
3363 }
3364
3365 using ChildIteratorType =
3366 mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
3367 DerivedCCG, FuncTy, CallTy>>>::const_iterator,
3368 decltype(&GetCallee)>;
3369
3370 static ChildIteratorType child_begin(NodeRef N) {
3371 return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
3372 }
3373
3374 static ChildIteratorType child_end(NodeRef N) {
3375 return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
3376 }
3377};
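// With this GraphTraits specialization the generic graph utilities can walk
// the callsite context graph: nodes<GraphType>(G) iterates over NodeOwner
// (used by print(), printTotalSizes() and check() above), child iteration
// follows CalleeEdges, and WriteGraph in exportToDot() below traverses the
// graph through the same interface.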
3378
3379template <typename DerivedCCG, typename FuncTy, typename CallTy>
3380struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
3381 : public DefaultDOTGraphTraits {
3382 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {
3383 // If the user requested the full graph to be exported, but provided an
3384 // allocation id, or if the user gave a context id and requested more than
3385 // just a specific context to be exported, note that highlighting is
3386 // enabled.
3387 DoHighlight =
3388 (AllocIdForDot.getNumOccurrences() && DotGraphScope == DotScope::All) ||
3389 (ContextIdForDot.getNumOccurrences() &&
3390 DotGraphScope != DotScope::Context);
3391 }
3392
3393 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3394 using GTraits = GraphTraits<GraphType>;
3395 using NodeRef = typename GTraits::NodeRef;
3396 using ChildIteratorType = typename GTraits::ChildIteratorType;
3397
3398 static std::string getNodeLabel(NodeRef Node, GraphType G) {
3399 std::string LabelString =
3400 (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
3401 Twine(Node->OrigStackOrAllocId) + " NodeId: " + Twine(Node->NodeId))
3402 .str();
3403 LabelString += "\n";
3404 if (Node->hasCall()) {
3405 auto Func = G->NodeToCallingFunc.find(Node);
3406 assert(Func != G->NodeToCallingFunc.end());
3407 LabelString +=
3408 G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
3409 } else {
3410 LabelString += "null call";
3411 if (Node->Recursive)
3412 LabelString += " (recursive)";
3413 else
3414 LabelString += " (external)";
3415 }
3416 return LabelString;
3417 }
3418
3419 static std::string getNodeAttributes(NodeRef Node, GraphType G) {
3420 auto ContextIds = Node->getContextIds();
3421 // If highlighting enabled, see if this node contains any of the context ids
3422 // of interest. If so, it will use a different color and a larger fontsize
3423 // (which makes the node larger as well).
3424 bool Highlight = false;
3425 if (DoHighlight) {
3426 assert(ContextIdForDot.getNumOccurrences() ||
3427 AllocIdForDot.getNumOccurrences());
3428 if (ContextIdForDot.getNumOccurrences())
3429 Highlight = ContextIds.contains(ContextIdForDot);
3430 else
3431 Highlight = set_intersects(ContextIds, G->DotAllocContextIds);
3432 }
3433 std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
3434 getContextIds(ContextIds) + "\"")
3435 .str();
3436 // Default fontsize is 14
3437 if (Highlight)
3438 AttributeString += ",fontsize=\"30\"";
3439 AttributeString +=
3440 (Twine(",fillcolor=\"") + getColor(Node->AllocTypes, Highlight) + "\"")
3441 .str();
3442 if (Node->CloneOf) {
3443 AttributeString += ",color=\"blue\"";
3444 AttributeString += ",style=\"filled,bold,dashed\"";
3445 } else
3446 AttributeString += ",style=\"filled\"";
3447 return AttributeString;
3448 }
3449
3450 static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
3451 GraphType G) {
3452 auto &Edge = *(ChildIter.getCurrent());
3453 // If highlighting enabled, see if this edge contains any of the context ids
3454 // of interest. If so, it will use a different color and a heavier arrow
3455 // size and weight (the larger weight makes the highlighted path
3456 // straighter).
3457 bool Highlight = false;
3458 if (DoHighlight) {
3459 assert(ContextIdForDot.getNumOccurrences() ||
3460 AllocIdForDot.getNumOccurrences());
3461 if (ContextIdForDot.getNumOccurrences())
3462 Highlight = Edge->ContextIds.contains(ContextIdForDot);
3463 else
3464 Highlight = set_intersects(Edge->ContextIds, G->DotAllocContextIds);
3465 }
3466 auto Color = getColor(Edge->AllocTypes, Highlight);
3467 std::string AttributeString =
3468 (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" +
3469 // fillcolor is the arrow head and color is the line
3470 Twine(",fillcolor=\"") + Color + "\"" + Twine(",color=\"") + Color +
3471 "\"")
3472 .str();
3473 if (Edge->IsBackedge)
3474 AttributeString += ",style=\"dotted\"";
3475 // Default penwidth and weight are both 1.
3476 if (Highlight)
3477 AttributeString += ",penwidth=\"2.0\",weight=\"2\"";
3478 return AttributeString;
3479 }
3480
3481 // Since the NodeOwner list includes nodes that are no longer connected to
3482 // the graph, skip them here.
3483 static bool isNodeHidden(NodeRef Node, GraphType G) {
3484 if (Node->isRemoved())
3485 return true;
3486 // If a scope smaller than the full graph was requested, see if this node
3487 // contains any of the context ids of interest.
3488 if (DotGraphScope == DotScope::Alloc)
3489 return !set_intersects(Node->getContextIds(), G->DotAllocContextIds);
3490 if (DotGraphScope == DotScope::Context)
3491 return !Node->getContextIds().contains(ContextIdForDot);
3492 return false;
3493 }
3494
3495private:
3496 static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
3497 std::string IdString = "ContextIds:";
3498 if (ContextIds.size() < 100) {
3499 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3500 std::sort(SortedIds.begin(), SortedIds.end());
3501 for (auto Id : SortedIds)
3502 IdString += (" " + Twine(Id)).str();
3503 } else {
3504 IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
3505 }
3506 return IdString;
3507 }
3508
3509 static std::string getColor(uint8_t AllocTypes, bool Highlight) {
3510 // If DoHighlight is not enabled, we want to use the highlight colors for
3511 // NotCold and Cold, and the non-highlight color for NotCold+Cold. This is
3512 // both compatible with the color scheme before highlighting was supported,
3513 // and for the NotCold+Cold color the non-highlight color is a bit more
3514 // readable.
3515 if (AllocTypes == (uint8_t)AllocationType::NotCold)
3516 // Color "brown1" actually looks like a lighter red.
3517 return !DoHighlight || Highlight ? "brown1" : "lightpink";
3518 if (AllocTypes == (uint8_t)AllocationType::Cold)
3519 return !DoHighlight || Highlight ? "cyan" : "lightskyblue";
3520 if (AllocTypes ==
3521 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
3522 return Highlight ? "magenta" : "mediumorchid1";
3523 return "gray";
3524 }
3525
3526 static std::string getNodeId(NodeRef Node) {
3527 std::stringstream SStream;
3528 SStream << std::hex << "N0x" << (unsigned long long)Node;
3529 std::string Result = SStream.str();
3530 return Result;
3531 }
3532
3533 // True if we should highlight a specific context or allocation's contexts in
3534 // the emitted graph.
3535 static bool DoHighlight;
3536};
3537
3538template <typename DerivedCCG, typename FuncTy, typename CallTy>
3539bool DOTGraphTraits<
3540 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>::DoHighlight =
3541 false;
3542
3543template <typename DerivedCCG, typename FuncTy, typename CallTy>
3544void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
3545 std::string Label) const {
3546 WriteGraph(this, "", false, Label,
3547 DotFilePathPrefix + "ccg." + Label + ".dot");
3548}
3549
3550template <typename DerivedCCG, typename FuncTy, typename CallTy>
3551typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
3552CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
3553 const std::shared_ptr<ContextEdge> &Edge,
3554 DenseSet<uint32_t> ContextIdsToMove) {
3555 ContextNode *Node = Edge->Callee;
3556 assert(NodeToCallingFunc.count(Node));
3557 ContextNode *Clone =
3558 createNewNode(Node->IsAllocation, NodeToCallingFunc[Node], Node->Call);
3559 Node->addClone(Clone);
3560 Clone->MatchingCalls = Node->MatchingCalls;
3561 moveEdgeToExistingCalleeClone(Edge, Clone, /*NewClone=*/true,
3562 ContextIdsToMove);
3563 return Clone;
3564}
3565
3566template <typename DerivedCCG, typename FuncTy, typename CallTy>
3567void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3568 moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
3569 ContextNode *NewCallee, bool NewClone,
3570 DenseSet<uint32_t> ContextIdsToMove) {
3571 // NewCallee and Edge's current callee must be clones of the same original
3572 // node (Edge's current callee may be the original node too).
3573 assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
3574
3575 bool EdgeIsRecursive = Edge->Callee == Edge->Caller;
3576
3577 ContextNode *OldCallee = Edge->Callee;
3578
3579 // We might already have an edge to the new callee from earlier cloning for a
3580 // different allocation. If one exists we will reuse it.
3581 auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller);
3582
3583 // Callers will pass an empty ContextIdsToMove set when they want to move the
3584 // edge. Copy in Edge's ids for simplicity.
3585 if (ContextIdsToMove.empty())
3586 ContextIdsToMove = Edge->getContextIds();
3587
3588 // If we are moving all of Edge's ids, then just move the whole Edge.
3589 // Otherwise only move the specified subset, to a new edge if needed.
3590 if (Edge->getContextIds().size() == ContextIdsToMove.size()) {
3591 // First, update the alloc types on New Callee from Edge.
3592 // Do this before we potentially clear Edge's fields below!
3593 NewCallee->AllocTypes |= Edge->AllocTypes;
3594 // Moving the whole Edge.
3595 if (ExistingEdgeToNewCallee) {
3596 // Since we already have an edge to NewCallee, simply move the ids
3597 // onto it, and remove the existing Edge.
3598 ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
3599 ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes;
3600 assert(Edge->ContextIds == ContextIdsToMove);
3601 removeEdgeFromGraph(Edge.get());
3602 } else {
3603 // Otherwise just reconnect Edge to NewCallee.
3604 Edge->Callee = NewCallee;
3605 NewCallee->CallerEdges.push_back(Edge);
3606 // Remove it from callee where it was previously connected.
3607 OldCallee->eraseCallerEdge(Edge.get());
3608 // Don't need to update Edge's context ids since we are simply
3609 // reconnecting it.
3610 }
3611 } else {
3612 // Only moving a subset of Edge's ids.
3613 // Compute the alloc type of the subset of ids being moved.
3614 auto CallerEdgeAllocType = computeAllocType(ContextIdsToMove);
3615 if (ExistingEdgeToNewCallee) {
3616 // Since we already have an edge to NewCallee, simply move the ids
3617 // onto it.
3618 ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
3619 ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType;
3620 } else {
3621 // Otherwise, create a new edge to NewCallee for the ids being moved.
3622 auto NewEdge = std::make_shared<ContextEdge>(
3623 NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove);
3624 Edge->Caller->CalleeEdges.push_back(NewEdge);
3625 NewCallee->CallerEdges.push_back(NewEdge);
3626 }
3627 // In either case, need to update the alloc types on NewCallee, and remove
3628 // those ids and update the alloc type on the original Edge.
3629 NewCallee->AllocTypes |= CallerEdgeAllocType;
3630 set_subtract(Edge->ContextIds, ContextIdsToMove);
3631 Edge->AllocTypes = computeAllocType(Edge->ContextIds);
3632 }
3633 // Now walk the old callee node's callee edges and move Edge's context ids
3634 // over to the corresponding edge into the clone (which is created here if
3635 // this is a newly created clone).
3636 for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
3637 ContextNode *CalleeToUse = OldCalleeEdge->Callee;
3638 // If this is a direct recursion edge, use NewCallee (the clone) as the
3639 // callee as well, so that any edge updated/created here is also direct
3640 // recursive.
3641 if (CalleeToUse == OldCallee) {
3642 // If this is a recursive edge, see if we already moved a recursive edge
3643 // (which would have to have been this one) - if we were only moving a
3644 // subset of context ids it would still be on OldCallee.
3645 if (EdgeIsRecursive) {
3646 assert(OldCalleeEdge == Edge);
3647 continue;
3648 }
3649 CalleeToUse = NewCallee;
3650 }
3651 // The context ids moving to the new callee are the subset of this edge's
3652 // context ids and the context ids on the caller edge being moved.
3653 DenseSet<uint32_t> EdgeContextIdsToMove =
3654 set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove);
3655 set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
3656 OldCalleeEdge->AllocTypes =
3657 computeAllocType(OldCalleeEdge->getContextIds());
3658 if (!NewClone) {
3659 // Update context ids / alloc type on corresponding edge to NewCallee.
3660 // There is a chance this may not exist if we are reusing an existing
3661 // clone, specifically during function assignment, where we would have
3662 // removed none type edges after creating the clone. If we can't find
3663 // a corresponding edge there, fall through to the cloning below.
3664 if (auto *NewCalleeEdge = NewCallee->findEdgeFromCallee(CalleeToUse)) {
3665 NewCalleeEdge->getContextIds().insert_range(EdgeContextIdsToMove);
3666 NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
3667 continue;
3668 }
3669 }
3670 auto NewEdge = std::make_shared<ContextEdge>(
3671 CalleeToUse, NewCallee, computeAllocType(EdgeContextIdsToMove),
3672 EdgeContextIdsToMove);
3673 NewCallee->CalleeEdges.push_back(NewEdge);
3674 NewEdge->Callee->CallerEdges.push_back(NewEdge);
3675 }
3676 // Recompute the node alloc type now that its callee edges have been
3677 // updated (since we will compute from those edges).
3678 OldCallee->AllocTypes = OldCallee->computeAllocType();
3679 // OldCallee alloc type should be None iff its context id set is now empty.
3680 assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
3681 OldCallee->emptyContextIds());
3682 if (VerifyCCG) {
3683 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
3684 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
3685 for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
3686 checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
3687 /*CheckEdges=*/false);
3688 for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
3689 checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
3690 /*CheckEdges=*/false);
3691 }
3692}
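// Illustrative example of the subset move above (ids invented): suppose Edge
// goes from caller X to callee Y with ContextIds {1,2,3} and we move {1,2} to
// clone Y'. A new X -> Y' edge is created carrying {1,2} (or the ids are
// merged into an existing X -> Y' edge), Edge keeps {3} with its alloc type
// recomputed, and for each of Y's callee edges the intersection of its ids
// with {1,2} is likewise moved onto the corresponding callee edge of Y'.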
3693
3694template <typename DerivedCCG, typename FuncTy, typename CallTy>
3695void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3696 moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
3697 ContextNode *NewCaller) {
3698 auto *OldCallee = Edge->Callee;
3699 auto *NewCallee = OldCallee;
3700 // If this edge was direct recursive, make any new/updated edge also direct
3701 // recursive to NewCaller.
3702 bool Recursive = Edge->Caller == Edge->Callee;
3703 if (Recursive)
3704 NewCallee = NewCaller;
3705
3706 ContextNode *OldCaller = Edge->Caller;
3707 OldCaller->eraseCalleeEdge(Edge.get());
3708
3709 // We might already have an edge to the new caller. If one exists we will
3710 // reuse it.
3711 auto ExistingEdgeToNewCaller = NewCaller->findEdgeFromCallee(NewCallee);
3712
3713 if (ExistingEdgeToNewCaller) {
3714 // Since we already have an edge to NewCaller, simply move the ids
3715 // onto it, and remove the existing Edge.
3716 ExistingEdgeToNewCaller->getContextIds().insert_range(
3717 Edge->getContextIds());
3718 ExistingEdgeToNewCaller->AllocTypes |= Edge->AllocTypes;
3719 Edge->ContextIds.clear();
3720 Edge->AllocTypes = (uint8_t)AllocationType::None;
3721 OldCallee->eraseCallerEdge(Edge.get());
3722 } else {
3723 // Otherwise just reconnect Edge to NewCaller.
3724 Edge->Caller = NewCaller;
3725 NewCaller->CalleeEdges.push_back(Edge);
3726 if (Recursive) {
3727 assert(NewCallee == NewCaller);
3728 // In the case of (direct) recursive edges, we update the callee as well
3729 // so that it becomes recursive on the new caller.
3730 Edge->Callee = NewCallee;
3731 NewCallee->CallerEdges.push_back(Edge);
3732 OldCallee->eraseCallerEdge(Edge.get());
3733 }
3734 // Don't need to update Edge's context ids since we are simply
3735 // reconnecting it.
3736 }
3737 // In either case, need to update the alloc types on New Caller.
3738 NewCaller->AllocTypes |= Edge->AllocTypes;
3739
3740 // Now walk the old caller node's caller edges and move Edge's context ids
3741 // over to the corresponding edge into the node (which is created here if
3742 // this is a newly created node). We can tell whether this is a newly created
3743 // node by seeing if it has any caller edges yet.
3744#ifndef NDEBUG
3745 bool IsNewNode = NewCaller->CallerEdges.empty();
3746#endif
3747 // If we just moved a direct recursive edge, presumably its context ids should
3748 // also flow out of OldCaller via some other non-recursive callee edge. We
3749 // don't want to remove the recursive context ids from other caller edges yet,
3750 // otherwise the context ids get into an inconsistent state on OldCaller.
3751 // We will update these context ids on the non-recursive caller edge when and
3752 // if they are updated on the non-recursive callee.
3753 if (!Recursive) {
3754 for (auto &OldCallerEdge : OldCaller->CallerEdges) {
3755 auto OldCallerCaller = OldCallerEdge->Caller;
3756 // The context ids moving to the new caller are the subset of this edge's
3757 // context ids and the context ids on the callee edge being moved.
3758 DenseSet<uint32_t> EdgeContextIdsToMove = set_intersection(
3759 OldCallerEdge->getContextIds(), Edge->getContextIds());
3760 if (OldCaller == OldCallerCaller) {
3761 OldCallerCaller = NewCaller;
3762 // Don't actually move this one. The caller will move it directly via a
3763 // call to this function with this as the Edge if it is appropriate to
3764 // move to a different node that has a matching callee (itself).
3765 continue;
3766 }
3767 set_subtract(OldCallerEdge->getContextIds(), EdgeContextIdsToMove);
3768 OldCallerEdge->AllocTypes =
3769 computeAllocType(OldCallerEdge->getContextIds());
3770 // In this function we expect that any pre-existing node already has edges
3771 // from the same callers as the old node. That should be true in the
3772 // current use case, where we will remove None-type edges after copying
3773 // over all caller edges from the callee.
3774 auto *ExistingCallerEdge = NewCaller->findEdgeFromCaller(OldCallerCaller);
3775 // Since we would have skipped caller edges when moving a direct recursive
3776 // edge, this may not hold true when recursive handling is enabled.
3777 assert(IsNewNode || ExistingCallerEdge || AllowRecursiveCallsites);
3778 if (ExistingCallerEdge) {
3779 ExistingCallerEdge->getContextIds().insert_range(EdgeContextIdsToMove);
3780 ExistingCallerEdge->AllocTypes |=
3781 computeAllocType(EdgeContextIdsToMove);
3782 continue;
3783 }
3784 auto NewEdge = std::make_shared<ContextEdge>(
3785 NewCaller, OldCallerCaller, computeAllocType(EdgeContextIdsToMove),
3786 EdgeContextIdsToMove);
3787 NewCaller->CallerEdges.push_back(NewEdge);
3788 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
3789 }
3790 }
3791 // Recompute the node alloc type now that its caller edges have been
3792 // updated (since we will compute from those edges).
3793 OldCaller->AllocTypes = OldCaller->computeAllocType();
3794 // OldCaller alloc type should be None iff its context id set is now empty.
3795 assert((OldCaller->AllocTypes == (uint8_t)AllocationType::None) ==
3796 OldCaller->emptyContextIds());
3797 if (VerifyCCG) {
3798 checkNode<DerivedCCG, FuncTy, CallTy>(OldCaller, /*CheckEdges=*/false);
3799 checkNode<DerivedCCG, FuncTy, CallTy>(NewCaller, /*CheckEdges=*/false);
3800 for (const auto &OldCallerEdge : OldCaller->CallerEdges)
3801 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallerEdge->Caller,
3802 /*CheckEdges=*/false);
3803 for (const auto &NewCallerEdge : NewCaller->CallerEdges)
3804 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallerEdge->Caller,
3805 /*CheckEdges=*/false);
3806 }
3807}
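// Note on the Recursive case above (node names invented): if the moved edge
// was direct recursive (N -> N) and is moved to caller N', it ends up as a
// direct recursive edge N' -> N' (either by reconnecting both endpoints or by
// merging into an existing self-edge on N'), and the caller-edge replication
// loop above is skipped so that OldCaller's context ids are not left in an
// inconsistent state.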
3808
3809template <typename DerivedCCG, typename FuncTy, typename CallTy>
3810void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3811 recursivelyRemoveNoneTypeCalleeEdges(
3812 ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
3813 auto Inserted = Visited.insert(Node);
3814 if (!Inserted.second)
3815 return;
3816
3817 removeNoneTypeCalleeEdges(Node);
3818
3819 for (auto *Clone : Node->Clones)
3820 recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited);
3821
3822 // The recursive call may remove some of this Node's caller edges.
3823 // Iterate over a copy and skip any that were removed.
3824 auto CallerEdges = Node->CallerEdges;
3825 for (auto &Edge : CallerEdges) {
3826 // Skip any that have been removed by an earlier recursive call.
3827 if (Edge->isRemoved()) {
3828 assert(!is_contained(Node->CallerEdges, Edge));
3829 continue;
3830 }
3831 recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited);
3832 }
3833}
3834
3835// This is the standard DFS based backedge discovery algorithm.
3836template <typename DerivedCCG, typename FuncTy, typename CallTy>
3837void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges() {
3838 // If we are cloning recursive contexts, find and mark backedges from all root
3839 // callers, using the typical DFS based backedge analysis.
3840 if (!CloneRecursiveContexts)
3841 return;
3842 DenseSet<const ContextNode *> Visited;
3843 DenseSet<const ContextNode *> CurrentStack;
3844 for (auto &Entry : NonAllocationCallToContextNodeMap) {
3845 auto *Node = Entry.second;
3846 if (Node->isRemoved())
3847 continue;
3848 // It is a root if it doesn't have callers.
3849 if (!Node->CallerEdges.empty())
3850 continue;
3851 markBackedges(Node, Visited, CurrentStack);
3852 assert(CurrentStack.empty());
3853 }
3854}
3855
3856// Recursive helper for above markBackedges method.
3857template <typename DerivedCCG, typename FuncTy, typename CallTy>
3858void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges(
3859 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3860 DenseSet<const ContextNode *> &CurrentStack) {
3861 auto I = Visited.insert(Node);
3862 // We should only call this for unvisited nodes.
3863 assert(I.second);
3864 (void)I;
3865 for (auto &CalleeEdge : Node->CalleeEdges) {
3866 auto *Callee = CalleeEdge->Callee;
3867 if (Visited.count(Callee)) {
3868 // Since this was already visited we need to check if it is currently on
3869 // the recursive stack in which case it is a backedge.
3870 if (CurrentStack.count(Callee))
3871 CalleeEdge->IsBackedge = true;
3872 continue;
3873 }
3874 CurrentStack.insert(Callee);
3875 markBackedges(Callee, Visited, CurrentStack);
3876 CurrentStack.erase(Callee);
3877 }
3878}
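// Illustrative example of the backedge marking (node names hypothetical): for
// a recursive context R -> A -> B -> A, the DFS from root R pushes A and then
// B onto CurrentStack; when B's callee edge back to A is examined, A is both
// visited and still on CurrentStack, so that edge gets IsBackedge set and is
// deferred/handled specially during identifyClones below.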
3879
3880template <typename DerivedCCG, typename FuncTy, typename CallTy>
3881void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
3882 DenseSet<const ContextNode *> Visited;
3883 for (auto &Entry : AllocationCallToContextNodeMap) {
3884 Visited.clear();
3885 identifyClones(Entry.second, Visited, Entry.second->getContextIds());
3886 }
3887 Visited.clear();
3888 for (auto &Entry : AllocationCallToContextNodeMap)
3889 recursivelyRemoveNoneTypeCalleeEdges(Entry.second, Visited);
3890 if (VerifyCCG)
3891 check();
3892}
3893
3894// helper function to check an AllocType is cold or notcold or both.
3895bool checkColdOrNotCold(uint8_t AllocType) {
3896 return (AllocType == (uint8_t)AllocationType::Cold) ||
3897 (AllocType == (uint8_t)AllocationType::NotCold) ||
3898 (AllocType ==
3899 ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
3900}
3901
3902template <typename DerivedCCG, typename FuncTy, typename CallTy>
3903void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
3904 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3905 const DenseSet<uint32_t> &AllocContextIds) {
3906 if (VerifyNodes)
3907 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3908 assert(!Node->CloneOf);
3909
3910 // If Node has a null call, then either it wasn't found in the module (regular
3911 // LTO) or summary index (ThinLTO), or there were other conditions blocking
3912 // cloning (e.g. recursion, calls multiple targets, etc).
3913 // Do this here so that we don't try to recursively clone callers below, which
3914 // isn't useful at least for this node.
3915 if (!Node->hasCall())
3916 return;
3917
3918 // No need to look at any callers if allocation type already unambiguous.
3919 if (hasSingleAllocType(Node->AllocTypes))
3920 return;
3921
3922#ifndef NDEBUG
3923 auto Insert =
3924#endif
3925 Visited.insert(Node);
3926 // We should not have visited this node yet.
3927 assert(Insert.second);
3928 // The recursive call to identifyClones may delete the current edge from the
3929 // CallerEdges vector. Make a copy and iterate on that, simpler than passing
3930 // in an iterator and having recursive call erase from it. Other edges may
3931 // also get removed during the recursion, which will have null Callee and
3932 // Caller pointers (and are deleted later), so we skip those below.
3933 {
3934 auto CallerEdges = Node->CallerEdges;
3935 for (auto &Edge : CallerEdges) {
3936 // Skip any that have been removed by an earlier recursive call.
3937 if (Edge->isRemoved()) {
3938 assert(!is_contained(Node->CallerEdges, Edge));
3939 continue;
3940 }
3941 // Defer backedges. See comments further below where these edges are
3942 // handled during the cloning of this Node.
3943 if (Edge->IsBackedge) {
3944 // We should only mark these if cloning recursive contexts, where we
3945 // need to do this deferral.
3946 assert(CloneRecursiveContexts);
3947 continue;
3948 }
3949 // Ignore any caller we previously visited via another edge.
3950 if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
3951 identifyClones(Edge->Caller, Visited, AllocContextIds);
3952 }
3953 }
3954 }
3955
3956 // Check if we reached an unambiguous call or have only a single caller.
3957 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
3958 return;
3959
3960 // We need to clone.
3961
3962 // Try to keep the original version as alloc type NotCold. This will make
3963 // cases with indirect calls or any other situation with an unknown call to
3964 // the original function get the default behavior. We do this by sorting the
3965 // CallerEdges of the Node we will clone by alloc type.
3966 //
3967 // Give NotCold edge the lowest sort priority so those edges are at the end of
3968 // the caller edges vector, and stay on the original version (since the below
3969 // code clones greedily until it finds all remaining edges have the same type
3970 // and leaves the remaining ones on the original Node).
3971 //
3972 // We shouldn't actually have any None type edges, so the sorting priority for
3973 // that is arbitrary, and we assert in that case below.
3974 const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
3975 /*Cold*/ 1,
3976 /*NotColdCold*/ 2};
3977 llvm::stable_sort(Node->CallerEdges,
3978 [&](const std::shared_ptr<ContextEdge> &A,
3979 const std::shared_ptr<ContextEdge> &B) {
3980 // Edges with non-empty context ids should be sorted
3981 // before those with empty context ids.
3982 if (A->ContextIds.empty())
3983 // Either B ContextIds are non-empty (in which case we
3984 // should return false because B < A), or B ContextIds
3985 // are empty, in which case they are equal, and we
3986 // should maintain the original relative ordering.
3987 return false;
3988 if (B->ContextIds.empty())
3989 return true;
3990
3991 if (A->AllocTypes == B->AllocTypes)
3992 // Use the first context id for each edge as a
3993 // tie-breaker.
3994 return *A->ContextIds.begin() < *B->ContextIds.begin();
3995 return AllocTypeCloningPriority[A->AllocTypes] <
3996 AllocTypeCloningPriority[B->AllocTypes];
3997 });
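  // For illustration (edge types invented): caller edges with alloc types
  // {NotCold, Cold, NotCold+Cold} are visited in the order Cold,
  // NotCold+Cold, NotCold after this sort, so the cloning loop below peels
  // the cold (and mixed) callers off into clones first and the remaining
  // NotCold callers stay attached to the original Node.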
3998
3999 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
4000
4001 DenseSet<uint32_t> RecursiveContextIds;
4003 // If we are allowing recursive callsites, but have also disabled recursive
4004 // contexts, look for context ids that show up in multiple caller edges.
4005 if (AllowRecursiveCallsites && !CloneRecursiveContexts) {
4006 DenseSet<uint32_t> AllCallerContextIds;
4007 for (auto &CE : Node->CallerEdges) {
4008 // Resize to the largest set of caller context ids, since we know the
4009 // final set will be at least that large.
4010 AllCallerContextIds.reserve(CE->getContextIds().size());
4011 for (auto Id : CE->getContextIds())
4012 if (!AllCallerContextIds.insert(Id).second)
4013 RecursiveContextIds.insert(Id);
4014 }
4015 }
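  // For illustration (ids invented): if context id 7 appears on two different
  // caller edges of Node, that context recurses through this callsite; with
  // recursive callsites allowed but recursive-context cloning disabled, id 7
  // ends up in RecursiveContextIds and is subtracted from
  // CallerEdgeContextsForAlloc below so it does not drive cloning decisions.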
4016
4017 // Iterate until we find no more opportunities for disambiguating the alloc
4018 // types via cloning. In most cases this loop will terminate once the Node
4019 // has a single allocation type, in which case no more cloning is needed.
4020 // Iterate over a copy of Node's caller edges, since we may need to remove
4021 // edges in the moveEdgeTo* methods, and this simplifies the handling and
4022 // makes it less error-prone.
4023 auto CallerEdges = Node->CallerEdges;
4024 for (auto &CallerEdge : CallerEdges) {
4025 // Skip any that have been removed by an earlier recursive call.
4026 if (CallerEdge->isRemoved()) {
4027 assert(!is_contained(Node->CallerEdges, CallerEdge));
4028 continue;
4029 }
4030 assert(CallerEdge->Callee == Node);
4031
4032 // See if cloning the prior caller edge left this node with a single alloc
4033 // type or a single caller. In that case no more cloning of Node is needed.
4034 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
4035 break;
4036
4037 // If the caller was not successfully matched to a call in the IR/summary,
4038 // there is no point in trying to clone for it as we can't update that call.
4039 if (!CallerEdge->Caller->hasCall())
4040 continue;
4041
4042 // Only need to process the ids along this edge pertaining to the given
4043 // allocation.
4044 auto CallerEdgeContextsForAlloc =
4045 set_intersection(CallerEdge->getContextIds(), AllocContextIds);
4046 if (!RecursiveContextIds.empty())
4047 CallerEdgeContextsForAlloc =
4048 set_difference(CallerEdgeContextsForAlloc, RecursiveContextIds);
4049 if (CallerEdgeContextsForAlloc.empty())
4050 continue;
4051
4052 auto CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
4053
4054 // Compute the node callee edge alloc types corresponding to the context ids
4055 // for this caller edge.
4056 std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
4057 CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
4058 for (auto &CalleeEdge : Node->CalleeEdges)
4059 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
4060 CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
4061
4062 // Don't clone if doing so will not disambiguate any alloc types amongst
4063 // caller edges (including the callee edges that would be cloned).
4064 // Otherwise we will simply move all edges to the clone.
4065 //
4066 // First check if by cloning we will disambiguate the caller allocation
4067 // type from node's allocation type. Query allocTypeToUse so that we don't
4068 // bother cloning to distinguish NotCold+Cold from NotCold. Note that
4069 // neither of these should be None type.
4070 //
4071 // Then check if by cloning node at least one of the callee edges will be
4072 // disambiguated by splitting out different context ids.
4073 //
4074 // However, always do the cloning if this is a backedge, in which case we
4075 // have not yet cloned along this caller edge.
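// For example (illustrative case): if a caller's contexts for this allocation
// are NotCold+Cold and allocTypeToUse maps both the caller's and Node's types
// to the same value, and none of Node's callee edge alloc types would change
// from the split, cloning would only relabel the same ambiguity, so the edge
// is skipped (unless it is a deferred backedge).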
4076 assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
4077 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
4078 if (!CallerEdge->IsBackedge &&
4079 allocTypeToUse(CallerAllocTypeForAlloc) ==
4080 allocTypeToUse(Node->AllocTypes) &&
4081 allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
4082 CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
4083 continue;
4084 }
4085
4086 if (CallerEdge->IsBackedge) {
4087 // We should only mark these if cloning recursive contexts, where we
4088 // need to do this deferral.
4089 assert(CloneRecursiveContexts);
4090 DeferredBackedges++;
4091 }
4092
4093 // If this is a backedge, we now do recursive cloning starting from its
4094 // caller since we may have moved unambiguous caller contexts to a clone
4095 // of this Node in a previous iteration of the current loop, giving more
4096 // opportunity for cloning through the backedge. Because we sorted the
4097 // caller edges earlier so that cold caller edges are first, we would have
4098 // visited and cloned this node for any unambiguously cold non-recursive
4099 // callers before any ambiguous backedge callers. Note that we don't do this
4100 // if the caller is already cloned or visited during cloning (e.g. via a
4101 // different context path from the allocation).
4102 // TODO: Can we do better in the case where the caller was already visited?
4103 if (CallerEdge->IsBackedge && !CallerEdge->Caller->CloneOf &&
4104 !Visited.count(CallerEdge->Caller)) {
4105 const auto OrigIdCount = CallerEdge->getContextIds().size();
4106 // Now do the recursive cloning of this backedge's caller, which was
4107 // deferred earlier.
4108 identifyClones(CallerEdge->Caller, Visited, CallerEdgeContextsForAlloc);
4109 removeNoneTypeCalleeEdges(CallerEdge->Caller);
4110 // See if the recursive call to identifyClones moved the context ids to a
4111 // new edge from this node to a clone of caller, and switch to looking at
4112 // that new edge so that we clone Node for the new caller clone.
4113 bool UpdatedEdge = false;
4114 if (OrigIdCount > CallerEdge->getContextIds().size()) {
4115 for (auto E : Node->CallerEdges) {
4116 // Only interested in clones of the current edge's caller.
4117 if (E->Caller->CloneOf != CallerEdge->Caller)
4118 continue;
4119 // See if this edge contains any of the context ids originally on the
4120 // current caller edge.
4121 auto CallerEdgeContextsForAllocNew =
4122 set_intersection(CallerEdgeContextsForAlloc, E->getContextIds());
4123 if (CallerEdgeContextsForAllocNew.empty())
4124 continue;
4125 // Make sure we don't pick a previously existing caller edge of this
4126 // Node, which would be processed on a different iteration of the
4127 // outer loop over the saved CallerEdges.
4128 if (llvm::is_contained(CallerEdges, E))
4129 continue;
4130 // The CallerAllocTypeForAlloc and CalleeEdgeAllocTypesForCallerEdge
4131 // are updated further below for all cases where we just invoked
4132 // identifyClones recursively.
4133 CallerEdgeContextsForAlloc.swap(CallerEdgeContextsForAllocNew);
4134 CallerEdge = E;
4135 UpdatedEdge = true;
4136 break;
4137 }
4138 }
4139 // If cloning removed this edge (and we didn't update it to a new edge
4140 // above), we're done with this edge. It's possible we moved all of the
4141 // context ids to an existing clone, in which case there's no need to do
4142 // further processing for them.
4143 if (CallerEdge->isRemoved())
4144 continue;
4145
4146 // Now we need to update the information used for the cloning decisions
4147 // further below, as we may have modified edges and their context ids.
4148
4149 // Note if we changed the CallerEdge above we would have already updated
4150 // the context ids.
4151 if (!UpdatedEdge) {
4152 CallerEdgeContextsForAlloc = set_intersection(
4153 CallerEdgeContextsForAlloc, CallerEdge->getContextIds());
4154 if (CallerEdgeContextsForAlloc.empty())
4155 continue;
4156 }
4157 // Update the other information that depends on the edges and on the now
4158 // updated CallerEdgeContextsForAlloc.
4159 CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
4160 CalleeEdgeAllocTypesForCallerEdge.clear();
4161 for (auto &CalleeEdge : Node->CalleeEdges) {
4162 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
4163 CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
4164 }
4165 }
4166
4167 // First see if we can use an existing clone. Check each clone and its
4168 // callee edges for matching alloc types.
4169 ContextNode *Clone = nullptr;
4170 for (auto *CurClone : Node->Clones) {
4171 if (allocTypeToUse(CurClone->AllocTypes) !=
4172 allocTypeToUse(CallerAllocTypeForAlloc))
4173 continue;
4174
4175 bool BothSingleAlloc = hasSingleAllocType(CurClone->AllocTypes) &&
4176 hasSingleAllocType(CallerAllocTypeForAlloc);
4177 // The above check should mean that if both have single alloc types, they
4178 // are equal.
4179 assert(!BothSingleAlloc ||
4180 CurClone->AllocTypes == CallerAllocTypeForAlloc);
4181
4182 // If both have a single (and therefore matching) alloc type, or if the
4183 // clone's callee edges have the same alloc types as those for the current
4184 // allocation on Node's callee edges (CalleeEdgeAllocTypesForCallerEdge),
4185 // then we can reuse this clone.
4186 if (BothSingleAlloc || allocTypesMatchClone<DerivedCCG, FuncTy, CallTy>(
4187 CalleeEdgeAllocTypesForCallerEdge, CurClone)) {
4188 Clone = CurClone;
4189 break;
4190 }
4191 }
4192
4193 // The edge iterator is adjusted when we move the CallerEdge to the clone.
4194 if (Clone)
4195 moveEdgeToExistingCalleeClone(CallerEdge, Clone, /*NewClone=*/false,
4196 CallerEdgeContextsForAlloc);
4197 else
4198 Clone = moveEdgeToNewCalleeClone(CallerEdge, CallerEdgeContextsForAlloc);
4199
4200 // Sanity check that no alloc types on clone or its edges are None.
4201 assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
4202 }
4203
4204 // We should still have some context ids on the original Node.
4205 assert(!Node->emptyContextIds());
4206
4207 // Sanity check that no alloc types on node or edges are None.
4208 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
4209
4210 if (VerifyNodes)
4211 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
4212}
4213
4214void ModuleCallsiteContextGraph::updateAllocationCall(
4215 CallInfo &Call, AllocationType AllocType) {
4216 std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
4218 auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
4219 "memprof", AllocTypeString);
4220 cast<CallBase>(Call.call())->addFnAttr(A);
4221 OREGetter(Call.call()->getFunction())
4222 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
4223 << ore::NV("AllocationCall", Call.call()) << " in clone "
4224 << ore::NV("Caller", Call.call()->getFunction())
4225 << " marked with memprof allocation attribute "
4226 << ore::NV("Attribute", AllocTypeString));
4227}
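// For reference (illustrative, not emitted here verbatim): after this update
// the allocation call carries a string function attribute of the form
// "memprof"="cold" (or "memprof"="notcold"), which getAllocationCallType
// below reads back from the attribute list.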
4228
4229void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
4230 AllocationType AllocType) {
4231 auto *AI = cast<AllocInfo *>(Call.call());
4232 assert(AI);
4233 assert(AI->Versions.size() > Call.cloneNo());
4234 AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
4235}
4236
4237AllocationType
4238ModuleCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
4239 const auto *CB = cast<CallBase>(Call.call());
4240 if (!CB->getAttributes().hasFnAttr("memprof"))
4241 return AllocationType::None;
4242 return CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
4243 ? AllocationType::Cold
4244 : AllocationType::NotCold;
4245}
4246
4247AllocationType
4248IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
4249 const auto *AI = cast<AllocInfo *>(Call.call());
4250 assert(AI->Versions.size() > Call.cloneNo());
4251 return (AllocationType)AI->Versions[Call.cloneNo()];
4252}
4253
4254void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
4255 FuncInfo CalleeFunc) {
4256 auto *CurF = getCalleeFunc(CallerCall.call());
4257 auto NewCalleeCloneNo = CalleeFunc.cloneNo();
4258 if (isMemProfClone(*CurF)) {
4259 // If we already assigned this callsite to call a specific non-default
4260 // clone (i.e. not the original function, which is clone 0), ensure that we
4261 // aren't now trying to update it to call a different clone, which would
4262 // indicate a bug in the graph or function assignment.
4263 auto CurCalleeCloneNo = getMemProfCloneNum(*CurF);
4264 if (CurCalleeCloneNo != NewCalleeCloneNo) {
4265 LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
4266 << CurCalleeCloneNo << " now " << NewCalleeCloneNo
4267 << "\n");
4268 MismatchedCloneAssignments++;
4269 }
4270 }
4271 if (NewCalleeCloneNo > 0)
4272 cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
4273 OREGetter(CallerCall.call()->getFunction())
4274 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
4275 << ore::NV("Call", CallerCall.call()) << " in clone "
4276 << ore::NV("Caller", CallerCall.call()->getFunction())
4277 << " assigned to call function clone "
4278 << ore::NV("Callee", CalleeFunc.func()));
4279}
4280
4281void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
4282 FuncInfo CalleeFunc) {
4283 auto *CI = cast<CallsiteInfo *>(CallerCall.call());
4284 assert(CI &&
4285 "Caller cannot be an allocation which should not have profiled calls");
4286 assert(CI->Clones.size() > CallerCall.cloneNo());
4287 auto NewCalleeCloneNo = CalleeFunc.cloneNo();
4288 auto &CurCalleeCloneNo = CI->Clones[CallerCall.cloneNo()];
4289 // If we already assigned this callsite to call a specific non-default
4290 // clone (i.e. not the original function, which is clone 0), ensure that we
4291 // aren't now trying to update it to call a different clone, which would
4292 // indicate a bug in the graph or function assignment.
4293 if (CurCalleeCloneNo != 0 && CurCalleeCloneNo != NewCalleeCloneNo) {
4294 LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
4295 << CurCalleeCloneNo << " now " << NewCalleeCloneNo
4296 << "\n");
4297 MismatchedCloneAssignments++;
4298 }
4299 CurCalleeCloneNo = NewCalleeCloneNo;
4300}
4301
4302// Update the debug information attached to NewFunc to use the clone Name. Note
4303// this needs to be done for both any existing DISubprogram for the definition,
4304// as well as any separate declaration DISubprogram.
4305static void updateSubprogramLinkageName(Function *NewFunc, StringRef Name) {
4306 assert(Name == NewFunc->getName());
4307 auto *SP = NewFunc->getSubprogram();
4308 if (!SP)
4309 return;
4310 auto *MDName = MDString::get(NewFunc->getParent()->getContext(), Name);
4311 SP->replaceLinkageName(MDName);
4312 DISubprogram *Decl = SP->getDeclaration();
4313 if (!Decl)
4314 return;
4315 TempDISubprogram NewDecl = Decl->clone();
4316 NewDecl->replaceLinkageName(MDName);
4317 SP->replaceDeclaration(MDNode::replaceWithUniqued(std::move(NewDecl)));
4318}
4319
4320CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
4321 Instruction *>::FuncInfo
4322ModuleCallsiteContextGraph::cloneFunctionForCallsite(
4323 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
4324 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
4325 // Use existing LLVM facilities for cloning, and for obtaining the Call in the clone.
4326 ValueToValueMapTy VMap;
4327 auto *NewFunc = CloneFunction(Func.func(), VMap);
4328 std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
4329 assert(!Func.func()->getParent()->getFunction(Name));
4330 NewFunc->setName(Name);
4331 updateSubprogramLinkageName(NewFunc, Name);
4332 for (auto &Inst : CallsWithMetadataInFunc) {
4333 // This map always has the initial version in it.
4334 assert(Inst.cloneNo() == 0);
4335 CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
4336 }
4337 OREGetter(Func.func())
4338 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
4339 << "created clone " << ore::NV("NewFunction", NewFunc));
4340 return {NewFunc, CloneNo};
4341}
4342
4343CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
4344 IndexCall>::FuncInfo
4345IndexCallsiteContextGraph::cloneFunctionForCallsite(
4346 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
4347 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
4348 // Check how many clones we have of Call (and therefore function).
4349 // The next clone number is the current size of versions array.
4350 // Confirm this matches the CloneNo provided by the caller, which is based on
4351 // the number of function clones we have.
4352 assert(CloneNo == (isa<AllocInfo *>(Call.call())
4353 ? cast<AllocInfo *>(Call.call())->Versions.size()
4354 : cast<CallsiteInfo *>(Call.call())->Clones.size()));
4355 // Walk all the instructions in this function. Create a new version for
4356 // each (by adding an entry to the Versions/Clones summary array), and copy
4357 // over the version being called for the function clone being cloned here.
4358 // Additionally, add an entry to the CallMap for the new function clone,
4359 // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
4360 // to the new call clone.
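// Illustrative example (hypothetical values): an AllocInfo whose Versions
// array is currently {NotCold} gains a placeholder entry for clone 1 and
// becomes {NotCold, 0}; the placeholder is filled in later by
// updateAllocationCall. A CallsiteInfo's Clones array grows the same way and
// is filled in later by updateCall.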
4361 for (auto &Inst : CallsWithMetadataInFunc) {
4362 // This map always has the initial version in it.
4363 assert(Inst.cloneNo() == 0);
4364 if (auto *AI = dyn_cast<AllocInfo *>(Inst.call())) {
4365 assert(AI->Versions.size() == CloneNo);
4366 // We assign the allocation type later (in updateAllocationCall); here we
4367 // just add a placeholder entry for it.
4368 AI->Versions.push_back(0);
4369 } else {
4370 auto *CI = cast<CallsiteInfo *>(Inst.call());
4371 assert(CI && CI->Clones.size() == CloneNo);
4372 // We assign the clone number later (in updateCall); here we just add a
4373 // placeholder entry for it.
4374 CI->Clones.push_back(0);
4375 }
4376 CallMap[Inst] = {Inst.call(), CloneNo};
4377 }
4378 return {Func.func(), CloneNo};
4379}
4380
4381// We perform cloning for each allocation node separately. However, this
4382// sometimes results in a situation where the same node calls multiple
4383// clones of the same callee, created for different allocations. This
4384// causes issues when assigning functions to these clones, as each node can
4385// in reality only call a single callee clone.
4386//
4387// To address this, before assigning functions, merge callee clone nodes as
4388// needed using a post order traversal from the allocations. We attempt to
4389// use existing clones as the merge node when legal, and to share them
4390// among callers with the same properties (callers calling the same set of
4391// callee clone nodes for the same allocations).
4392//
4393// Without this fix, in some cases incorrect function assignment will lead
4394// to calling the wrong allocation clone.
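// Illustrative example (hypothetical): suppose callsite node C ends up with
// callee edges to two clones A.1 and A.2 of the same original callee A, where
// A.1 was created while cloning for allocation X and A.2 for allocation Y.
// Since the call in C can only target one copy of A, the edges from C must be
// merged onto a single callee node (an existing clone if legal, otherwise a
// new merge node) before function assignment.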
4395template <typename DerivedCCG, typename FuncTy, typename CallTy>
4396void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones() {
4397 if (!MergeClones)
4398 return;
4399
4400 // Generate a map from context id to the associated allocation node for use
4401 // when merging clones.
4402 DenseMap<uint32_t, ContextNode *> ContextIdToAllocationNode;
4403 for (auto &Entry : AllocationCallToContextNodeMap) {
4404 auto *Node = Entry.second;
4405 for (auto Id : Node->getContextIds())
4406 ContextIdToAllocationNode[Id] = Node->getOrigNode();
4407 for (auto *Clone : Node->Clones) {
4408 for (auto Id : Clone->getContextIds())
4409 ContextIdToAllocationNode[Id] = Clone->getOrigNode();
4410 }
4411 }
4412
4413 // Post order traversal starting from allocations to ensure each callsite
4414 // calls a single clone of its callee. Callee nodes that are clones of each
4415 // other are merged (via new merge nodes if needed) to achieve this.
4416 DenseSet<const ContextNode *> Visited;
4417 for (auto &Entry : AllocationCallToContextNodeMap) {
4418 auto *Node = Entry.second;
4419
4420 mergeClones(Node, Visited, ContextIdToAllocationNode);
4421
4422 // Make a copy so the recursive post order traversal that may create new
4423 // clones doesn't mess up iteration. Note that the recursive traversal
4424 // itself does not call mergeClones on any of these nodes, which are all
4425 // (clones of) allocations.
4426 auto Clones = Node->Clones;
4427 for (auto *Clone : Clones)
4428 mergeClones(Clone, Visited, ContextIdToAllocationNode);
4429 }
4430
4431 if (DumpCCG) {
4432 dbgs() << "CCG after merging:\n";
4433 dbgs() << *this;
4434 }
4435 if (ExportToDot)
4436 exportToDot("aftermerge");
4437
4438 if (VerifyCCG) {
4439 check();
4440 }
4441}
4442
4443// Recursive helper for above mergeClones method.
4444template <typename DerivedCCG, typename FuncTy, typename CallTy>
4445void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones(
4446 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
4447 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
4448 auto Inserted = Visited.insert(Node);
4449 if (!Inserted.second)
4450 return;
4451
4452 // Iteratively perform merging on this node to handle new caller nodes created
4453 // during the recursive traversal. We could do something more elegant such as
4454 // maintaining a worklist, but this is a simple approach that doesn't cause a
4455 // measurable compile-time effect, as most nodes don't have many caller
4456 // edges to check.
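// Illustrative example (hypothetical): recursing into an unvisited caller may
// itself merge nodes and add or redirect caller edges of this Node;
// FoundUnvisited then triggers another pass over a fresh copy of the caller
// edge list so any such new callers are processed as well.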
4457 bool FoundUnvisited = true;
4458 unsigned Iters = 0;
4459 while (FoundUnvisited) {
4460 Iters++;
4461 FoundUnvisited = false;
4462 // Make a copy since the recursive call may move a caller edge to a new
4463 // callee, messing up the iterator.
4464 auto CallerEdges = Node->CallerEdges;
4465 for (auto CallerEdge : CallerEdges) {
4466 // Skip any caller edge moved onto a different callee during recursion.
4467 if (CallerEdge->Callee != Node)
4468 continue;
4469 // If we found an unvisited caller, note that we should check the caller
4470 // edges again as mergeClones may add or change caller nodes.
4471 if (DoMergeIteration && !Visited.contains(CallerEdge->Caller))
4472 FoundUnvisited = true;
4473 mergeClones(CallerEdge->Caller, Visited, ContextIdToAllocationNode);
4474 }
4475 }
4476
4477 TotalMergeInvokes++;
4478 TotalMergeIters += Iters;
4479 if (Iters > MaxMergeIters)
4480 MaxMergeIters = Iters;
4481
4482 // Merge for this node after we handle its callers.
4483 mergeNodeCalleeClones(Node, Visited, ContextIdToAllocationNode);
4484}
4485
4486template <typename DerivedCCG, typename FuncTy, typename CallTy>
4487void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeNodeCalleeClones(
4488 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
4489 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
4490 // Ignore Node if we moved all of its contexts to clones.
4491 if (Node->emptyContextIds())
4492 return;
4493
4494 // First identify groups of clones among Node's callee edges, by building
4495 // a map from each callee base node to the associated callee edges from Node.
4496 MapVector<ContextNode *, std::vector<std::shared_ptr<ContextEdge>>>
4497 OrigNodeToCloneEdges;
4498 for (const auto &E : Node->CalleeEdges) {
4499 auto *Callee = E->Callee;
4500 if (!Callee->CloneOf && Callee->Clones.empty())
4501 continue;
4502 ContextNode *Base = Callee->getOrigNode();
4503 OrigNodeToCloneEdges[Base].push_back(E);
4504 }
4505
4506 // Helper for callee edge sorting below. Return true if A's callee has fewer
4507 // caller edges than B's callee, then (on a tie) if A's callee is a clone and
4508 // B's is not, and finally if A's first context id is smaller than B's.
4509 auto CalleeCallerEdgeLessThan = [](const std::shared_ptr<ContextEdge> &A,
4510 const std::shared_ptr<ContextEdge> &B) {
4511 if (A->Callee->CallerEdges.size() != B->Callee->CallerEdges.size())
4512 return A->Callee->CallerEdges.size() < B->Callee->CallerEdges.size();
4513 if (A->Callee->CloneOf && !B->Callee->CloneOf)
4514 return true;
4515 else if (!A->Callee->CloneOf && B->Callee->CloneOf)
4516 return false;
4517 // Use the first context id for each edge as a
4518 // tie-breaker.
4519 return *A->ContextIds.begin() < *B->ContextIds.begin();
4520 };
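// Illustrative example (hypothetical counts): for callee clones with caller
// edge counts {3, 1, 2}, the sort below places the clone with a single caller
// first, making it the cheapest candidate to reuse directly as the merge node
// (it has no other callers that would need to be moved).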
4521
4522 // Process each set of callee clones called by Node, performing the needed
4523 // merging.
4524 for (auto Entry : OrigNodeToCloneEdges) {
4525 // CalleeEdges is the set of edges from Node reaching callees that are
4526 // mutual clones of each other.
4527 auto &CalleeEdges = Entry.second;
4528 auto NumCalleeClones = CalleeEdges.size();
4529 // A single edge means there is no merging needed.
4530 if (NumCalleeClones == 1)
4531 continue;
4532 // Sort the CalleeEdges calling this group of clones in ascending order of
4533 // their caller edge counts, putting the original non-clone node first in
4534 // cases of a tie. This simplifies finding an existing node to use as the
4535 // merge node.
4536 llvm::stable_sort(CalleeEdges, CalleeCallerEdgeLessThan);
4537
4538 /// Find other callers of the given set of callee edges that can
4539 /// share the same callee merge node. See the comments at this method
4540 /// definition for details.
4541 DenseSet<ContextNode *> OtherCallersToShareMerge;
4542 findOtherCallersToShareMerge(Node, CalleeEdges, ContextIdToAllocationNode,
4543 OtherCallersToShareMerge);
4544
4545 // Now do the actual merging. Identify existing or create a new MergeNode
4546 // during the first iteration. Move each callee over, along with edges from
4547 // other callers we've determined above can share the same merge node.
4548 ContextNode *MergeNode = nullptr;
4549 DenseMap<ContextNode *, unsigned> CallerToMoveCount;
4550 for (auto CalleeEdge : CalleeEdges) {
4551 auto *OrigCallee = CalleeEdge->Callee;
4552 // If we don't have a MergeNode yet (only happens on the first iteration,
4553 // as a new one will be created when we go to move the first callee edge
4554 // over as needed), see if we can use this callee.
4555 if (!MergeNode) {
4556 // If there are no other callers, simply use this callee.
4557 if (CalleeEdge->Callee->CallerEdges.size() == 1) {
4558 MergeNode = OrigCallee;
4559 NonNewMergedNodes++;
4560 continue;
4561 }
4562 // Otherwise, if we have identified other caller nodes that can share
4563 // the merge node with Node, see if all of OrigCallee's callers are
4564 // going to share the same merge node. In that case we can use callee
4565 // (since all of its callers would move to the new merge node).
4566 if (!OtherCallersToShareMerge.empty()) {
4567 bool MoveAllCallerEdges = true;
4568 for (auto CalleeCallerE : OrigCallee->CallerEdges) {
4569 if (CalleeCallerE == CalleeEdge)
4570 continue;
4571 if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller)) {
4572 MoveAllCallerEdges = false;
4573 break;
4574 }
4575 }
4576 // If we are going to move all callers over, we can use this callee as
4577 // the MergeNode.
4578 if (MoveAllCallerEdges) {
4579 MergeNode = OrigCallee;
4580 NonNewMergedNodes++;
4581 continue;
4582 }
4583 }
4584 }
4585 // Move this callee edge, creating a new merge node if necessary.
4586 if (MergeNode) {
4587 assert(MergeNode != OrigCallee);
4588 moveEdgeToExistingCalleeClone(CalleeEdge, MergeNode,
4589 /*NewClone*/ false);
4590 } else {
4591 MergeNode = moveEdgeToNewCalleeClone(CalleeEdge);
4592 NewMergedNodes++;
4593 }
4594 // Now move all identified edges from other callers over to the merge node
4595 // as well.
4596 if (!OtherCallersToShareMerge.empty()) {
4597 // Make and iterate over a copy of OrigCallee's caller edges, because
4598 // some of these will be moved off of OrigCallee, which would otherwise
4599 // mess up the iteration over them.
4600 auto OrigCalleeCallerEdges = OrigCallee->CallerEdges;
4601 for (auto &CalleeCallerE : OrigCalleeCallerEdges) {
4602 if (CalleeCallerE == CalleeEdge)
4603 continue;
4604 if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller))
4605 continue;
4606 CallerToMoveCount[CalleeCallerE->Caller]++;
4607 moveEdgeToExistingCalleeClone(CalleeCallerE, MergeNode,
4608 /*NewClone*/ false);
4609 }
4610 }
4611 removeNoneTypeCalleeEdges(OrigCallee);
4612 removeNoneTypeCalleeEdges(MergeNode);
4613 }
4614 }
4615}
4616
4617// Look for other nodes that have edges to the same set of callee
4618// clones as the current Node. Those can share the eventual merge node
4619// (reducing cloning and binary size overhead) iff:
4620// - they have edges to the same set of callee clones
4621// - each callee edge reaches a subset of the same allocations as Node's
4622// corresponding edge to the same callee clone.
4623// The second requirement is to ensure that we don't undo any of the
4624// necessary cloning to distinguish contexts with different allocation
4625// behavior.
4626// FIXME: This is somewhat conservative, as we really just need to ensure
4627// that they don't reach the same allocations as contexts on edges from Node
4628// going to any of the *other* callee clones being merged. However, that
4629// requires more tracking and checking to get right.
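// Illustrative example (hypothetical): if Node and another caller M both have
// edges to the same two callee clones A.1 and A.2, and every context on M's
// edges reaches only allocations also reached by Node's corresponding edge to
// the same clone, then M can share the merge node created for Node rather
// than forcing additional cloning later during function assignment.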
4630template <typename DerivedCCG, typename FuncTy, typename CallTy>
4631void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
4632 findOtherCallersToShareMerge(
4633 ContextNode *Node,
4634 std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges,
4635 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode,
4636 DenseSet<ContextNode *> &OtherCallersToShareMerge) {
4637 auto NumCalleeClones = CalleeEdges.size();
4638 // For each other caller node, this map counts how many of its callee edges
4639 // reach one of the callee clones called by Node.
4640 DenseMap<ContextNode *, unsigned> OtherCallersToSharedCalleeEdgeCount;
4641 // Counts the number of other caller nodes that have edges to all callee
4642 // clones that don't violate the allocation context checking.
4643 unsigned PossibleOtherCallerNodes = 0;
4644
4645 // We only need to look at other Caller nodes if the first callee edge has
4646 // multiple callers (recall they are sorted in ascending order above).
4647 if (CalleeEdges[0]->Callee->CallerEdges.size() < 2)
4648 return;
4649
4650 // For each callee edge:
4651 // - Collect the count of other caller nodes calling the same callees.
4652 // - Collect the alloc nodes reached by contexts on each callee edge.
4653 DenseMap<ContextEdge *, DenseSet<ContextNode *>> CalleeEdgeToAllocNodes;
4654 for (auto CalleeEdge : CalleeEdges) {
4655 assert(CalleeEdge->Callee->CallerEdges.size() > 1);
4656 // For each other caller of the same callee, increment the count of
4657 // edges reaching the same callee clone.
4658 for (auto CalleeCallerEdges : CalleeEdge->Callee->CallerEdges) {
4659 if (CalleeCallerEdges->Caller == Node) {
4660 assert(CalleeCallerEdges == CalleeEdge);
4661 continue;
4662 }
4663 OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller]++;
4664 // If this caller now has edges to all of the same callee clones,
4665 // increment the count of candidate other caller nodes.
4666 if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller] ==
4667 NumCalleeClones)
4668 PossibleOtherCallerNodes++;
4669 }
4670 // Collect the alloc nodes reached by contexts on each callee edge, for
4671 // later analysis.
4672 for (auto Id : CalleeEdge->getContextIds()) {
4673 auto *Alloc = ContextIdToAllocationNode.lookup(Id);
4674 if (!Alloc) {
4675 // FIXME: unclear why this happens occasionally, presumably due to
4676 // imperfect graph updates, possibly involving recursion.
4677 MissingAllocForContextId++;
4678 continue;
4679 }
4680 CalleeEdgeToAllocNodes[CalleeEdge.get()].insert(Alloc);
4681 }
4682 }
4683
4684 // Now walk the callee edges again, and make sure that for each candidate
4685 // caller node all of its edges to the callees reach the same allocs (or
4686 // a subset) as those along the corresponding callee edge from Node.
4687 for (auto CalleeEdge : CalleeEdges) {
4688 assert(CalleeEdge->Callee->CallerEdges.size() > 1);
4689 // Stop if we do not have any (more) candidate other caller nodes.
4690 if (!PossibleOtherCallerNodes)
4691 break;
4692 auto &CurCalleeAllocNodes = CalleeEdgeToAllocNodes[CalleeEdge.get()];
4693 // Check each other caller of this callee clone.
4694 for (auto &CalleeCallerE : CalleeEdge->Callee->CallerEdges) {
4695 // Not interested in the callee edge from Node itself.
4696 if (CalleeCallerE == CalleeEdge)
4697 continue;
4698 // Skip any callers that didn't have callee edges to all the same
4699 // callee clones.
4700 if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] !=
4701 NumCalleeClones)
4702 continue;
4703 // Make sure that each context along the edge from the candidate caller
4704 // node reaches an allocation also reached by this callee edge from Node.
4705 for (auto Id : CalleeCallerE->getContextIds()) {
4706 auto *Alloc = ContextIdToAllocationNode.lookup(Id);
4707 if (!Alloc)
4708 continue;
4709 // If not, simply reset the map entry to 0 so the caller is ignored, and
4710 // reduce the count of candidate other caller nodes.
4711 if (!CurCalleeAllocNodes.contains(Alloc)) {
4712 OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] = 0;
4713 PossibleOtherCallerNodes--;
4714 break;
4715 }
4716 }
4717 }
4718 }
4719
4720 if (!PossibleOtherCallerNodes)
4721 return;
4722
4723 // Build the set of other caller nodes that can use the same callee merge
4724 // node.
4725 for (auto &[OtherCaller, Count] : OtherCallersToSharedCalleeEdgeCount) {
4726 if (Count != NumCalleeClones)
4727 continue;
4728 OtherCallersToShareMerge.insert(OtherCaller);
4729 }
4730}
4731
4732// This method assigns cloned callsites to functions, cloning the functions as
4733// needed. The assignment is greedy and proceeds roughly as follows:
4734//
4735// For each function Func:
4736// For each call with graph Node having clones:
4737// Initialize ClonesWorklist to Node and its clones
4738// Initialize NodeCloneCount to 0
4739// While ClonesWorklist is not empty:
4740// Clone = pop front ClonesWorklist
4741// NodeCloneCount++
4742// If Func has been cloned less than NodeCloneCount times:
4743// If NodeCloneCount is 1:
4744// Assign Clone to original Func
4745// Continue
4746// Create a new function clone
4747// If other callers not assigned to call a function clone yet:
4748// Assign them to call new function clone
4749// Continue
4750// Assign any other caller calling the cloned version to new clone
4751//
4752// For each caller of Clone:
4753// If caller is assigned to call a specific function clone:
4754// If we cannot assign Clone to that function clone:
4755// Create new callsite Clone NewClone
4756// Add NewClone to ClonesWorklist
4757// Continue
4758// Assign Clone to existing caller's called function clone
4759// Else:
4760// If Clone not already assigned to a function clone:
4761// Assign to first function clone without assignment
4762// Assign caller to selected function clone
4763// For each call with graph Node having clones:
4764// If number func clones > number call's callsite Node clones:
4765// Record func CallInfo clones without Node clone in UnassignedCallClones
4766// For callsite Nodes in DFS order from allocations:
4767// If IsAllocation:
4768// Update allocation with alloc type
4769// Else:
4770// For Call, all MatchingCalls, and associated UnassignedCallClones:
4771// Update call to call recorded callee clone
4772//
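// Illustrative walk-through (hypothetical names): if a callsite node in
// function Foo has one clone, the original node is assigned to the original
// Foo, while the clone triggers creation of a function clone (named roughly
// Foo.memprof.1 by the cloning helpers) and the callers reaching that
// callsite clone are redirected to call the new function clone instead.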
4773template <typename DerivedCCG, typename FuncTy, typename CallTy>
4774bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
4775 bool Changed = false;
4776
4777 mergeClones();
4778
4779 // Keep track of the assignment of nodes (callsites) to function clones they
4780 // call.
4781 DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
4782
4783 // Update caller node to call function version CalleeFunc, by recording the
4784 // assignment in CallsiteToCalleeFuncCloneMap.
4785 auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
4786 const FuncInfo &CalleeFunc) {
4787 assert(Caller->hasCall());
4788 CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
4789 };
4790
4791 // Information for a single clone of this Func.
4792 struct FuncCloneInfo {
4793 // The function clone.
4794 FuncInfo FuncClone;
4795 // Remappings of each call of interest (from original uncloned call to the
4796 // corresponding cloned call in this function clone).
4797 DenseMap<CallInfo, CallInfo> CallMap;
4798 };
4799
4800 // Map to keep track of information needed to update calls in function clones
4801 // when their corresponding callsite node was not itself cloned for that
4802 // function clone. Because of call context pruning (i.e. we only keep as much
4803 // caller information as needed to distinguish hot vs cold), we may not have
4804 // caller edges coming to each callsite node from all possible function
4805 // callers. A function clone may get created for other callsites in the
4806 // function for which there are caller edges that were not pruned. Any other
4807 // callsites in that function clone, which were not themselves cloned for
4808 // that function clone, should get updated the same way as the corresponding
4809 // callsite in the original function (which may call a clone of its callee).
4810 //
4811 // We build this map after completing function cloning for each function, so
4812 // that we can record the information from its call maps before they are
4813 // destructed. The map will be used as we update calls to update any still
4814 // unassigned call clones. Note that we may create new node clones as we clone
4815 // other functions, so later on we check which node clones were still not
4816 // created. To this end, the inner map is a map from function clone number to
4817 // the list of calls cloned for that function (can be more than one due to the
4818 // Node's MatchingCalls array).
4819 //
4820 // The alternative is creating new callsite clone nodes below as we clone the
4821 // function, but that is trickier to get right and likely adds more overhead.
4822 //
4823 // Inner map is a std::map so sorted by key (clone number), in order to get
4824 // ordered remarks in the full LTO case.
4825 DenseMap<const ContextNode *, std::map<unsigned, SmallVector<CallInfo, 0>>>
4826 UnassignedCallClones;
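// Illustrative shape (hypothetical contents): UnassignedCallClones[Node]
// might map function clone number 2 to the call clones of Node's call (and
// its MatchingCalls) that live in that function clone, so they can later be
// updated to call the same callee clone as the original Node's call.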
4827
4828 // Walk all functions for which we saw calls with memprof metadata, and handle
4829 // cloning for each of their calls.
4830 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
4831 FuncInfo OrigFunc(Func);
4832 // Map from each clone number of OrigFunc to information about that function
4833 // clone (the function clone FuncInfo and call remappings). The index into
4834 // the vector is the clone number, as function clones are created and
4835 // numbered sequentially.
4836 std::vector<FuncCloneInfo> FuncCloneInfos;
4837 for (auto &Call : CallsWithMetadata) {
4838 ContextNode *Node = getNodeForInst(Call);
4839 // Skip call if we do not have a node for it (all uses of its stack ids
4840 // were either on inlined chains or pruned from the MIBs), or if we did
4841 // not create any clones for it.
4842 if (!Node || Node->Clones.empty())
4843 continue;
4844 assert(Node->hasCall() &&
4845 "Not having a call should have prevented cloning");
4846
4847 // Track the assignment of function clones to clones of the current
4848 // callsite Node being handled.
4849 std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
4850
4851 // Assign callsite version CallsiteClone to function version FuncClone,
4852 // and also assign (possibly cloned) Call to CallsiteClone.
4853 auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
4854 CallInfo &Call,
4855 ContextNode *CallsiteClone,
4856 bool IsAlloc) {
4857 // Record the clone of callsite node assigned to this function clone.
4858 FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
4859
4860 assert(FuncCloneInfos.size() > FuncClone.cloneNo());
4861 DenseMap<CallInfo, CallInfo> &CallMap =
4862 FuncCloneInfos[FuncClone.cloneNo()].CallMap;
4863 CallInfo CallClone(Call);
4864 if (auto It = CallMap.find(Call); It != CallMap.end())
4865 CallClone = It->second;
4866 CallsiteClone->setCall(CallClone);
4867 // Need to do the same for all matching calls.
4868 for (auto &MatchingCall : Node->MatchingCalls) {
4869 CallInfo CallClone(MatchingCall);
4870 if (auto It = CallMap.find(MatchingCall); It != CallMap.end())
4871 CallClone = It->second;
4872 // Updates the call in the list.
4873 MatchingCall = CallClone;
4874 }
4875 };
4876
4877 // Invokes moveEdgeToNewCalleeClone which creates a new clone, and then
4878 // performs the necessary fixups (removing none type edges, and
4879 // importantly, propagating any function call assignment of the original
4880 // node to the new clone).
4881 auto MoveEdgeToNewCalleeCloneAndSetUp =
4882 [&](const std::shared_ptr<ContextEdge> &Edge) {
4883 ContextNode *OrigCallee = Edge->Callee;
4884 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge);
4885 removeNoneTypeCalleeEdges(NewClone);
4886 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
4887 // If the original Callee was already assigned to call a specific
4888 // function version, make sure its new clone is assigned to call
4889 // that same function clone.
4890 if (CallsiteToCalleeFuncCloneMap.count(OrigCallee))
4891 RecordCalleeFuncOfCallsite(
4892 NewClone, CallsiteToCalleeFuncCloneMap[OrigCallee]);
4893 return NewClone;
4894 };
4895
4896 // Keep track of the clones of callsite Node that need to be assigned to
4897 // function clones. This list may be expanded in the loop body below if we
4898 // find additional cloning is required.
4899 std::deque<ContextNode *> ClonesWorklist;
4900 // Ignore original Node if we moved all of its contexts to clones.
4901 if (!Node->emptyContextIds())
4902 ClonesWorklist.push_back(Node);
4903 llvm::append_range(ClonesWorklist, Node->Clones);
4904
4905 // Now walk through all of the clones of this callsite Node that we need,
4906 // and determine the assignment to a corresponding clone of the current
4907 // function (creating new function clones as needed).
4908 unsigned NodeCloneCount = 0;
4909 while (!ClonesWorklist.empty()) {
4910 ContextNode *Clone = ClonesWorklist.front();
4911 ClonesWorklist.pop_front();
4912 NodeCloneCount++;
4913 if (VerifyNodes)
4914 checkNode<DerivedCCG, FuncTy, CallTy>(Clone, /*CheckEdges=*/false);
4915
4916 // Need to create a new function clone if we have more callsite clones
4917 // than existing function clones, which would have been assigned to an
4918 // earlier clone in the list (we assign callsite clones to function
4919 // clones greedily).
4920 if (FuncCloneInfos.size() < NodeCloneCount) {
4921 // If this is the first callsite copy, assign to original function.
4922 if (NodeCloneCount == 1) {
4923 // Since FuncCloneInfos is empty in this case, no clones have
4924 // been created for this function yet, and no callers should have
4925 // been assigned a function clone for this callee node yet.
4926 assert(llvm::none_of(
4927 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
4928 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
4929 }));
4930 // Initialize with empty call map, assign Clone to original function
4931 // and its callers, and skip to the next clone.
4932 FuncCloneInfos.push_back(
4933 {OrigFunc, DenseMap<CallInfo, CallInfo>()});
4934 AssignCallsiteCloneToFuncClone(
4935 OrigFunc, Call, Clone,
4936 AllocationCallToContextNodeMap.count(Call));
4937 for (auto &CE : Clone->CallerEdges) {
4938 // Ignore any caller that does not have a recorded callsite Call.
4939 if (!CE->Caller->hasCall())
4940 continue;
4941 RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
4942 }
4943 continue;
4944 }
4945
4946 // First locate which copy of OrigFunc to clone again. If a caller
4947 // of this callsite clone was already assigned to call a particular
4948 // function clone, we need to redirect all of those callers to the
4949 // new function clone, and update their other callees within this
4950 // function.
4951 FuncInfo PreviousAssignedFuncClone;
4952 auto EI = llvm::find_if(
4953 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
4954 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
4955 });
4956 bool CallerAssignedToCloneOfFunc = false;
4957 if (EI != Clone->CallerEdges.end()) {
4958 const std::shared_ptr<ContextEdge> &Edge = *EI;
4959 PreviousAssignedFuncClone =
4960 CallsiteToCalleeFuncCloneMap[Edge->Caller];
4961 CallerAssignedToCloneOfFunc = true;
4962 }
4963
4964 // Clone function and save it along with the CallInfo map created
4965 // during cloning in the FuncCloneInfos.
4966 DenseMap<CallInfo, CallInfo> NewCallMap;
4967 unsigned CloneNo = FuncCloneInfos.size();
4968 assert(CloneNo > 0 && "Clone 0 is the original function, which "
4969 "should already exist in the map");
4970 FuncInfo NewFuncClone = cloneFunctionForCallsite(
4971 OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
4972 FuncCloneInfos.push_back({NewFuncClone, std::move(NewCallMap)});
4973 FunctionClonesAnalysis++;
4974 Changed = true;
4975
4976 // If no caller callsites were already assigned to a clone of this
4977 // function, we can simply assign this clone to the new func clone
4978 // and update all callers to it, then skip to the next clone.
4979 if (!CallerAssignedToCloneOfFunc) {
4980 AssignCallsiteCloneToFuncClone(
4981 NewFuncClone, Call, Clone,
4982 AllocationCallToContextNodeMap.count(Call));
4983 for (auto &CE : Clone->CallerEdges) {
4984 // Ignore any caller that does not have a recorded callsite Call.
4985 if (!CE->Caller->hasCall())
4986 continue;
4987 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
4988 }
4989 continue;
4990 }
4991
4992 // We may need to do additional node cloning in this case.
4993 // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
4994 // that were previously assigned to call PreviousAssignedFuncClone,
4995 // to record that they now call NewFuncClone.
4996 // The none type edge removal may remove some of this Clone's caller
4997 // edges, if it is reached via another of its caller's callees.
4998 // Iterate over a copy and skip any that were removed.
4999 auto CallerEdges = Clone->CallerEdges;
5000 for (auto CE : CallerEdges) {
5001 // Skip any that have been removed on an earlier iteration.
5002 if (CE->isRemoved()) {
5003 assert(!is_contained(Clone->CallerEdges, CE));
5004 continue;
5005 }
5006 assert(CE);
5007 // Ignore any caller that does not have a recorded callsite Call.
5008 if (!CE->Caller->hasCall())
5009 continue;
5010
5011 if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
5012 // We subsequently fall through to later handling that
5013 // will perform any additional cloning required for
5014 // callers that were calling other function clones.
5015 CallsiteToCalleeFuncCloneMap[CE->Caller] !=
5016 PreviousAssignedFuncClone)
5017 continue;
5018
5019 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
5020
5021 // If we are cloning a function that was already assigned to some
5022 // callers, then essentially we are creating new callsite clones
5023 // of the other callsites in that function that are reached by those
5024 // callers. Clone the other callees of the current callsite's caller
5025 // that were already assigned to PreviousAssignedFuncClone
5026 // accordingly. This is important since we subsequently update the
5027 // calls from the nodes in the graph and their assignments to callee
5028 // functions recorded in CallsiteToCalleeFuncCloneMap.
5029 // The none type edge removal may remove some of this caller's
5030 // callee edges, if it is reached via another of its callees.
5031 // Iterate over a copy and skip any that were removed.
5032 auto CalleeEdges = CE->Caller->CalleeEdges;
5033 for (auto CalleeEdge : CalleeEdges) {
5034 // Skip any that have been removed on an earlier iteration when
5035 // cleaning up newly None type callee edges.
5036 if (CalleeEdge->isRemoved()) {
5037 assert(!is_contained(CE->Caller->CalleeEdges, CalleeEdge));
5038 continue;
5039 }
5040 assert(CalleeEdge);
5041 ContextNode *Callee = CalleeEdge->Callee;
5042 // Skip the current callsite; we are looking for other
5043 // callsites Caller calls, as well as any that do not have a
5044 // recorded callsite Call.
5045 if (Callee == Clone || !Callee->hasCall())
5046 continue;
5047 // Skip direct recursive calls. We don't need/want to clone the
5048 // caller node again, and this loop would not behave as expected if
5049 // we tried.
5050 if (Callee == CalleeEdge->Caller)
5051 continue;
5052 ContextNode *NewClone =
5053 MoveEdgeToNewCalleeCloneAndSetUp(CalleeEdge);
5054 // Moving the edge may have resulted in some none type
5055 // callee edges on the original Callee.
5056 removeNoneTypeCalleeEdges(Callee);
5057 // Update NewClone with the new Call clone of this callsite's Call
5058 // created for the new function clone created earlier.
5059 // Recall that we have already ensured when building the graph
5060 // that each caller can only call callsites within the same
5061 // function, so we are guaranteed that Callee Call is in the
5062 // current OrigFunc.
5063 // CallMap is set up as indexed by original Call at clone 0.
5064 CallInfo OrigCall(Callee->getOrigNode()->Call);
5065 OrigCall.setCloneNo(0);
5066 DenseMap<CallInfo, CallInfo> &CallMap =
5067 FuncCloneInfos[NewFuncClone.cloneNo()].CallMap;
5068 assert(CallMap.count(OrigCall));
5069 CallInfo NewCall(CallMap[OrigCall]);
5070 assert(NewCall);
5071 NewClone->setCall(NewCall);
5072 // Need to do the same for all matching calls.
5073 for (auto &MatchingCall : NewClone->MatchingCalls) {
5074 CallInfo OrigMatchingCall(MatchingCall);
5075 OrigMatchingCall.setCloneNo(0);
5076 assert(CallMap.count(OrigMatchingCall));
5077 CallInfo NewCall(CallMap[OrigMatchingCall]);
5078 assert(NewCall);
5079 // Updates the call in the list.
5080 MatchingCall = NewCall;
5081 }
5082 }
5083 }
5084 // Fall through to handling below to perform the recording of the
5085 // function for this callsite clone. This enables handling of cases
5086 // where the callers were assigned to different clones of a function.
5087 }
5088
5089 auto FindFirstAvailFuncClone = [&]() {
5090 // Find first function in FuncCloneInfos without an assigned
5091 // clone of this callsite Node. We should always have one
5092 // available at this point due to the earlier cloning when the
5093 // FuncCloneInfos size was smaller than the clone number.
5094 for (auto &CF : FuncCloneInfos) {
5095 if (!FuncCloneToCurNodeCloneMap.count(CF.FuncClone))
5096 return CF.FuncClone;
5097 }
5099 "Expected an available func clone for this callsite clone");
5100 };
5101
5102 // See if we can use existing function clone. Walk through
5103 // all caller edges to see if any have already been assigned to
5104 // a clone of this callsite's function. If we can use it, do so. If not,
5105 // because that function clone is already assigned to a different clone
5106 // of this callsite, then we need to clone again.
5107 // Basically, this checking is needed to handle the case where different
5108 // caller functions/callsites may need versions of this function
5109 // containing different mixes of callsite clones across the different
5110 // callsites within the function. If that happens, we need to create
5111 // additional function clones to handle the various combinations.
5112 //
5113 // Keep track of any new clones of this callsite created by the
5114 // following loop, as well as any existing clone that we decided to
5115 // assign this clone to.
5116 std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
5117 FuncInfo FuncCloneAssignedToCurCallsiteClone;
5118 // Iterate over a copy of Clone's caller edges, since we may need to
5119 // remove edges in the moveEdgeTo* methods, and this simplifies the
5120 // handling and makes it less error-prone.
5121 auto CloneCallerEdges = Clone->CallerEdges;
5122 for (auto &Edge : CloneCallerEdges) {
5123 // Skip removed edges (due to direct recursive edges updated when
5124 // updating callee edges when moving an edge and subsequently
5125 // removed by call to removeNoneTypeCalleeEdges on the Clone).
5126 if (Edge->isRemoved())
5127 continue;
5128 // Ignore any caller that does not have a recorded callsite Call.
5129 if (!Edge->Caller->hasCall())
5130 continue;
5131 // If this caller was already assigned to call a version of OrigFunc, we
5132 // need to ensure we can assign this callsite clone to that function clone.
5133 if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
5134 FuncInfo FuncCloneCalledByCaller =
5135 CallsiteToCalleeFuncCloneMap[Edge->Caller];
5136 // First we need to confirm that this function clone is available
5137 // for use by this callsite node clone.
5138 //
5139 // While FuncCloneToCurNodeCloneMap is built only for this Node and
5140 // its callsite clones, one of those callsite clones X could have
5141 // been assigned to the same function clone called by Edge's caller
5142 // - if Edge's caller calls another callsite within Node's original
5143 // function, and that callsite has another caller reaching clone X.
5144 // We need to clone Node again in this case.
5145 if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
5146 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
5147 Clone) ||
5148 // Detect when we have multiple callers of this callsite that
5149 // have already been assigned to specific, and different, clones
5150 // of OrigFunc (due to other unrelated callsites in Func they
5151 // reach via call contexts). Is this Clone of callsite Node
5152 // assigned to a different clone of OrigFunc? If so, clone Node
5153 // again.
5154 (FuncCloneAssignedToCurCallsiteClone &&
5155 FuncCloneAssignedToCurCallsiteClone !=
5156 FuncCloneCalledByCaller)) {
5157 // We need to use a different newly created callsite clone, in
5158 // order to assign it to another new function clone on a
5159 // subsequent iteration over the Clones array (adjusted below).
5160 // Note we specifically do not reset the
5161 // CallsiteToCalleeFuncCloneMap entry for this caller, so that
5162 // when this new clone is processed later we know which version of
5163 // the function to copy (so that other callsite clones we have
5164 // assigned to that function clone are properly cloned over). See
5165 // comments in the function cloning handling earlier.
5166
5167 // Check if we already have cloned this callsite again while
5168 // walking through caller edges, for a caller calling the same
5169 // function clone. If so, we can move this edge to that new clone
5170 // rather than creating yet another new clone.
5171 if (FuncCloneToNewCallsiteCloneMap.count(
5172 FuncCloneCalledByCaller)) {
5173 ContextNode *NewClone =
5174 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
5175 moveEdgeToExistingCalleeClone(Edge, NewClone);
5176 // Cleanup any none type edges cloned over.
5177 removeNoneTypeCalleeEdges(NewClone);
5178 } else {
5179 // Create a new callsite clone.
5180 ContextNode *NewClone = MoveEdgeToNewCalleeCloneAndSetUp(Edge);
5181 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
5182 NewClone;
5183 // Add to list of clones and process later.
5184 ClonesWorklist.push_back(NewClone);
5185 }
5186 // Moving the caller edge may have resulted in some none type
5187 // callee edges.
5188 removeNoneTypeCalleeEdges(Clone);
5189 // We will handle the newly created callsite clone in a subsequent
5190 // iteration over this Node's Clones.
5191 continue;
5192 }
5193
5194 // Otherwise, we can use the function clone already assigned to this
5195 // caller.
5196 if (!FuncCloneAssignedToCurCallsiteClone) {
5197 FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
5198 // Assign Clone to FuncCloneCalledByCaller
5199 AssignCallsiteCloneToFuncClone(
5200 FuncCloneCalledByCaller, Call, Clone,
5201 AllocationCallToContextNodeMap.count(Call));
5202 } else
5203 // Don't need to do anything - callsite is already calling this
5204 // function clone.
5205 assert(FuncCloneAssignedToCurCallsiteClone ==
5206 FuncCloneCalledByCaller);
5207
5208 } else {
5209 // We have not already assigned this caller to a version of
5210 // OrigFunc. Do the assignment now.
5211
5212 // First check if we have already assigned this callsite clone to a
5213 // clone of OrigFunc for another caller during this iteration over
5214 // its caller edges.
5215 if (!FuncCloneAssignedToCurCallsiteClone) {
5216 FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
5217 assert(FuncCloneAssignedToCurCallsiteClone);
5218 // Assign Clone to FuncCloneAssignedToCurCallsiteClone
5219 AssignCallsiteCloneToFuncClone(
5220 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
5221 AllocationCallToContextNodeMap.count(Call));
5222 } else
5223 assert(FuncCloneToCurNodeCloneMap
5224 [FuncCloneAssignedToCurCallsiteClone] == Clone);
5225 // Update callers to record function version called.
5226 RecordCalleeFuncOfCallsite(Edge->Caller,
5227 FuncCloneAssignedToCurCallsiteClone);
5228 }
5229 }
5230 // If we didn't assign a function clone to this callsite clone yet, e.g.
5231 // none of its callers has a non-null call, do the assignment here.
5232 // We want to ensure that every callsite clone is assigned to some
5233 // function clone, so that the call updates below work as expected.
5234 // In particular if this is the original callsite, we want to ensure it
5235 // is assigned to the original function, otherwise the original function
5236 // will appear available for assignment to other callsite clones,
5237 // leading to unintended effects. For one, the unknown and not updated
5238 // callers will call into cloned paths leading to the wrong hints,
5239 // because they still call the original function (clone 0). Also,
5240 // because all callsites start out as being clone 0 by default, we can't
5241 // easily distinguish between callsites explicitly assigned to clone 0
5242 // vs those never assigned, which can lead to multiple updates of the
5243 // calls when invoking updateCall below, with mismatched clone values.
5244 // TODO: Add a flag to the callsite nodes or some other mechanism to
5245 // better distinguish and identify callsite clones that are not getting
5246 // assigned to function clones as expected.
5247 if (!FuncCloneAssignedToCurCallsiteClone) {
5248 FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
5249 assert(FuncCloneAssignedToCurCallsiteClone &&
5250 "No available func clone for this callsite clone");
5251 AssignCallsiteCloneToFuncClone(
5252 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
5253 /*IsAlloc=*/AllocationCallToContextNodeMap.contains(Call));
5254 }
5255 }
5256 if (VerifyCCG) {
5257 checkNode<DerivedCCG, FuncTy, CallTy>(Node);
5258 for (const auto &PE : Node->CalleeEdges)
5259 checkEdge<DerivedCCG, FuncTy, CallTy>(PE);
5260 for (const auto &CE : Node->CallerEdges)
5261 checkEdge<DerivedCCG, FuncTy, CallTy>(CE);
5262 for (auto *Clone : Node->Clones) {
5263 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
5264 for (const auto &PE : Clone->CalleeEdges)
5265 checkEdge<DerivedCCG, FuncTy, CallTy>(PE);
5266 for (const auto &CE : Clone->CallerEdges)
5267 checkEdge<DerivedCCG, FuncTy, CallTy>(CE);
5268 }
5269 }
5270 }
5271
5272 if (FuncCloneInfos.size() < 2)
5273 continue;
5274
5275 // In this case there is more than just the original function copy.
5276 // Record call clones of any callsite nodes in the function that did not
5277 // themselves get cloned for all of the function clones.
5278 for (auto &Call : CallsWithMetadata) {
5279 ContextNode *Node = getNodeForInst(Call);
5280 if (!Node || !Node->hasCall() || Node->emptyContextIds())
5281 continue;
5282 // If Node already has enough clones to cover all function clones, we can
5283 // skip it (adding one to account for the original copy).
5284 // Use >= in case there were clones that were skipped due to having empty
5285 // context ids.
5286 if (Node->Clones.size() + 1 >= FuncCloneInfos.size())
5287 continue;
5288 // First collect all function clones we cloned this callsite node for.
5289 // They may not be sequential (e.g. due to clones with empty context ids).
5290 DenseSet<unsigned> NodeCallClones;
5291 for (auto *C : Node->Clones)
5292 NodeCallClones.insert(C->Call.cloneNo());
5293 unsigned I = 0;
5294 // Now check all the function clones.
5295 for (auto &FC : FuncCloneInfos) {
5296 // Function clones should be sequential.
5297 assert(FC.FuncClone.cloneNo() == I);
5298 // Skip the first clone which got the original call.
5299 // Also skip any other clones created for this Node.
5300 if (++I == 1 || NodeCallClones.contains(I)) {
5301 continue;
5302 }
5303 // Record the call clones created for this callsite in this function
5304 // clone.
5305 auto &CallVector = UnassignedCallClones[Node][I];
5306 DenseMap<CallInfo, CallInfo> &CallMap = FC.CallMap;
5307 if (auto It = CallMap.find(Call); It != CallMap.end()) {
5308 CallInfo CallClone = It->second;
5309 CallVector.push_back(CallClone);
5310 } else {
5311 // All but the original clone (skipped earlier) should have an entry
5312 // for all calls.
5313 assert(false && "Expected to find call in CallMap");
5314 }
5315 // Need to do the same for all matching calls.
5316 for (auto &MatchingCall : Node->MatchingCalls) {
5317 if (auto It = CallMap.find(MatchingCall); It != CallMap.end()) {
5318 CallInfo CallClone = It->second;
5319 CallVector.push_back(CallClone);
5320 } else {
5321 // All but the original clone (skipped earlier) should have an entry
5322 // for all calls.
5323 assert(false && "Expected to find call in CallMap");
5324 }
5325 }
5326 }
5327 }
5328 }
5329
5330 uint8_t BothTypes =
5331 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
5332
5333 auto UpdateCalls = [&](ContextNode *Node,
5334 DenseSet<const ContextNode *> &Visited,
5335 auto &&UpdateCalls) {
5336 auto Inserted = Visited.insert(Node);
5337 if (!Inserted.second)
5338 return;
5339
5340 for (auto *Clone : Node->Clones)
5341 UpdateCalls(Clone, Visited, UpdateCalls);
5342
5343 for (auto &Edge : Node->CallerEdges)
5344 UpdateCalls(Edge->Caller, Visited, UpdateCalls);
5345
5346 // Skip if either no call to update, or if we ended up with no context ids
5347 // (we moved all edges onto other clones).
5348 if (!Node->hasCall() || Node->emptyContextIds())
5349 return;
5350
5351 if (Node->IsAllocation) {
5352 auto AT = allocTypeToUse(Node->AllocTypes);
5353 // If the allocation type is ambiguous, and more aggressive hinting
5354 // has been enabled via the MinClonedColdBytePercent flag, see if this
5355 // allocation should be hinted cold anyway because its fraction of cold
5356 // bytes allocated is at least the given threshold.
5357 if (Node->AllocTypes == BothTypes && MinClonedColdBytePercent < 100 &&
5358 !ContextIdToContextSizeInfos.empty()) {
5359 uint64_t TotalCold = 0;
5360 uint64_t Total = 0;
5361 for (auto Id : Node->getContextIds()) {
5362 auto TypeI = ContextIdToAllocationType.find(Id);
5363 assert(TypeI != ContextIdToAllocationType.end());
5364 auto CSI = ContextIdToContextSizeInfos.find(Id);
5365 if (CSI != ContextIdToContextSizeInfos.end()) {
5366 for (auto &Info : CSI->second) {
5367 Total += Info.TotalSize;
5368 if (TypeI->second == AllocationType::Cold)
5369 TotalCold += Info.TotalSize;
5370 }
5371 }
5372 }
5373 if (TotalCold * 100 >= Total * MinClonedColdBytePercent)
5374 AT = AllocationType::Cold;
5375 }
5376 updateAllocationCall(Node->Call, AT);
5377 assert(Node->MatchingCalls.empty());
5378 return;
5379 }
5380
5381 if (!CallsiteToCalleeFuncCloneMap.count(Node))
5382 return;
5383
5384 auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
5385 updateCall(Node->Call, CalleeFunc);
5386 // Update all the matching calls as well.
5387 for (auto &Call : Node->MatchingCalls)
5388 updateCall(Call, CalleeFunc);
5389
5390 // Now update all calls recorded earlier that are still in function clones
5391 // which don't have a clone of this callsite node.
5392 if (!UnassignedCallClones.contains(Node))
5393 return;
5394 DenseSet<unsigned> NodeCallClones;
5395 for (auto *C : Node->Clones)
5396 NodeCallClones.insert(C->Call.cloneNo());
5397 // Note that we already confirmed Node is in this map a few lines above.
5398 auto &ClonedCalls = UnassignedCallClones[Node];
5399 for (auto &[CloneNo, CallVector] : ClonedCalls) {
5400 // Should start at 1 as we never create an entry for the original node.
5401 assert(CloneNo > 0);
5402 // If we subsequently created a clone, skip this one.
5403 if (NodeCallClones.contains(CloneNo))
5404 continue;
5405 // Use the original Node's CalleeFunc.
5406 for (auto &Call : CallVector)
5407 updateCall(Call, CalleeFunc);
5408 }
5409 };
5410
5411 // Performs DFS traversal starting from allocation nodes to update calls to
5412 // reflect cloning decisions recorded earlier. For regular LTO this will
5413 // update the actual calls in the IR to call the appropriate function clone
5414 // (and add attributes to allocation calls), whereas for ThinLTO the decisions
5415 // are recorded in the summary entries.
5416 DenseSet<const ContextNode *> Visited;
5417 for (auto &Entry : AllocationCallToContextNodeMap)
5418 UpdateCalls(Entry.second, Visited, UpdateCalls);
5419
5420 return Changed;
5421}
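As an aside to the ambiguous-allocation handling above, the following is a minimal standalone sketch (not the pass's own API; the names are invented for illustration) of the integer threshold test used when deciding whether to hint an ambiguous allocation cold. Cross-multiplying keeps the comparison exact and avoids floating point.

#include <cstdint>

// Hint cold when the cold fraction of allocated bytes meets the configured
// percentage, using exact integer arithmetic.
constexpr bool shouldHintCold(std::uint64_t ColdBytes, std::uint64_t TotalBytes,
                              unsigned MinColdBytePercent) {
  return ColdBytes * 100 >= TotalBytes * MinColdBytePercent;
}

static_assert(shouldHintCold(80, 100, 80), "80% cold meets an 80% threshold");
static_assert(!shouldHintCold(79, 100, 80), "79% cold does not");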
5422
5423// Compute a SHA1 hash of the callsite and alloc version information of clone I
5424// in the summary, to use in detection of duplicate clones.
5425uint64_t ComputeHash(const FunctionSummary *FS, unsigned I) {
5426 SHA1 Hasher;
5427 // Update hash with any callsites that call non-default (non-zero) callee
5428 // versions.
5429 for (auto &SN : FS->callsites()) {
5430 // In theory all callsites and allocs in this function should have the same
5431 // number of clone entries, but handle any discrepancies gracefully below
5432 // for NDEBUG builds.
5433 assert(
5434 SN.Clones.size() > I &&
5435 "Callsite summary has fewer entries than other summaries in function");
5436 if (SN.Clones.size() <= I || !SN.Clones[I])
5437 continue;
5438 uint8_t Data[sizeof(SN.Clones[I])];
5439 support::endian::write32le(Data, SN.Clones[I]);
5440 Hasher.update(Data);
5441 }
5442 // Update hash with any allocs that have non-default (non-None) hints.
5443 for (auto &AN : FS->allocs()) {
5444 // In theory all callsites and allocs in this function should have the same
5445 // number of clone entries, but handle any discrepancies gracefully below
5446 // for NDEBUG builds.
5447 assert(AN.Versions.size() > I &&
5448 "Alloc summary has fewer entries than other summaries in function");
5449 if (AN.Versions.size() <= I ||
5450 (AllocationType)AN.Versions[I] == AllocationType::None)
5451 continue;
5452 Hasher.update(ArrayRef<uint8_t>(&AN.Versions[I], 1));
5453 }
5454 return support::endian::read64le(Hasher.result().data());
5455}
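To make the duplicate-clone detection above concrete, here is a simplified illustration of the same idea under stated assumptions: hash only the non-default per-clone decisions (callee clone numbers and alloc hints) so that two clones which would behave identically hash the same. FNV-1a stands in for the SHA1 digest used by ComputeHash, and the record layout below is invented for this sketch rather than taken from the summary.

#include <cstdint>
#include <vector>

struct CloneDecisions {
  std::vector<std::uint32_t> CalleeCloneNos; // 0 == call the original callee
  std::vector<std::uint8_t> AllocHints;      // 0 == no hint assigned
};

inline std::uint64_t hashCloneDecisions(const CloneDecisions &D) {
  std::uint64_t H = 1469598103934665603ULL; // FNV-1a offset basis
  auto Mix = [&H](std::uint64_t V) {
    H ^= V;
    H *= 1099511628211ULL; // FNV-1a prime
  };
  for (std::uint32_t C : D.CalleeCloneNos)
    if (C) // skip default entries, as ComputeHash does for callee version 0
      Mix(C);
  for (std::uint8_t A : D.AllocHints)
    if (A) // skip un-hinted (None) allocation versions
      Mix(A);
  return H;
}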
5456
5457static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
5458 Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
5459 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
5460 &FuncToAliasMap,
5461 FunctionSummary *FS) {
5462 auto TakeDeclNameAndReplace = [](GlobalValue *DeclGV, GlobalValue *NewGV) {
5463 // We might have created this when adjusting a callsite in another
5464 // function. It should be a declaration.
5465 assert(DeclGV->isDeclaration());
5466 NewGV->takeName(DeclGV);
5467 DeclGV->replaceAllUsesWith(NewGV);
5468 DeclGV->eraseFromParent();
5469 };
5470
5471 // Handle aliases to this function, and create analogous alias clones to the
5472 // provided clone of this function.
5473 auto CloneFuncAliases = [&](Function *NewF, unsigned I) {
5474 if (!FuncToAliasMap.count(&F))
5475 return;
5476 for (auto *A : FuncToAliasMap[&F]) {
5477 std::string AliasName = getMemProfFuncName(A->getName(), I);
5478 auto *PrevA = M.getNamedAlias(AliasName);
5479 auto *NewA = GlobalAlias::create(A->getValueType(),
5480 A->getType()->getPointerAddressSpace(),
5481 A->getLinkage(), AliasName, NewF);
5482 NewA->copyAttributesFrom(A);
5483 if (PrevA)
5484 TakeDeclNameAndReplace(PrevA, NewA);
5485 }
5486 };
5487
5488 // The first "clone" is the original copy; we should only call this if we
5489 // needed to create new clones.
5490 assert(NumClones > 1);
5491 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
5492 VMaps.reserve(NumClones - 1);
5493 FunctionsClonedThinBackend++;
5494
5495 // Map of hash of callsite/alloc versions to the instantiated function clone
5496 // (possibly the original) implementing those calls. Used to avoid
5497 // instantiating duplicate function clones.
5498 // FIXME: Ideally the thin link would not generate such duplicate clones to
5499 // start with, but right now it happens due to phase ordering in the function
5500 // assignment and the possible new clones that it produces. We simply make each
5501 // duplicate an alias to the matching instantiated clone recorded in the map
5502 // (except for available_externally which are made declarations as they would
5503 // be aliases in the prevailing module, and available_externally aliases are
5504 // not well supported right now).
5505 DenseMap<uint64_t, Function *> HashToFunc;
5506
5507 // Save the hash of the original function version.
5508 HashToFunc[ComputeHash(FS, 0)] = &F;
5509
5510 for (unsigned I = 1; I < NumClones; I++) {
5511 VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
5512 std::string Name = getMemProfFuncName(F.getName(), I);
5513 auto Hash = ComputeHash(FS, I);
5514 // If this clone would duplicate a previously seen clone, don't generate the
5515 // duplicate clone body; just make an alias to satisfy any (potentially
5516 // cross-module) references.
5517 if (HashToFunc.contains(Hash)) {
5518 FunctionCloneDuplicatesThinBackend++;
5519 auto *Func = HashToFunc[Hash];
5520 if (Func->hasAvailableExternallyLinkage()) {
5521 // Skip these as EliminateAvailableExternallyPass does not handle
5522 // available_externally aliases correctly and we end up with an
5523 // available_externally alias to a declaration. Just create a
5524 // declaration for now as we know we will have a definition in another
5525 // module.
5526 auto Decl = M.getOrInsertFunction(Name, Func->getFunctionType());
5527 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
5528 << "created clone decl " << ore::NV("Decl", Decl.getCallee()));
5529 continue;
5530 }
5531 auto *PrevF = M.getFunction(Name);
5532 auto *Alias = GlobalAlias::create(Name, Func);
5533 if (PrevF)
5534 TakeDeclNameAndReplace(PrevF, Alias);
5535 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
5536 << "created clone alias " << ore::NV("Alias", Alias));
5537
5538 // Now handle aliases to this function, and clone those as well.
5539 CloneFuncAliases(Func, I);
5540 continue;
5541 }
5542 auto *NewF = CloneFunction(&F, *VMaps.back());
5543 HashToFunc[Hash] = NewF;
5544 FunctionClonesThinBackend++;
5545 // Strip memprof and callsite metadata from clone as they are no longer
5546 // needed.
5547 for (auto &BB : *NewF) {
5548 for (auto &Inst : BB) {
5549 Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
5550 Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
5551 }
5552 }
5553 auto *PrevF = M.getFunction(Name);
5554 if (PrevF)
5555 TakeDeclNameAndReplace(PrevF, NewF);
5556 else
5557 NewF->setName(Name);
5558 updateSubprogramLinkageName(NewF, Name);
5559 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
5560 << "created clone " << ore::NV("NewFunction", NewF));
5561
5562 // Now handle aliases to this function, and clone those as well.
5563 CloneFuncAliases(NewF, I);
5564 }
5565 return VMaps;
5566}
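The cloning above leans on a clone-naming convention so that references from other modules resolve whether a clone is emitted as a real body, an alias, or a declaration. Below is a rough sketch of that convention: clone 0 keeps the original name and clone N > 0 gets a numeric suffix. The ".memprof." literal mirrors the MemProfCloneSuffix used by getMemProfFuncName elsewhere in this file, but treat the exact spelling as an assumption of this sketch.

#include <string>

inline std::string memprofCloneName(const std::string &Base, unsigned CloneNo) {
  if (CloneNo == 0)
    return Base; // the original copy keeps its name
  return Base + ".memprof." + std::to_string(CloneNo);
}
// e.g. memprofCloneName("foo", 2) == "foo.memprof.2"; a duplicate clone's
// alias (or declaration) must carry the same name so cross-module callers
// still link.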
5567
5568// Locate the summary for F. This is complicated by the fact that it might
5569// have been internalized or promoted.
5570static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
5571 const ModuleSummaryIndex *ImportSummary,
5572 const Function *CallingFunc = nullptr) {
5573 // FIXME: Ideally we would retain the original GUID in some fashion on the
5574 // function (e.g. as metadata), but for now do our best to locate the
5575 // summary without that information.
5576 ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
5577 if (!TheFnVI)
5578 // See if theFn was internalized, by checking the index directly with the
5579 // original name (this avoids the name adjustment done by getGUID() for
5580 // internal symbols).
5581 TheFnVI = ImportSummary->getValueInfo(
5582 GlobalValue::getGUIDAssumingExternalLinkage(F.getName()));
5583 if (TheFnVI)
5584 return TheFnVI;
5585 // Now query with the original name before any promotion was performed.
5586 StringRef OrigName =
5587 ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
5588 // When this pass is enabled, we always add thinlto_src_file provenance
5589 // metadata to imported function definitions, which allows us to recreate the
5590 // original internal symbol's GUID.
5591 auto SrcFileMD = F.getMetadata("thinlto_src_file");
5592 // If this is a call to an imported/promoted local for which we didn't import
5593 // the definition, the metadata will not exist on the declaration. However,
5594 // since we are doing this early, before any inlining in the LTO backend, we
5595 // can simply look at the metadata on the calling function which must have
5596 // been from the same module if F was an internal symbol originally.
5597 if (!SrcFileMD && F.isDeclaration()) {
5598 // We would only call this for a declaration for a direct callsite, in which
5599 // case the caller would have provided the calling function pointer.
5600 assert(CallingFunc);
5601 SrcFileMD = CallingFunc->getMetadata("thinlto_src_file");
5602 // If this is a promoted local (OrigName != F.getName()), since this is a
5603 // declaration, it must be imported from a different module and therefore we
5604 // should always find the metadata on its calling function. Any call to a
5605 // promoted local that came from this module should still be a definition.
5606 assert(SrcFileMD || OrigName == F.getName());
5607 }
5608 StringRef SrcFile = M.getSourceFileName();
5609 if (SrcFileMD)
5610 SrcFile = dyn_cast<MDString>(SrcFileMD->getOperand(0))->getString();
5611 std::string OrigId = GlobalValue::getGlobalIdentifier(
5612 OrigName, GlobalValue::InternalLinkage, SrcFile);
5613 TheFnVI = ImportSummary->getValueInfo(
5614 GlobalValue::getGUIDAssumingExternalLinkage(OrigId));
5615 // Internal func in original module may have gotten a numbered suffix if we
5616 // imported an external function with the same name. This happens
5617 // automatically during IR linking for naming conflicts. It would have to
5618 // still be internal in that case (otherwise it would have been renamed on
5619 // promotion in which case we wouldn't have a naming conflict).
5620 if (!TheFnVI && OrigName == F.getName() && F.hasLocalLinkage() &&
5621 F.getName().contains('.')) {
5622 OrigName = F.getName().rsplit('.').first;
5623 OrigId = GlobalValue::getGlobalIdentifier(
5624 OrigName, GlobalValue::InternalLinkage, SrcFile);
5625 TheFnVI = ImportSummary->getValueInfo(
5626 GlobalValue::getGUIDAssumingExternalLinkage(OrigId));
5627 }
5628 // The only way we may not have a VI is if this is a declaration created for
5629 // an imported reference. For distributed ThinLTO we may not have a VI for
5630 // such declarations in the distributed summary.
5631 assert(TheFnVI || F.isDeclaration());
5632 return TheFnVI;
5633}
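The summary lookup above is essentially a cascade of fallbacks. The sketch below shows the shape of that cascade under stated assumptions: it tries the symbol's current name, then the identifier it would have carried as an internal symbol before promotion, then a suffix-stripped retry for IR-linking renames. The Lookup callback and the "<file>;<name>" identifier format are stand-ins for the real index/GUID APIs, not the pass's own interfaces.

#include <cstdint>
#include <functional>
#include <optional>
#include <string>

using Guid = std::uint64_t;

inline std::optional<Guid>
findSummaryGuid(const std::string &CurrentName, const std::string &OrigName,
                const std::string &SrcFile,
                const std::function<std::optional<Guid>(const std::string &)>
                    &Lookup) {
  // 1) The name as it appears now (possibly promoted/renamed).
  if (auto G = Lookup(CurrentName))
    return G;
  // 2) The original internal symbol's identifier, reconstructed from the
  //    source file recorded in provenance metadata.
  if (auto G = Lookup(SrcFile + ";" + OrigName))
    return G;
  // 3) IR linking may have appended a "." suffix on a naming conflict; strip
  //    it and retry.
  auto Dot = OrigName.rfind('.');
  if (Dot != std::string::npos)
    return Lookup(SrcFile + ";" + OrigName.substr(0, Dot));
  return std::nullopt;
}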
5634
5635bool MemProfContextDisambiguation::initializeIndirectCallPromotionInfo(
5636 Module &M) {
5637 ICallAnalysis = std::make_unique<ICallPromotionAnalysis>();
5638 Symtab = std::make_unique<InstrProfSymtab>();
5639 // Don't add canonical names, to avoid adding multiple functions to the
5640 // symtab when they share the same root name after "." suffixes are stripped.
5641 // If we pick the wrong one then this could lead to incorrect ICP and calling
5642 // a memprof clone that we don't actually create (resulting in linker unsats).
5643 // What this means is that the GUID of the function (or its PGOFuncName
5644 // metadata) *must* match that in the VP metadata to allow promotion.
5645 // In practice this should not be a limitation, since local functions should
5646 // have PGOFuncName metadata and global function names shouldn't need any
5647 // special handling (they should not get the ".llvm.*" suffix that the
5648 // canonicalization handling is attempting to strip).
5649 if (Error E = Symtab->create(M, /*InLTO=*/true, /*AddCanonical=*/false)) {
5650 std::string SymtabFailure = toString(std::move(E));
5651 M.getContext().emitError("Failed to create symtab: " + SymtabFailure);
5652 return false;
5653 }
5654 return true;
5655}
5656
5657#ifndef NDEBUG
5658// Sanity check that the MIB stack ids match between the summary and
5659// instruction metadata.
5660static void checkAllocContextIds(
5661 const AllocInfo &AllocNode, const MDNode *MemProfMD,
5662 const CallStack<MDNode, MDNode::op_iterator> &CallsiteContext,
5663 const ModuleSummaryIndex *ImportSummary) {
5664 auto MIBIter = AllocNode.MIBs.begin();
5665 for (auto &MDOp : MemProfMD->operands()) {
5666 assert(MIBIter != AllocNode.MIBs.end());
5667 auto StackIdIndexIter = MIBIter->StackIdIndices.begin();
5668 auto *MIBMD = cast<const MDNode>(MDOp);
5669 MDNode *StackMDNode = getMIBStackNode(MIBMD);
5670 assert(StackMDNode);
5671 CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
5672 auto ContextIterBegin =
5673 StackContext.beginAfterSharedPrefix(CallsiteContext);
5674 // Skip the checking on the first iteration.
5675 uint64_t LastStackContextId =
5676 (ContextIterBegin != StackContext.end() && *ContextIterBegin == 0) ? 1
5677 : 0;
5678 for (auto ContextIter = ContextIterBegin; ContextIter != StackContext.end();
5679 ++ContextIter) {
5680 // If this is a direct recursion, simply skip the duplicate
5681 // entries, to be consistent with how the summary ids were
5682 // generated during ModuleSummaryAnalysis.
5683 if (LastStackContextId == *ContextIter)
5684 continue;
5685 LastStackContextId = *ContextIter;
5686 assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
5687 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
5688 *ContextIter);
5689 StackIdIndexIter++;
5690 }
5691 MIBIter++;
5692 }
5693}
5694#endif
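The check above matches a metadata call stack against the summary's stack id indices while skipping direct-recursion duplicates, because the summary collapses immediate repeats of a frame. Here is a standalone sketch of that comparison with simplified types (and assuming 0 is not a valid frame id); it is an illustration of the matching rule, not the pass's own helper.

#include <cstddef>
#include <cstdint>
#include <vector>

inline bool contextMatchesSummary(const std::vector<std::uint64_t> &MetadataIds,
                                  const std::vector<std::uint64_t> &SummaryIds) {
  std::size_t S = 0;
  std::uint64_t Last = 0;
  for (std::uint64_t Id : MetadataIds) {
    if (Id == Last) // direct recursion: duplicate frame already accounted for
      continue;
    Last = Id;
    if (S >= SummaryIds.size() || SummaryIds[S] != Id)
      return false;
    ++S;
  }
  return S == SummaryIds.size();
}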
5695
5696bool MemProfContextDisambiguation::applyImport(Module &M) {
5697 assert(ImportSummary);
5698 bool Changed = false;
5699
5700 // We also need to clone any aliases that reference cloned functions, because
5701 // the modified callsites may invoke via the alias. Keep track of the aliases
5702 // for each function.
5703 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
5704 FuncToAliasMap;
5705 for (auto &A : M.aliases()) {
5706 auto *Aliasee = A.getAliaseeObject();
5707 if (auto *F = dyn_cast<Function>(Aliasee))
5708 FuncToAliasMap[F].insert(&A);
5709 }
5710
5711 if (!initializeIndirectCallPromotionInfo(M))
5712 return false;
5713
5714 for (auto &F : M) {
5715 if (F.isDeclaration() || isMemProfClone(F))
5716 continue;
5717
5718 OptimizationRemarkEmitter ORE(&F);
5719
5720 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
5721 bool ClonesCreated = false;
5722 unsigned NumClonesCreated = 0;
5723 auto CloneFuncIfNeeded = [&](unsigned NumClones, FunctionSummary *FS) {
5724 // We should at least have version 0 which is the original copy.
5725 assert(NumClones > 0);
5726 // If only one copy is needed, use the original.
5727 if (NumClones == 1)
5728 return;
5729 // If we already performed cloning of this function, confirm that the
5730 // requested number of clones matches (the thin link should ensure the
5731 // number of clones for each constituent callsite is consistent within
5732 // each function), before returning.
5733 if (ClonesCreated) {
5734 assert(NumClonesCreated == NumClones);
5735 return;
5736 }
5737 VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap, FS);
5738 // The first "clone" is the original copy, which doesn't have a VMap.
5739 assert(VMaps.size() == NumClones - 1);
5740 Changed = true;
5741 ClonesCreated = true;
5742 NumClonesCreated = NumClones;
5743 };
5744
5745 auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
5746 Function *CalledFunction, FunctionSummary *FS) {
5747 // Perform cloning if not yet done.
5748 CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size(), FS);
5749
5750 assert(!isMemProfClone(*CalledFunction));
5751
5752 // Because we update the cloned calls by calling setCalledOperand (see
5753 // comment below), out of an abundance of caution make sure the called
5754 // function was actually the called operand (or its aliasee). We also
5755 // strip pointer casts when looking for calls (to match behavior during
5756 // summary generation), however, with opaque pointers in theory this
5757 // should not be an issue. Note we still clone the current function
5758 // (containing this call) above, as that could be needed for its callers.
5759 auto *GA = dyn_cast_or_null<GlobalAlias>(CB->getCalledOperand());
5760 if (CalledFunction != CB->getCalledOperand() &&
5761 (!GA || CalledFunction != GA->getAliaseeObject())) {
5762 SkippedCallsCloning++;
5763 return;
5764 }
5765 // Update the calls per the summary info.
5766 // Save orig name since it gets updated in the first iteration
5767 // below.
5768 auto CalleeOrigName = CalledFunction->getName();
5769 for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
5770 // If the VMap is empty, this clone was a duplicate of another and was
5771 // created as an alias or a declaration.
5772 if (J > 0 && VMaps[J - 1]->empty())
5773 continue;
5774 // Do nothing if this version calls the original version of its
5775 // callee.
5776 if (!StackNode.Clones[J])
5777 continue;
5778 auto NewF = M.getOrInsertFunction(
5779 getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
5780 CalledFunction->getFunctionType());
5781 CallBase *CBClone;
5782 // Copy 0 is the original function.
5783 if (!J)
5784 CBClone = CB;
5785 else
5786 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
5787 // Set the called operand directly instead of calling setCalledFunction,
5788 // as the latter mutates the function type on the call. In rare cases
5789 // we may have a slightly different type on a callee function
5790 // declaration due to it being imported from a different module with
5791 // incomplete types. We really just want to change the name of the
5792 // function to the clone, and not make any type changes.
5793 CBClone->setCalledOperand(NewF.getCallee());
5794 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
5795 << ore::NV("Call", CBClone) << " in clone "
5796 << ore::NV("Caller", CBClone->getFunction())
5797 << " assigned to call function clone "
5798 << ore::NV("Callee", NewF.getCallee()));
5799 }
5800 };
5801
5802 // Locate the summary for F.
5803 ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
5804 // If not found, this could be an imported local (see comment in
5805 // findValueInfoForFunc). Skip for now as it will be cloned in its original
5806 // module (where it would have been promoted to global scope so should
5807 // satisfy any reference in this module).
5808 if (!TheFnVI)
5809 continue;
5810
5811 auto *GVSummary =
5812 ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
5813 if (!GVSummary) {
5814 // Must have been imported; use the summary which matches the definition
5815 // (there might be multiple if this was a linkonce_odr).
5816 auto SrcModuleMD = F.getMetadata("thinlto_src_module");
5817 assert(SrcModuleMD &&
5818 "enable-import-metadata is needed to emit thinlto_src_module");
5819 StringRef SrcModule =
5820 dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString();
5821 for (auto &GVS : TheFnVI.getSummaryList()) {
5822 if (GVS->modulePath() == SrcModule) {
5823 GVSummary = GVS.get();
5824 break;
5825 }
5826 }
5827 assert(GVSummary && GVSummary->modulePath() == SrcModule);
5828 }
5829
5830 // If this was an imported alias skip it as we won't have the function
5831 // summary, and it should be cloned in the original module.
5832 if (isa<AliasSummary>(GVSummary))
5833 continue;
5834
5835 auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
5836
5837 if (FS->allocs().empty() && FS->callsites().empty())
5838 continue;
5839
5840 auto SI = FS->callsites().begin();
5841 auto AI = FS->allocs().begin();
5842
5843 // To handle callsite infos synthesized for tail calls which have missing
5844 // frames in the profiled context, map callee VI to the synthesized callsite
5845 // info.
5846 DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite;
5847 // Iterate the callsites for this function in reverse, since we place all
5848 // those synthesized for tail calls at the end.
5849 for (auto CallsiteIt = FS->callsites().rbegin();
5850 CallsiteIt != FS->callsites().rend(); CallsiteIt++) {
5851 auto &Callsite = *CallsiteIt;
5852 // Stop as soon as we see a non-synthesized callsite info (see comment
5853 // above loop). All the entries added for discovered tail calls have empty
5854 // stack ids.
5855 if (!Callsite.StackIdIndices.empty())
5856 break;
5857 MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
5858 }
5859
5860 // Keeps track of needed ICP for the function.
5861 SmallVector<ICallAnalysisData> ICallAnalysisInfo;
5862
5863 // Assume for now that the instructions are in the exact same order
5864 // as when the summary was created, but confirm this is correct by
5865 // matching the stack ids.
5866 for (auto &BB : F) {
5867 for (auto &I : BB) {
5868 auto *CB = dyn_cast<CallBase>(&I);
5869 // Same handling as when creating module summary.
5870 if (!mayHaveMemprofSummary(CB))
5871 continue;
5872
5873 auto *CalledValue = CB->getCalledOperand();
5874 auto *CalledFunction = CB->getCalledFunction();
5875 if (CalledValue && !CalledFunction) {
5876 CalledValue = CalledValue->stripPointerCasts();
5877 // Stripping pointer casts can reveal a called function.
5878 CalledFunction = dyn_cast<Function>(CalledValue);
5879 }
5880 // Check if this is an alias to a function. If so, get the
5881 // called aliasee for the checks below.
5882 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
5883 assert(!CalledFunction &&
5884 "Expected null called function in callsite for alias");
5885 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
5886 }
5887
5888 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
5889 I.getMetadata(LLVMContext::MD_callsite));
5890 auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
5891
5892 // Include allocs that were already assigned a memprof function
5893 // attribute in the statistics. Only do this for those that do not have
5894 // memprof metadata, since we add an "ambiguous" memprof attribute by
5895 // default.
5896 if (CB->getAttributes().hasFnAttr("memprof") && !MemProfMD) {
5897 CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
5898 ? AllocTypeColdThinBackend++
5899 : AllocTypeNotColdThinBackend++;
5900 OrigAllocsThinBackend++;
5901 AllocVersionsThinBackend++;
5902 if (!MaxAllocVersionsThinBackend)
5903 MaxAllocVersionsThinBackend = 1;
5904 continue;
5905 }
5906
5907 if (MemProfMD) {
5908 // Consult the next alloc node.
5909 assert(AI != FS->allocs().end());
5910 auto &AllocNode = *(AI++);
5911
5912#ifndef NDEBUG
5913 checkAllocContextIds(AllocNode, MemProfMD, CallsiteContext,
5914 ImportSummary);
5915#endif
5916
5917 // Perform cloning if not yet done.
5918 CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size(), FS);
5919
5920 OrigAllocsThinBackend++;
5921 AllocVersionsThinBackend += AllocNode.Versions.size();
5922 if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
5923 MaxAllocVersionsThinBackend = AllocNode.Versions.size();
5924
5925 // If there is only one version, that means we didn't end up considering
5926 // this function for cloning, and in that case the alloc will still be of
5927 // None type or should have gotten the default NotCold. Skip it, but only
5928 // after calling the clone helper, since that does some sanity checks
5929 // confirming that we haven't already decided we need cloning. We might
5930 // have a single version that is cold due to the
5931 // MinClonedColdBytePercent heuristic, so make sure we don't skip in that
5932 // case.
5933 if (AllocNode.Versions.size() == 1 &&
5934 (AllocationType)AllocNode.Versions[0] != AllocationType::Cold) {
5935 assert((AllocationType)AllocNode.Versions[0] ==
5936 AllocationType::NotCold ||
5937 (AllocationType)AllocNode.Versions[0] ==
5938 AllocationType::None);
5939 UnclonableAllocsThinBackend++;
5940 continue;
5941 }
5942
5943 // All versions should have a singular allocation type.
5944 assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
5945 return Type == ((uint8_t)AllocationType::NotCold |
5946 (uint8_t)AllocationType::Cold);
5947 }));
5948
5949 // Update the allocation types per the summary info.
5950 for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
5951 // If the VMap is empty, this clone was a duplicate of another and
5952 // was created as an alias or a declaration.
5953 if (J > 0 && VMaps[J - 1]->empty())
5954 continue;
5955 // Ignore any that didn't get an assigned allocation type.
5956 if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
5957 continue;
5958 AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
5959 AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
5960 : AllocTypeNotColdThinBackend++;
5961 std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
5962 auto A = llvm::Attribute::get(F.getContext(), "memprof",
5963 AllocTypeString);
5964 CallBase *CBClone;
5965 // Copy 0 is the original function.
5966 if (!J)
5967 CBClone = CB;
5968 else
5969 // Since VMaps are only created for new clones, we index with
5970 // clone J-1 (J==0 is the original clone and does not have a VMaps
5971 // entry).
5972 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
5974 CBClone->addFnAttr(A);
5975 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
5976 << ore::NV("AllocationCall", CBClone) << " in clone "
5977 << ore::NV("Caller", CBClone->getFunction())
5978 << " marked with memprof allocation attribute "
5979 << ore::NV("Attribute", AllocTypeString));
5980 }
5981 } else if (!CallsiteContext.empty()) {
5982 if (!CalledFunction) {
5983#ifndef NDEBUG
5984 // We should have skipped inline assembly calls.
5985 auto *CI = dyn_cast<CallInst>(CB);
5986 assert(!CI || !CI->isInlineAsm());
5987#endif
5988 // We should have skipped direct calls via a Constant.
5989 assert(CalledValue && !isa<Constant>(CalledValue));
5990
5991 // This is an indirect call, see if we have profile information and
5992 // whether any clones were recorded for the profiled targets (that
5993 // we synthesized CallsiteInfo summary records for when building the
5994 // index).
5995 auto NumClones =
5996 recordICPInfo(CB, FS->callsites(), SI, ICallAnalysisInfo);
5997
5998 // Perform cloning if not yet done. This is done here in case
5999 // we don't need to do ICP, but might need to clone this
6000 // function as it is the target of other cloned calls.
6001 if (NumClones)
6002 CloneFuncIfNeeded(NumClones, FS);
6003 }
6004
6005 else {
6006 // Consult the next callsite node.
6007 assert(SI != FS->callsites().end());
6008 auto &StackNode = *(SI++);
6009
6010#ifndef NDEBUG
6011 // Sanity check that the stack ids match between the summary and
6012 // instruction metadata.
6013 auto StackIdIndexIter = StackNode.StackIdIndices.begin();
6014 for (auto StackId : CallsiteContext) {
6015 assert(StackIdIndexIter != StackNode.StackIdIndices.end());
6016 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
6017 StackId);
6018 StackIdIndexIter++;
6019 }
6020#endif
6021
6022 CloneCallsite(StackNode, CB, CalledFunction, FS);
6023 }
6024 } else if (CB->isTailCall() && CalledFunction) {
6025 // Locate the synthesized callsite info for the callee VI, if any was
6026 // created, and use that for cloning.
6027 ValueInfo CalleeVI =
6028 findValueInfoForFunc(*CalledFunction, M, ImportSummary, &F);
6029 if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
6030 auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
6031 assert(Callsite != MapTailCallCalleeVIToCallsite.end());
6032 CloneCallsite(Callsite->second, CB, CalledFunction, FS);
6033 }
6034 }
6035 }
6036 }
6037
6038 // Now do any promotion required for cloning.
6039 performICP(M, FS->callsites(), VMaps, ICallAnalysisInfo, ORE);
6040 }
6041
6042 // We skip some of the functions and instructions above, so remove all the
6043 // metadata in a single sweep here.
6044 for (auto &F : M) {
6045 // We can skip memprof clones because createFunctionClones already strips
6046 // the metadata from the newly created clones.
6047 if (F.isDeclaration() || isMemProfClone(F))
6048 continue;
6049 for (auto &BB : F) {
6050 for (auto &I : BB) {
6051 if (!isa<CallBase>(I))
6052 continue;
6053 I.setMetadata(LLVMContext::MD_memprof, nullptr);
6054 I.setMetadata(LLVMContext::MD_callsite, nullptr);
6055 }
6056 }
6057 }
6058
6059 return Changed;
6060}
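applyImport above repeatedly uses the same per-version iteration convention: version 0 is the original instruction, and version J > 0 is found through the (J-1)'th value map, where an empty map marks a clone that was emitted only as an alias or declaration. The sketch below restates that convention generically; Inst and MapTy are placeholders for this sketch rather than the pass's real types.

#include <cassert>
#include <vector>

template <typename Inst, typename MapTy>
Inst *instructionForVersion(Inst *Original, const std::vector<MapTy> &VMaps,
                            unsigned J) {
  if (J == 0)
    return Original; // the original function body
  const MapTy &Map = VMaps[J - 1];
  if (Map.empty())
    return nullptr; // duplicate clone: alias/declaration only, nothing to touch
  auto It = Map.find(Original);
  assert(It != Map.end() && "cloned instruction should be mapped");
  return It->second;
}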
6061
6062unsigned MemProfContextDisambiguation::recordICPInfo(
6063 CallBase *CB, ArrayRef<CallsiteInfo> AllCallsites,
6064 ArrayRef<CallsiteInfo>::iterator &SI,
6065 SmallVector<ICallAnalysisData> &ICallAnalysisInfo) {
6066 // First see if we have profile information for this indirect call.
6067 uint32_t NumCandidates;
6068 uint64_t TotalCount;
6069 auto CandidateProfileData =
6070 ICallAnalysis->getPromotionCandidatesForInstruction(CB, TotalCount,
6071 NumCandidates);
6072 if (CandidateProfileData.empty())
6073 return 0;
6074
6075 // Iterate through all of the candidate profiled targets along with the
6076 // CallsiteInfo summary records synthesized for them when building the index,
6077 // and see if any are cloned and/or refer to clones.
6078 bool ICPNeeded = false;
6079 unsigned NumClones = 0;
6080 size_t CallsiteInfoStartIndex = std::distance(AllCallsites.begin(), SI);
6081 for (const auto &Candidate : CandidateProfileData) {
6082#ifndef NDEBUG
6083 auto CalleeValueInfo =
6084#endif
6085 ImportSummary->getValueInfo(Candidate.Value);
6086 // We might not have a ValueInfo if this is a distributed
6087 // ThinLTO backend and decided not to import that function.
6088 assert(!CalleeValueInfo || SI->Callee == CalleeValueInfo);
6089 assert(SI != AllCallsites.end());
6090 auto &StackNode = *(SI++);
6091 // See if any of the clones of the indirect callsite for this
6092 // profiled target should call a cloned version of the profiled
6093 // target. We only need to do the ICP here if so.
6094 ICPNeeded |= llvm::any_of(StackNode.Clones,
6095 [](unsigned CloneNo) { return CloneNo != 0; });
6096 // Every callsite in the same function should have been cloned the same
6097 // number of times.
6098 assert(!NumClones || NumClones == StackNode.Clones.size());
6099 NumClones = StackNode.Clones.size();
6100 }
6101 if (!ICPNeeded)
6102 return NumClones;
6103 // Save information for ICP, which is performed later to avoid messing up the
6104 // current function traversal.
6105 ICallAnalysisInfo.push_back({CB, CandidateProfileData.vec(), NumCandidates,
6106 TotalCount, CallsiteInfoStartIndex});
6107 return NumClones;
6108}
6109
6110void MemProfContextDisambiguation::performICP(
6111 Module &M, ArrayRef<CallsiteInfo> AllCallsites,
6112 ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
6113 ArrayRef<ICallAnalysisData> ICallAnalysisInfo,
6114 OptimizationRemarkEmitter &ORE) {
6115 // Now do any promotion required for cloning. Specifically, for each
6116 // recorded ICP candidate (which was only recorded because one clone of that
6117 // candidate should call a cloned target), we perform ICP (speculative
6118 // devirtualization) for each clone of the callsite, and update its callee
6119 // to the appropriate clone. Note that the ICP compares against the original
6120 // version of the target, which is what is in the vtable.
6121 for (auto &Info : ICallAnalysisInfo) {
6122 auto *CB = Info.CB;
6123 auto CallsiteIndex = Info.CallsiteInfoStartIndex;
6124 auto TotalCount = Info.TotalCount;
6125 unsigned NumPromoted = 0;
6126 unsigned NumClones = 0;
6127
6128 for (auto &Candidate : Info.CandidateProfileData) {
6129 auto &StackNode = AllCallsites[CallsiteIndex++];
6130
6131 // All calls in the same function must have the same number of clones.
6132 assert(!NumClones || NumClones == StackNode.Clones.size());
6133 NumClones = StackNode.Clones.size();
6134
6135 // See if the target is in the module. If it wasn't imported, it is
6136 // possible that this profile could have been collected on a different
6137 // target (or version of the code), and we need to be conservative
6138 // (similar to what is done in the ICP pass).
6139 Function *TargetFunction = Symtab->getFunction(Candidate.Value);
6140 if (TargetFunction == nullptr ||
6141 // Any ThinLTO global dead symbol removal should have already
6142 // occurred, so it should be safe to promote when the target is a
6143 // declaration.
6144 // TODO: Remove internal option once more fully tested.
6145 (MemProfRequireDefinitionForPromotion &&
6146 TargetFunction->isDeclaration())) {
6147 ORE.emit([&]() {
6148 return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", CB)
6149 << "Memprof cannot promote indirect call: target with md5sum "
6150 << ore::NV("target md5sum", Candidate.Value) << " not found";
6151 });
6152 // FIXME: See if we can use the new declaration importing support to
6153 // at least get the declarations imported for this case. Hot indirect
6154 // targets should have been imported normally, however.
6155 continue;
6156 }
6157
6158 // Check if legal to promote
6159 const char *Reason = nullptr;
6160 if (!isLegalToPromote(*CB, TargetFunction, &Reason)) {
6161 ORE.emit([&]() {
6162 return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", CB)
6163 << "Memprof cannot promote indirect call to "
6164 << ore::NV("TargetFunction", TargetFunction)
6165 << " with count of " << ore::NV("TotalCount", TotalCount)
6166 << ": " << Reason;
6167 });
6168 continue;
6169 }
6170
6171 assert(!isMemProfClone(*TargetFunction));
6172
6173 // Handle each call clone, applying ICP so that each clone directly
6174 // calls the specified callee clone, guarded by the appropriate ICP
6175 // check.
6176 CallBase *CBClone = CB;
6177 for (unsigned J = 0; J < NumClones; J++) {
6178 // If the VMap is empty, this clone was a duplicate of another and was
6179 // created as an alias or a declaration.
6180 if (J > 0 && VMaps[J - 1]->empty())
6181 continue;
6182 // Copy 0 is the original function.
6183 if (J > 0)
6184 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
6185 // We do the promotion using the original name, so that the comparison
6186 // is against the name in the vtable. Then just below, change the new
6187 // direct call to call the cloned function.
6188 auto &DirectCall =
6189 pgo::promoteIndirectCall(*CBClone, TargetFunction, Candidate.Count,
6190 TotalCount, isSamplePGO, &ORE);
6191 auto *TargetToUse = TargetFunction;
6192 // Call original if this version calls the original version of its
6193 // callee.
6194 if (StackNode.Clones[J]) {
6195 TargetToUse =
6196 cast<Function>(M.getOrInsertFunction(
6197 getMemProfFuncName(TargetFunction->getName(),
6198 StackNode.Clones[J]),
6199 TargetFunction->getFunctionType())
6200 .getCallee());
6201 }
6202 DirectCall.setCalledFunction(TargetToUse);
6203 // During matching we generate synthetic VP metadata for indirect calls
6204 // not already having any, from the memprof profile's callee GUIDs. If
6205 // we subsequently promote and inline those callees, we currently lose
6206 // the ability to generate this synthetic VP metadata. Optionally apply
6207 // a noinline attribute to promoted direct calls, where the threshold is
6208 // set to capture synthetic VP metadata targets which get a count of 1.
6209 if (MemProfICPNoInlineThreshold &&
6210 Candidate.Count < MemProfICPNoInlineThreshold)
6211 DirectCall.setIsNoInline();
6212 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
6213 << ore::NV("Call", CBClone) << " in clone "
6214 << ore::NV("Caller", CBClone->getFunction())
6215 << " promoted and assigned to call function clone "
6216 << ore::NV("Callee", TargetToUse));
6217 }
6218
6219 // Update TotalCount (all clones should get same count above)
6220 TotalCount -= Candidate.Count;
6221 NumPromoted++;
6222 }
6223 // Adjust the MD.prof metadata for all clones, now that we have the new
6224 // TotalCount and the number promoted.
6225 CallBase *CBClone = CB;
6226 for (unsigned J = 0; J < NumClones; J++) {
6227 // If the VMap is empty, this clone was a duplicate of another and was
6228 // created as an alias or a declaration.
6229 if (J > 0 && VMaps[J - 1]->empty())
6230 continue;
6231 // Copy 0 is the original function.
6232 if (J > 0)
6233 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
6234 // First delete the old one.
6235 CBClone->setMetadata(LLVMContext::MD_prof, nullptr);
6236 // If all were promoted, we don't need the MD.prof metadata.
6237 // Otherwise we need to update it with the remaining un-promoted records.
6238 if (TotalCount != 0)
6239 annotateValueSite(
6240 M, *CBClone, ArrayRef(Info.CandidateProfileData).slice(NumPromoted),
6241 TotalCount, IPVK_IndirectCallTarget, Info.NumCandidates);
6242 }
6243 }
6244}
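The value-profile bookkeeping at the end of performICP follows a simple rule: each promoted candidate's count is subtracted from the running total, and only if some count remains do the un-promoted targets need to be written back as profile metadata. The standalone sketch below restates that arithmetic; the Candidate struct is invented for the sketch and is not the pass's data type.

#include <cstdint>
#include <vector>

struct Candidate {
  std::uint64_t Target;
  std::uint64_t Count;
};

inline std::uint64_t
remainingCountAfterPromotion(const std::vector<Candidate> &Candidates,
                             std::uint64_t TotalCount, unsigned NumPromoted) {
  for (unsigned I = 0; I < NumPromoted && I < Candidates.size(); ++I)
    TotalCount -= Candidates[I].Count;
  // Zero means every profiled target was promoted and no residual value
  // profile is needed; otherwise re-annotate with the candidates that follow
  // the first NumPromoted entries.
  return TotalCount;
}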
6245
6246template <typename DerivedCCG, typename FuncTy, typename CallTy>
6247bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
6248 if (DumpCCG) {
6249 dbgs() << "CCG before cloning:\n";
6250 dbgs() << *this;
6251 }
6252 if (ExportToDot)
6253 exportToDot("postbuild");
6254
6255 if (VerifyCCG) {
6256 check();
6257 }
6258
6259 identifyClones();
6260
6261 if (VerifyCCG) {
6262 check();
6263 }
6264
6265 if (DumpCCG) {
6266 dbgs() << "CCG after cloning:\n";
6267 dbgs() << *this;
6268 }
6269 if (ExportToDot)
6270 exportToDot("cloned");
6271
6272 bool Changed = assignFunctions();
6273
6274 if (DumpCCG) {
6275 dbgs() << "CCG after assigning function clones:\n";
6276 dbgs() << *this;
6277 }
6278 if (ExportToDot)
6279 exportToDot("clonefuncassign");
6280
6281 if (MemProfReportHintedSizes)
6282 printTotalSizes(errs());
6283
6284 return Changed;
6285}
6286
6287bool MemProfContextDisambiguation::processModule(
6288 Module &M,
6289 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
6290
6291 // If we have an import summary, then the cloning decisions were made during
6292 // the thin link on the index. Apply them and return.
6293 if (ImportSummary)
6294 return applyImport(M);
6295
6296 // TODO: If/when other types of memprof cloning are enabled beyond just for
6297 // hot and cold, we will need to change this to individually control the
6298 // AllocationType passed to addStackNodesForMIB during CCG construction.
6299 // Note that we specifically check this after applying imports above, so that
6300 // the option doesn't need to be passed to distributed ThinLTO backend
6301 // clang processes, which won't necessarily have visibility into the linker
6302 // dependencies. Instead the information is communicated from the LTO link to
6303 // the backends via the combined summary index.
6304 if (!SupportsHotColdNew)
6305 return false;
6306
6307 ModuleCallsiteContextGraph CCG(M, OREGetter);
6308 return CCG.process();
6309}
6310
6311MemProfContextDisambiguation::MemProfContextDisambiguation(
6312 const ModuleSummaryIndex *Summary, bool isSamplePGO)
6313 : ImportSummary(Summary), isSamplePGO(isSamplePGO) {
6314 // Check the dot graph printing options once here, to make sure we have valid
6315 // and expected combinations.
6316 if (DotGraphScope == DotScope::Alloc && !AllocIdForDot.getNumOccurrences())
6317 llvm::report_fatal_error(
6318 "-memprof-dot-scope=alloc requires -memprof-dot-alloc-id");
6319 if (DotGraphScope == DotScope::Context &&
6320 !ContextIdForDot.getNumOccurrences())
6321 llvm::report_fatal_error(
6322 "-memprof-dot-scope=context requires -memprof-dot-context-id");
6323 if (DotGraphScope == DotScope::All && AllocIdForDot.getNumOccurrences() &&
6324 ContextIdForDot.getNumOccurrences())
6325 llvm::report_fatal_error(
6326 "-memprof-dot-scope=all can't have both -memprof-dot-alloc-id and "
6327 "-memprof-dot-context-id");
6328 if (ImportSummary) {
6329 // The MemProfImportSummary should only be used for testing ThinLTO
6330 // distributed backend handling via opt, in which case we don't have a
6331 // summary from the pass pipeline.
6332 assert(MemProfImportSummary.empty());
6333 return;
6334 }
6335 if (MemProfImportSummary.empty())
6336 return;
6337
6338 auto ReadSummaryFile =
6339 errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
6340 if (!ReadSummaryFile) {
6341 logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
6342 "Error loading file '" + MemProfImportSummary +
6343 "': ");
6344 return;
6345 }
6346 auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
6347 if (!ImportSummaryForTestingOrErr) {
6348 logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
6349 "Error parsing file '" + MemProfImportSummary +
6350 "': ");
6351 return;
6352 }
6353 ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
6354 ImportSummary = ImportSummaryForTesting.get();
6355}
6356
6357PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
6358 ModuleAnalysisManager &AM) {
6359 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
6360 auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
6361 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
6362 };
6363 if (!processModule(M, OREGetter))
6364 return PreservedAnalyses::all();
6365 return PreservedAnalyses::none();
6366}
6367
6368void MemProfContextDisambiguation::run(
6369 ModuleSummaryIndex &Index,
6370 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
6371 isPrevailing) {
6372 // TODO: If/when other types of memprof cloning are enabled beyond just for
6373 // hot and cold, we will need to change this to individually control the
6374 // AllocationType passed to addStackNodesForMIB during CCG construction.
6375 // The index was set from the option, so these should be in sync.
6376 assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
6377 if (!SupportsHotColdNew)
6378 return;
6379
6380 IndexCallsiteContextGraph CCG(Index, isPrevailing);
6381 CCG.process();
6382}
6383
6384// Strips MemProf attributes and metadata. Can be invoked by the pass pipeline
6385// when we don't have an index that has recorded that we are linking with
6386// allocation libraries containing the necessary APIs for downstream
6387// transformations.
6389 // The profile matcher applies hotness attributes directly for allocations,
6390 // and those will cause us to generate calls to the hot/cold interfaces
6391 // unconditionally. If supports-hot-cold-new was not enabled in the LTO
6392 // link then assume we don't want these calls (e.g. not linking with
6393 // the appropriate library, or otherwise trying to disable this behavior).
6394 bool Changed = false;
6395 for (auto &F : M) {
6396 for (auto &BB : F) {
6397 for (auto &I : BB) {
6398 auto *CI = dyn_cast<CallBase>(&I);
6399 if (!CI)
6400 continue;
6401 if (CI->hasFnAttr("memprof")) {
6402 CI->removeFnAttr("memprof");
6403 Changed = true;
6404 }
6405 if (!CI->hasMetadata(LLVMContext::MD_callsite)) {
6406 assert(!CI->hasMetadata(LLVMContext::MD_memprof));
6407 continue;
6408 }
6409 // Strip off all memprof metadata as it is no longer needed.
6410 // Importantly, this avoids the addition of new memprof attributes
6411 // after inlining propagation.
6412 CI->setMetadata(LLVMContext::MD_memprof, nullptr);
6413 CI->setMetadata(LLVMContext::MD_callsite, nullptr);
6414 Changed = true;
6415 }
6416 }
6417 }
6418 if (!Changed)
6419 return PreservedAnalyses::all();
6420 return PreservedAnalyses::none();
6421}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
aarch64 promote const
AMDGPU Prepare AGPR Alloc
Unify divergent function exit nodes
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
Definition CSEInfo.cpp:27
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
#define DEBUG_TYPE
Module.h This file contains the declarations for the Module class.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define G(x, y, z)
Definition MD5.cpp:55
Machine Check Debug Module
This file implements a map that provides insertion order iteration.
static cl::opt< unsigned > TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5), cl::Hidden, cl::desc("Max depth to recursively search for missing " "frames through tail calls."))
uint64_t ComputeHash(const FunctionSummary *FS, unsigned I)
static cl::opt< DotScope > DotGraphScope("memprof-dot-scope", cl::desc("Scope of graph to export to dot"), cl::Hidden, cl::init(DotScope::All), cl::values(clEnumValN(DotScope::All, "all", "Export full callsite graph"), clEnumValN(DotScope::Alloc, "alloc", "Export only nodes with contexts feeding given " "-memprof-dot-alloc-id"), clEnumValN(DotScope::Context, "context", "Export only nodes with given -memprof-dot-context-id")))
static cl::opt< bool > DoMergeIteration("memprof-merge-iteration", cl::init(true), cl::Hidden, cl::desc("Iteratively apply merging on a node to catch new callers"))
static bool isMemProfClone(const Function &F)
static cl::opt< unsigned > AllocIdForDot("memprof-dot-alloc-id", cl::init(0), cl::Hidden, cl::desc("Id of alloc to export if -memprof-dot-scope=alloc " "or to highlight if -memprof-dot-scope=all"))
static cl::opt< unsigned > ContextIdForDot("memprof-dot-context-id", cl::init(0), cl::Hidden, cl::desc("Id of context to export if -memprof-dot-scope=context or to " "highlight otherwise"))
static cl::opt< bool > ExportToDot("memprof-export-to-dot", cl::init(false), cl::Hidden, cl::desc("Export graph to dot files."))
static void checkEdge(const std::shared_ptr< ContextEdge< DerivedCCG, FuncTy, CallTy > > &Edge)
static cl::opt< bool > AllowRecursiveCallsites("memprof-allow-recursive-callsites", cl::init(true), cl::Hidden, cl::desc("Allow cloning of callsites involved in recursive cycles"))
bool checkColdOrNotCold(uint8_t AllocType)
static ValueInfo findValueInfoForFunc(const Function &F, const Module &M, const ModuleSummaryIndex *ImportSummary, const Function *CallingFunc=nullptr)
static cl::opt< bool > CloneRecursiveContexts("memprof-clone-recursive-contexts", cl::init(true), cl::Hidden, cl::desc("Allow cloning of contexts through recursive cycles"))
static std::string getAllocTypeString(uint8_t AllocTypes)
static cl::opt< unsigned > MemProfICPNoInlineThreshold("memprof-icp-noinline-threshold", cl::init(2), cl::Hidden, cl::desc("Minimum absolute count for promoted target to be inlinable"))
bool DOTGraphTraits< constCallsiteContextGraph< DerivedCCG, FuncTy, CallTy > * >::DoHighlight
static unsigned getMemProfCloneNum(const Function &F)
static SmallVector< std::unique_ptr< ValueToValueMapTy >, 4 > createFunctionClones(Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE, std::map< const Function *, SmallPtrSet< const GlobalAlias *, 1 > > &FuncToAliasMap, FunctionSummary *FS)
static cl::opt< bool > VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden, cl::desc("Perform verification checks on CallingContextGraph."))
static void checkNode(const ContextNode< DerivedCCG, FuncTy, CallTy > *Node, bool CheckEdges=true)
static cl::opt< bool > MergeClones("memprof-merge-clones", cl::init(true), cl::Hidden, cl::desc("Merge clones before assigning functions"))
static std::string getMemProfFuncName(Twine Base, unsigned CloneNo)
static cl::opt< std::string > MemProfImportSummary("memprof-import-summary", cl::desc("Import summary to use for testing the ThinLTO backend via opt"), cl::Hidden)
static const std::string MemProfCloneSuffix
static void updateSubprogramLinkageName(Function *NewFunc, StringRef Name)
static cl::opt< bool > AllowRecursiveContexts("memprof-allow-recursive-contexts", cl::init(true), cl::Hidden, cl::desc("Allow cloning of contexts having recursive cycles"))
static cl::opt< std::string > DotFilePathPrefix("memprof-dot-file-path-prefix", cl::init(""), cl::Hidden, cl::value_desc("filename"), cl::desc("Specify the path prefix of the MemProf dot files."))
static cl::opt< bool > VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden, cl::desc("Perform frequent verification checks on nodes."))
static void checkAllocContextIds(const AllocInfo &AllocNode, const MDNode *MemProfMD, const CallStack< MDNode, MDNode::op_iterator > &CallsiteContext, const ModuleSummaryIndex *ImportSummary)
static cl::opt< bool > DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden, cl::desc("Dump CallingContextGraph to stdout after each stage."))
AllocType
This is the interface to build a ModuleSummaryIndex for a module.
ModuleSummaryIndex.h This file contains the declarations the classes that hold the module index and s...
#define P(N)
if(auto Err=PB.parsePassPipeline(MPM, Passes)) return wrap(std MPM run * Mod
FunctionAnalysisManager FAM
if(PassOpts->AAPipeline)
std::pair< BasicBlock *, BasicBlock * > Edge
This file defines generic set operations that may be used on set's of different types,...
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
This file contains some functions that are useful when dealing with strings.
#define LLVM_DEBUG(...)
Definition Debug.h:114
ValueInfo getAliaseeVI() const
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
iterator end() const
Definition ArrayRef.h:131
const_pointer iterator
Definition ArrayRef.h:47
iterator begin() const
Definition ArrayRef.h:130
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
static LLVM_ABI Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
void addFnAttr(Attribute::AttrKind Kind)
Adds the attribute to the function.
void setCalledOperand(Value *V)
Subprogram description. Uses SubclassData1.
ValueT & at(const_arg_type_t< KeyT > Val)
at - Return the entry for the specified key, or abort if no such entry exists.
Definition DenseMap.h:224
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
bool erase(const KeyT &Val)
Definition DenseMap.h:330
unsigned size() const
Definition DenseMap.h:110
bool empty() const
Definition DenseMap.h:109
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition DenseMap.h:174
iterator end()
Definition DenseMap.h:81
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:169
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
void reserve(size_type NumEntries)
Grow the densemap so that it can contain at least NumEntries items before resizing again.
Definition DenseMap.h:114
Implements a dense probed hash-table based set.
Definition DenseSet.h:279
Function summary information to aid decisions and implementation of importing.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
DISubprogram * getSubprogram() const
Get the attached subprogram.
const Function & getFunction() const
Definition Function.h:164
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
static LLVM_ABI GlobalAlias * create(Type *Ty, unsigned AddressSpace, LinkageTypes Linkage, const Twine &Name, Constant *Aliasee, Module *Parent)
If a parent module is specified, the alias is automatically inserted into the end of the specified mo...
Definition Globals.cpp:598
Function and variable summary information to aid decisions and implementation of importing.
static LLVM_ABI GUID getGUIDAssumingExternalLinkage(StringRef GlobalName)
Return a 64-bit global unique ID constructed from the name of a global symbol.
Definition Globals.cpp:77
static bool isLocalLinkage(LinkageTypes Linkage)
LLVM_ABI bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition Globals.cpp:328
uint64_t GUID
Declare a type to represent a global unique identifier for a global value.
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI void eraseFromParent()
This method unlinks 'this' from the containing module and deletes it.
Definition Globals.cpp:93
static LLVM_ABI std::string getGlobalIdentifier(StringRef Name, GlobalValue::LinkageTypes Linkage, StringRef FileName)
Return the modified name for a global value suitable to be used as the key for a global lookup (e....
Definition Globals.cpp:161
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Metadata node.
Definition Metadata.h:1078
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1442
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1440
unsigned getNumOperands() const
Return number of MDNode operands.
Definition Metadata.h:1448
LLVM_ABI TempMDNode clone() const
Create a (temporary) clone of this.
Definition Metadata.cpp:669
static std::enable_if_t< std::is_base_of< MDNode, T >::value, T * > replaceWithUniqued(std::unique_ptr< T, TempMDNodeDeleter > N)
Replace a temporary node with a uniqued one.
Definition Metadata.h:1317
static LLVM_ABI MDString * get(LLVMContext &Context, StringRef Str)
Definition Metadata.cpp:608
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
size_type count(const KeyT &Key) const
Definition MapVector.h:145
MemProfContextDisambiguation(const ModuleSummaryIndex *Summary=nullptr, bool isSamplePGO=false)
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
static ErrorOr< std::unique_ptr< MemoryBuffer > > getFile(const Twine &Filename, bool IsText=false, bool RequiresNullTerminator=true, bool IsVolatile=false, std::optional< Align > Alignment=std::nullopt)
Open the specified file as a MemoryBuffer, returning a new MemoryBuffer if successful,...
Class to hold module path string table and global value map, and encapsulate methods for operating on...
static StringRef getOriginalNameBeforePromote(StringRef Name)
Helper to obtain the unpromoted name for a global value (or the original name if not promoted).
ValueInfo getValueInfo(const GlobalValueSummaryMapTy::value_type &R) const
Return a ValueInfo for the index value_type (convenient when iterating index).
uint64_t getStackIdAtIndex(unsigned Index) const
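Two tiny sketches of these index accessors, assuming this file's includes; both function names are illustrative.

  static unsigned countIndexedGlobals(const ModuleSummaryIndex &Index) {
    unsigned N = 0;
    for (auto &Entry : Index)
      // getValueInfo wraps the map entry in a convenient GUID + summary handle.
      N += !Index.getValueInfo(Entry).getSummaryList().empty();
    return N;
  }

  static uint64_t stackIdAt(const ModuleSummaryIndex &Index, unsigned Pos) {
    // Summary callsites/allocations store positions into the index's stack id
    // table; this maps such a position back to the full 64-bit stack id.
    return Index.getStackIdAtIndex(Pos);
  }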
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
LLVMContext & getContext() const
Get the global data context.
Definition Module.h:285
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
A discriminated union of two or more pointer types, with the discriminator in the low bit of the pointer.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition Analysis.h:115
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
A class that wrap the SHA1 algorithm.
Definition SHA1.h:27
LLVM_ABI void update(ArrayRef< uint8_t > Data)
Digest more data.
Definition SHA1.cpp:208
LLVM_ABI std::array< uint8_t, 20 > result()
Return the current raw 160-bits SHA1 for the digested data since the last call to init().
Definition SHA1.cpp:288
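A small sketch of deriving a stable 64-bit tag from a name with these SHA1 APIs; folding the digest this way is an illustrative assumption, not necessarily how this pass forms its hashes.

  static uint64_t hashTag(StringRef Name) {
    SHA1 Hasher;
    Hasher.update(ArrayRef<uint8_t>(
        reinterpret_cast<const uint8_t *>(Name.data()), Name.size()));
    std::array<uint8_t, 20> Digest = Hasher.result();
    // Fold the leading 8 bytes of the 160-bit digest into a 64-bit tag.
    uint64_t Tag = 0;
    for (unsigned I = 0; I < 8; ++I)
      Tag = (Tag << 8) | Digest[I];
    return Tag;
  }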
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition StringRef.h:55
Twine - A lightweight data structure for efficiently representing the concatenation of temporary values as strings.
Definition Twine.h:82
LLVM_ABI void print(raw_ostream &O, bool IsForDebug=false) const
Implement operator<< on Value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
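A compact sketch of these Value APIs in a cloning context, assuming the file's includes and DEBUG_TYPE, and two functions with identical signatures; the helper name is illustrative.

  static void redirectUses(Function &From, Function &To) {
    // Every user of From (calls, aliases, references) now sees To instead.
    From.replaceAllUsesWith(&To);
    LLVM_DEBUG(dbgs() << "Redirected uses of " << From.getName() << " to "
                      << To.getName() << "\n");
  }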
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
void reserve(size_t Size)
Grow the DenseSet so that it can contain at least NumEntries items before resizing again.
Definition DenseSet.h:96
void insert_range(Range &&R)
Definition DenseSet.h:228
size_type size() const
Definition DenseSet.h:87
void swap(DenseSetImpl &RHS)
Definition DenseSet.h:102
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
bool erase(const ValueT &V)
Definition DenseSet.h:100
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
An efficient, type-erasing, non-owning reference to a callable.
Helper class to iterate through stack ids in both metadata (memprof MIB and callsite) and the corresponding ThinLTO summary data structures (CallsiteInfo and MIBInfo).
CallStackIterator beginAfterSharedPrefix(const CallStack &Other)
CallStackIterator end() const
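A hedged sketch of walking one MIB's stack ids while skipping the prefix shared with the enclosing callsite metadata, using the CallStack helper described above; the parameter names are assumptions.

  static SmallVector<uint64_t> stackIdsAfterCallsite(MDNode *MIBStackNode,
                                                     MDNode *CallsiteNode) {
    CallStack<MDNode, MDNode::op_iterator> StackContext(MIBStackNode);
    CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteNode);
    SmallVector<uint64_t> Ids;
    // Skip frames already described by the callsite, then collect the rest.
    for (auto It = StackContext.beginAfterSharedPrefix(CallsiteContext);
         It != StackContext.end(); ++It)
      Ids.push_back(*It);
    return Ids;
  }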
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to the ValuesClass constructor.
initializer< Ty > init(const Ty &Val)
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > dyn_extract(Y &&MD)
Extract a Value from Metadata, if any.
Definition Metadata.h:695
LLVM_ABI AllocationType getMIBAllocType(const MDNode *MIB)
Returns the allocation type from an MIB metadata node.
LLVM_ABI bool metadataMayIncludeContextSizeInfo()
Whether the alloc memprof metadata may include context size info for some MIBs (but possibly not all).
LLVM_ABI bool hasSingleAllocType(uint8_t AllocTypes)
True if the AllocTypes bitmask contains just a single type.
LLVM_ABI std::string getAllocTypeAttributeString(AllocationType Type)
Returns the string to use in attributes with the given type.
LLVM_ABI MDNode * getMIBStackNode(const MDNode *MIB)
Returns the stack node from an MIB metadata node.
LLVM_ABI void removeAnyExistingAmbiguousAttribute(CallBase *CB)
Removes any existing "ambiguous" memprof attribute.
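A short sketch combining these memprof helpers, assuming this file's using-directives; MIB is an assumed operand of an allocation's !memprof node, CB the allocation call, and the attribute handling is simplified relative to the pass.

  static MDNode *annotateIfUnambiguous(const MDNode *MIB, CallBase *CB) {
    AllocationType AT = getMIBAllocType(MIB);            // e.g. Cold or NotCold
    if (hasSingleAllocType(static_cast<uint8_t>(AT))) {
      // A single remaining behavior can be attached as e.g. "memprof"="cold",
      // and any stale ambiguous marker dropped.
      CB->addFnAttr(Attribute::get(CB->getContext(), "memprof",
                                   getAllocTypeAttributeString(AT)));
      removeAnyExistingAmbiguousAttribute(CB);
    }
    return getMIBStackNode(MIB);                         // the MIB's call stack
  }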
DiagnosticInfoOptimizationBase::Argument NV
LLVM_ABI CallBase & promoteIndirectCall(CallBase &CB, Function *F, uint64_t Count, uint64_t TotalCount, bool AttachProfToDirectCall, OptimizationRemarkEmitter *ORE)
bool empty() const
Definition BasicBlock.h:101
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
uint64_t read64le(const void *P)
Definition Endian.h:435
void write32le(void *P, uint32_t V)
Definition Endian.h:475
This is an optimization pass for GlobalISel generic memory operations.
cl::opt< unsigned > MinClonedColdBytePercent("memprof-cloning-cold-threshold", cl::init(100), cl::Hidden, cl::desc("Min percent of cold bytes to hint alloc cold during cloning"))
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
LLVM_ABI void logAllUnhandledErrors(Error E, raw_ostream &OS, Twine ErrorBanner={})
Log all errors (if any) in E to OS.
Definition Error.cpp:65
void stable_sort(R &&Range)
Definition STLExtras.h:2058
cl::opt< bool > MemProfReportHintedSizes("memprof-report-hinted-sizes", cl::init(false), cl::Hidden, cl::desc("Report total allocation sizes of hinted allocations"))
LLVM_ABI bool isLegalToPromote(const CallBase &CB, Function *Callee, const char **FailureReason=nullptr)
Return true if the given indirect call site can be made to call Callee.
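A hedged sketch of guarded indirect-call promotion using these two APIs; Count, TotalCount and ORE are assumed to come from value-profile data and the caller's remark emitter.

  static bool tryPromote(CallBase &CB, Function *Target, uint64_t Count,
                         uint64_t TotalCount, OptimizationRemarkEmitter &ORE) {
    const char *Reason = nullptr;
    if (!isLegalToPromote(CB, Target, &Reason))
      return false;                 // e.g. signature mismatch; Reason says why
    pgo::promoteIndirectCall(CB, Target, Count, TotalCount,
                             /*AttachProfToDirectCall=*/true, &ORE);
    return true;
  }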
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B, C, ...), such that A is the 0-based index of the item in the sequence, and B, C, ... are the values from the original input ranges.
Definition STLExtras.h:2472
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B. Identical to set_intersection, except that it works in place on set-like containers, removing from A the elements not in B.
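A tiny illustration of this family of set helpers on DenseSet, the kind of container this pass uses for context-id sets; the literal values are arbitrary.

  static void setOpsDemo() {
    DenseSet<uint32_t> A = {1, 2, 3};
    DenseSet<uint32_t> B = {2, 3, 4};
    set_intersect(A, B);                              // A == {2, 3}
    bool Sub = set_is_subset(A, B);                   // true
    DenseSet<uint32_t> OnlyB = set_difference(B, A);  // {4}
    set_subtract(B, A);                               // B == {4}
    (void)Sub;
    (void)OnlyB;
  }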
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI bool mayHaveMemprofSummary(const CallBase *CB)
Returns true if the instruction could have memprof metadata, used to ensure consistency between summary analysis and the ThinLTO backend.
constexpr from_range_t from_range
static cl::opt< bool > MemProfRequireDefinitionForPromotion("memprof-require-definition-for-promotion", cl::init(false), cl::Hidden, cl::desc("Require target function definition when promoting indirect calls"))
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case of optionals) value is accepted.
Definition Casting.h:732
cl::opt< unsigned > MemProfTopNImportant("memprof-top-n-important", cl::init(10), cl::Hidden, cl::desc("Number of largest cold contexts to consider important"))
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2136
void set_subtract(S1Ty &S1, const S2Ty &S2)
set_subtract(A, B) - Compute A := A - B
InnerAnalysisManagerProxy< FunctionAnalysisManager, Module > FunctionAnalysisManagerModuleProxy
Provide the FunctionAnalysisManager to Module proxy.
raw_ostream & WriteGraph(raw_ostream &O, const GraphType &G, bool ShortNames=false, const Twine &Title="")
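A hedged sketch of dumping a graph that has GraphTraits/DOTGraphTraits specializations (as the callsite context graph in this file does) to a DOT file; the file name and helper are illustrative.

  template <typename GraphT> static void dumpDot(const GraphT &G) {
    std::error_code EC;
    raw_fd_ostream OS("ccg.dot", EC);
    if (EC)
      return;                               // could not open the output file
    WriteGraph(OS, G, /*ShortNames=*/false, "CallsiteContextGraph");
  }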
bool set_intersects(const S1Ty &S1, const S2Ty &S2)
set_intersects(A, B) - Return true iff A ^ B is non-empty.
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1150
LLVM_ABI Expected< std::unique_ptr< ModuleSummaryIndex > > getModuleSummaryIndex(MemoryBufferRef Buffer)
Parse the specified bitcode buffer, returning the module summary index.
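A hedged sketch of reading a summary index from a bitcode file with these APIs, with simplified error handling; Path and the banner strings are illustrative.

  static std::unique_ptr<ModuleSummaryIndex> loadIndex(const Twine &Path) {
    auto BufOrErr = errorOrToExpected(MemoryBuffer::getFile(Path));
    if (!BufOrErr) {
      logAllUnhandledErrors(BufOrErr.takeError(), errs(),
                            "Error loading file: ");
      return nullptr;
    }
    auto IndexOrErr = getModuleSummaryIndex(**BufOrErr);
    if (!IndexOrErr) {
      logAllUnhandledErrors(IndexOrErr.takeError(), errs(),
                            "Error parsing index: ");
      return nullptr;
    }
    return std::move(*IndexOrErr);
  }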
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
LLVM_ABI void annotateValueSite(Module &M, Instruction &Inst, const InstrProfRecord &InstrProfR, InstrProfValueKind ValueKind, uint32_t SiteIndx, uint32_t MaxMDCount=3)
Get the value profile data for value site SiteIdx from InstrProfR and annotate the instruction Inst with the value profile metadata.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
bool set_union(S1Ty &S1, const S2Ty &S2)
set_union(A, B) - Compute A := A u B, return whether A changed.
cl::opt< bool > SupportsHotColdNew
Indicate we are linking with an allocator that supports hot/cold operator new interfaces.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference sizeof(SmallVector<T, 0>).
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
Definition Casting.h:547
S1Ty set_intersection(const S1Ty &S1, const S2Ty &S2)
set_intersection(A, B) - Return A ^ B
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
cl::opt< bool > EnableMemProfContextDisambiguation
Enable MemProf context disambiguation for thin link.
S1Ty set_difference(const S1Ty &S1, const S2Ty &S2)
set_difference(A, B) - Return A - B
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
Expected< T > errorOrToExpected(ErrorOr< T > &&EO)
Convert an ErrorOr<T> to an Expected<T>.
Definition Error.h:1245
ArrayRef(const T &OneElt) -> ArrayRef< T >
std::string toString(const APInt &I, unsigned Radix, bool Signed, bool formatAsCLiteral=false, bool UpperCase=true, bool InsertSeparators=false)
ValueMap< const Value *, WeakTrackingVH > ValueToValueMapTy
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1867
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1758
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1897
LLVM_ABI Function * CloneFunction(Function *F, ValueToValueMapTy &VMap, ClonedCodeInfo *CodeInfo=nullptr)
Return a copy of the specified function and add it to that function's module.
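A minimal sketch of producing one clone with CloneFunction; the ".memprof.1" suffix is an illustrative assumption rather than the pass's exact naming scheme.

  static Function *cloneOnce(Function &F) {
    ValueToValueMapTy VMap;
    // The clone is created in F's module, with blocks, arguments and metadata
    // mapped through VMap.
    Function *NewF = CloneFunction(&F, VMap);
    NewF->setName(F.getName() + ".memprof.1");
    return NewF;
  }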
AnalysisManager< Module > ModuleAnalysisManager
Convenience typedef for the Module analysis manager.
Definition MIRParser.h:39
cl::opt< bool > MemProfFixupImportant("memprof-fixup-important", cl::init(true), cl::Hidden, cl::desc("Enables edge fixup for important contexts"))
#define N
static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter, GraphType G)
static const ContextNode< DerivedCCG, FuncTy, CallTy > * GetCallee(const EdgePtrTy &P)
std::unique_ptr< ContextNode< DerivedCCG, FuncTy, CallTy > > NodePtrTy
mapped_iterator< typename std::vector< std::shared_ptr< ContextEdge< DerivedCCG, FuncTy, CallTy > > >::const_iterator, decltype(&GetCallee)> ChildIteratorType
mapped_iterator< typename std::vector< NodePtrTy >::const_iterator, decltype(&getNode)> nodes_iterator
std::shared_ptr< ContextEdge< DerivedCCG, FuncTy, CallTy > > EdgePtrTy
Summary of memprof metadata on allocations.
std::vector< MIBInfo > MIBs
SmallVector< unsigned > StackIdIndices
SmallVector< unsigned > Clones
DefaultDOTGraphTraits(bool simple=false)
An information struct used to provide DenseMap with the various necessary components for a given value type T.
typename GraphType::UnknownGraphTypeError NodeRef
Definition GraphTraits.h:95
Struct that holds a reference to a particular GUID in a global value summary.
ArrayRef< std::unique_ptr< GlobalValueSummary > > getSummaryList() const
GlobalValue::GUID getGUID() const
PointerUnion< CallsiteInfo *, AllocInfo * > SimpleType
static SimpleType getSimplifiedValue(IndexCall &Val)
const PointerUnion< CallsiteInfo *, AllocInfo * > SimpleType
static SimpleType getSimplifiedValue(const IndexCall &Val)
Define a template that can be specialized by smart pointers to reflect the fact that they are automatically dereferenced, and are not involved with the template selection process; the default implementation is a no-op.
Definition Casting.h:34