MemProfContextDisambiguation.cpp
1//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements support for context disambiguation of allocation
10// calls for profile guided heap optimization. Specifically, it uses Memprof
11// profiles which indicate context specific allocation behavior (currently
12// distinguishing cold vs hot memory allocations). Cloning is performed to
13// expose the cold allocation call contexts, and the allocation calls are
14// subsequently annotated with an attribute for later transformation.
15//
16// The transformations can be performed either directly on IR (regular LTO), or
17// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
18// Both types of LTO operate on the same base graph representation, which
19// uses CRTP to support either IR or Index formats.
20//
21//===----------------------------------------------------------------------===//
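// Illustrative sketch (not part of the upstream file): after cloning, an
// allocation reached from both a cold and a not-cold calling context can be
// split so that each copy is annotated for its own context, e.g. roughly
//
//   %call = call ptr @_Znwm(i64 8) #1   ; in the clone reached from the cold context
//   attributes #1 = { "memprof"="cold" }
//
// which later transformations (and a hot/cold-aware allocator) can act on.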
22
24#include "llvm/ADT/DenseMap.h"
25#include "llvm/ADT/DenseSet.h"
26#include "llvm/ADT/MapVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
36#include "llvm/IR/Constants.h"
38#include "llvm/IR/Module.h"
40#include "llvm/Pass.h"
45#include "llvm/Transforms/IPO.h"
47#include <deque>
48#include <sstream>
49#include <unordered_map>
50#include <vector>
51using namespace llvm;
52using namespace llvm::memprof;
53
54#define DEBUG_TYPE "memprof-context-disambiguation"
55
56STATISTIC(FunctionClonesAnalysis,
57 "Number of function clones created during whole program analysis");
58STATISTIC(FunctionClonesThinBackend,
59 "Number of function clones created during ThinLTO backend");
60STATISTIC(FunctionsClonedThinBackend,
61 "Number of functions that had clones created during ThinLTO backend");
62STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
63 "cloned) during whole program analysis");
64STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
65 "during whole program analysis");
66STATISTIC(AllocTypeNotColdThinBackend,
67 "Number of not cold static allocations (possibly cloned) during "
68 "ThinLTO backend");
69STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
70 "(possibly cloned) during ThinLTO backend");
71STATISTIC(OrigAllocsThinBackend,
72 "Number of original (not cloned) allocations with memprof profiles "
73 "during ThinLTO backend");
74STATISTIC(
75 AllocVersionsThinBackend,
76 "Number of allocation versions (including clones) during ThinLTO backend");
77STATISTIC(MaxAllocVersionsThinBackend,
78 "Maximum number of allocation versions created for an original "
79 "allocation during ThinLTO backend");
80STATISTIC(UnclonableAllocsThinBackend,
81 "Number of unclonable ambigous allocations during ThinLTO backend");
82STATISTIC(RemovedEdgesWithMismatchedCallees,
83 "Number of edges removed due to mismatched callees (profiled vs IR)");
84STATISTIC(FoundProfiledCalleeCount,
85 "Number of profiled callees found via tail calls");
86STATISTIC(FoundProfiledCalleeDepth,
87 "Aggregate depth of profiled callees found via tail calls");
88STATISTIC(FoundProfiledCalleeMaxDepth,
89 "Maximum depth of profiled callees found via tail calls");
90STATISTIC(FoundProfiledCalleeNonUniquelyCount,
91 "Number of profiled callees found via multiple tail call chains");
92
94 "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
95 cl::value_desc("filename"),
96 cl::desc("Specify the path prefix of the MemProf dot files."));
97
98static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
99 cl::Hidden,
100 cl::desc("Export graph to dot files."));
101
102static cl::opt<bool>
103 DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
104 cl::desc("Dump CallingContextGraph to stdout after each stage."));
105
106static cl::opt<bool>
107 VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
108 cl::desc("Perform verification checks on CallingContextGraph."));
109
110static cl::opt<bool>
111 VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
112 cl::desc("Perform frequent verification checks on nodes."));
113
115 "memprof-import-summary",
116 cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
117 cl::Hidden);
118
119static cl::opt<unsigned>
120 TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
121 cl::Hidden,
122 cl::desc("Max depth to recursively search for missing "
123 "frames through tail calls."));
124
125namespace llvm {
127 "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
128 cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));
129
130// Indicate we are linking with an allocator that supports hot/cold operator
131// new interfaces.
133 "supports-hot-cold-new", cl::init(false), cl::Hidden,
134 cl::desc("Linking with hot/cold operator new interfaces"));
135} // namespace llvm
136
137extern cl::opt<bool> MemProfReportHintedSizes;
138
139namespace {
140/// CRTP base for graphs built from either IR or ThinLTO summary index.
141///
142/// The graph represents the call contexts in all memprof metadata on allocation
143/// calls, with nodes for the allocations themselves, as well as for the calls
144/// in each context. The graph is initially built from the allocation memprof
145/// metadata (or summary) MIBs. It is then updated to match calls with callsite
146/// metadata onto the nodes, updating it to reflect any inlining performed on
147/// those calls.
148///
149/// Each MIB (representing an allocation's call context with allocation
150/// behavior) is assigned a unique context id during the graph build. The edges
151/// and nodes in the graph are decorated with the context ids they carry. This
152/// is used to correctly update the graph when cloning is performed so that we
153/// can uniquify the context for a single (possibly cloned) allocation.
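///
/// For example (illustrative, not from the upstream comment): two profiled
/// contexts
///   id 1: main -> foo -> malloc   (cold)
///   id 2: bar  -> foo -> malloc   (not cold)
/// give an allocation node for the malloc with a caller edge to the node for
/// foo carrying context ids {1,2}, which in turn has caller edges to main
/// (ids {1}, Cold) and bar (ids {2}, NotCold).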
154template <typename DerivedCCG, typename FuncTy, typename CallTy>
155class CallsiteContextGraph {
156public:
157 CallsiteContextGraph() = default;
158 CallsiteContextGraph(const CallsiteContextGraph &) = default;
159 CallsiteContextGraph(CallsiteContextGraph &&) = default;
160
161 /// Main entry point to perform analysis and transformations on graph.
162 bool process();
163
164 /// Perform cloning on the graph necessary to uniquely identify the allocation
165 /// behavior of an allocation based on its context.
166 void identifyClones();
167
168 /// Assign callsite clones to functions, cloning functions as needed to
169 /// accommodate the combinations of their callsite clones reached by callers.
170 /// For regular LTO this clones functions and callsites in the IR, but for
171 /// ThinLTO the cloning decisions are noted in the summaries and later applied
172 /// in applyImport.
173 bool assignFunctions();
174
175 void dump() const;
176 void print(raw_ostream &OS) const;
177 void printTotalSizes(raw_ostream &OS) const;
178
179 friend raw_ostream &operator<<(raw_ostream &OS,
180 const CallsiteContextGraph &CCG) {
181 CCG.print(OS);
182 return OS;
183 }
184
185 friend struct GraphTraits<
186 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
187 friend struct DOTGraphTraits<
188 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
189
190 void exportToDot(std::string Label) const;
191
192 /// Represents a function clone via FuncTy pointer and clone number pair.
193 struct FuncInfo final
194 : public std::pair<FuncTy *, unsigned /*Clone number*/> {
195 using Base = std::pair<FuncTy *, unsigned>;
196 FuncInfo(const Base &B) : Base(B) {}
197 FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
198 explicit operator bool() const { return this->first != nullptr; }
199 FuncTy *func() const { return this->first; }
200 unsigned cloneNo() const { return this->second; }
201 };
202
203 /// Represents a callsite clone via CallTy and clone number pair.
204 struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
205 using Base = std::pair<CallTy, unsigned>;
206 CallInfo(const Base &B) : Base(B) {}
207 CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
208 : Base(Call, CloneNo) {}
209 explicit operator bool() const { return (bool)this->first; }
210 CallTy call() const { return this->first; }
211 unsigned cloneNo() const { return this->second; }
212 void setCloneNo(unsigned N) { this->second = N; }
213 void print(raw_ostream &OS) const {
214 if (!operator bool()) {
215 assert(!cloneNo());
216 OS << "null Call";
217 return;
218 }
219 call()->print(OS);
220 OS << "\t(clone " << cloneNo() << ")";
221 }
222 void dump() const {
223 print(dbgs());
224 dbgs() << "\n";
225 }
226 friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
227 Call.print(OS);
228 return OS;
229 }
230 };
231
232 struct ContextEdge;
233
234 /// Node in the Callsite Context Graph
235 struct ContextNode {
236 // Keep this for now since in the IR case, where we have an Instruction*, it
237 // is not as immediately discoverable. Used for printing richer information
238 // when dumping the graph.
239 bool IsAllocation;
240
241 // Keeps track of when the Call was reset to null because there was
242 // recursion.
243 bool Recursive = false;
244
245 // The corresponding allocation or interior call. This is the primary call
246 // for which we have created this node.
247 CallInfo Call;
248
249 // List of other calls that can be treated the same as the primary call
250 // through cloning. I.e. located in the same function and have the same
251 // (possibly pruned) stack ids. They will be updated the same way as the
252 // primary call when assigning to function clones.
253 std::vector<CallInfo> MatchingCalls;
254
255 // For alloc nodes this is a unique id assigned when constructed, and for
256 // callsite stack nodes it is the original stack id when the node is
257 // constructed from the memprof MIB metadata on the alloc nodes. Note that
258 // this is only used when matching callsite metadata onto the stack nodes
259 // created when processing the allocation memprof MIBs, and for labeling
260 // nodes in the dot graph. Therefore we don't bother to assign a value for
261 // clones.
262 uint64_t OrigStackOrAllocId = 0;
263
264 // This will be formed by ORing together the AllocationType enum values
265 // for contexts including this node.
266 uint8_t AllocTypes = 0;
267
268 // Edges to all callees in the profiled call stacks.
269 // TODO: Should this be a map (from Callee node) for more efficient lookup?
270 std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;
271
272 // Edges to all callers in the profiled call stacks.
273 // TODO: Should this be a map (from Caller node) for more efficient lookup?
274 std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
275
276 // Get the list of edges from which we can compute allocation information
277 // such as the context ids and allocation type of this node.
278 const std::vector<std::shared_ptr<ContextEdge>> *
279 getEdgesWithAllocInfo() const {
280 // If node has any callees, compute from those, otherwise compute from
281 // callers (i.e. if this is the leaf allocation node).
282 if (!CalleeEdges.empty())
283 return &CalleeEdges;
284 if (!CallerEdges.empty()) {
285 // A node with caller edges but no callee edges must be the allocation
286 // node.
287 assert(IsAllocation);
288 return &CallerEdges;
289 }
290 return nullptr;
291 }
292
293 // Compute the context ids for this node from the union of its edge context
294 // ids.
295 DenseSet<uint32_t> getContextIds() const {
296 DenseSet<uint32_t> ContextIds;
297 auto *Edges = getEdgesWithAllocInfo();
298 if (!Edges)
299 return {};
300 unsigned Count = 0;
301 for (auto &Edge : *Edges)
302 Count += Edge->getContextIds().size();
303 ContextIds.reserve(Count);
304 for (auto &Edge : *Edges)
305 ContextIds.insert(Edge->getContextIds().begin(),
306 Edge->getContextIds().end());
307 return ContextIds;
308 }
309
310 // Compute the allocation type for this node from the OR of its edge
311 // allocation types.
312 uint8_t computeAllocType() const {
313 auto *Edges = getEdgesWithAllocInfo();
314 if (!Edges)
315 return (uint8_t)AllocationType::None;
316 uint8_t BothTypes =
317 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
318 uint8_t AllocType = (uint8_t)AllocationType::None;
319 for (auto &Edge : *Edges) {
320 AllocType |= Edge->AllocTypes;
321 // Bail early if alloc type reached both, no further refinement.
322 if (AllocType == BothTypes)
323 return AllocType;
324 }
325 return AllocType;
326 }
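    // For instance (illustrative): with AllocationType::NotCold == 1 and
    // AllocationType::Cold == 2, edges carrying both types yield 1 | 2 == 3
    // (== BothTypes), at which point the loop above bails out early.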
327
328 // The context ids set for this node is empty if its edge context ids are
329 // also all empty.
330 bool emptyContextIds() const {
331 auto *Edges = getEdgesWithAllocInfo();
332 if (!Edges)
333 return true;
334 for (auto &Edge : *Edges) {
335 if (!Edge->getContextIds().empty())
336 return false;
337 }
338 return true;
339 }
340
341 // List of clones of this ContextNode, initially empty.
342 std::vector<ContextNode *> Clones;
343
344 // If a clone, points to the original uncloned node.
345 ContextNode *CloneOf = nullptr;
346
347 ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}
348
349 ContextNode(bool IsAllocation, CallInfo C)
350 : IsAllocation(IsAllocation), Call(C) {}
351
352 void addClone(ContextNode *Clone) {
353 if (CloneOf) {
354 CloneOf->Clones.push_back(Clone);
355 Clone->CloneOf = CloneOf;
356 } else {
357 Clones.push_back(Clone);
358 assert(!Clone->CloneOf);
359 Clone->CloneOf = this;
360 }
361 }
362
363 ContextNode *getOrigNode() {
364 if (!CloneOf)
365 return this;
366 return CloneOf;
367 }
368
369 void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
370 unsigned int ContextId);
371
372 ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
373 ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
374 void eraseCalleeEdge(const ContextEdge *Edge);
375 void eraseCallerEdge(const ContextEdge *Edge);
376
377 void setCall(CallInfo C) { Call = C; }
378
379 bool hasCall() const { return (bool)Call.call(); }
380
381 void printCall(raw_ostream &OS) const { Call.print(OS); }
382
383 // True if this node was effectively removed from the graph, in which case
384 // it should have an allocation type of None and empty context ids.
385 bool isRemoved() const {
386 assert((AllocTypes == (uint8_t)AllocationType::None) ==
387 emptyContextIds());
388 return AllocTypes == (uint8_t)AllocationType::None;
389 }
390
391 void dump() const;
392 void print(raw_ostream &OS) const;
393
394 friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
395 Node.print(OS);
396 return OS;
397 }
398 };
399
400 /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
401 /// callee.
402 struct ContextEdge {
403 ContextNode *Callee;
404 ContextNode *Caller;
405
406 // This will be formed by ORing together the AllocationType enum values
407 // for contexts including this edge.
408 uint8_t AllocTypes = 0;
409
410 // The set of IDs for contexts including this edge.
411 DenseSet<uint32_t> ContextIds;
412
413 ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
414 DenseSet<uint32_t> ContextIds)
415 : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
416 ContextIds(std::move(ContextIds)) {}
417
418 DenseSet<uint32_t> &getContextIds() { return ContextIds; }
419
420 void dump() const;
421 void print(raw_ostream &OS) const;
422
423 friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
424 Edge.print(OS);
425 return OS;
426 }
427 };
428
429 /// Helpers to remove callee edges that have allocation type None (due to not
430 /// carrying any context ids) after transformations.
431 void removeNoneTypeCalleeEdges(ContextNode *Node);
432 void
433 recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
434 DenseSet<const ContextNode *> &Visited);
435
436protected:
437 /// Get a list of nodes corresponding to the stack ids in the given callsite
438 /// context.
439 template <class NodeT, class IteratorT>
440 std::vector<uint64_t>
441 getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);
442
443 /// Adds nodes for the given allocation and any stack ids on its memprof MIB
444 /// metadata (or summary).
445 ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);
446
447 /// Adds nodes for the given MIB stack ids.
448 template <class NodeT, class IteratorT>
449 void addStackNodesForMIB(ContextNode *AllocNode,
450 CallStack<NodeT, IteratorT> &StackContext,
451 CallStack<NodeT, IteratorT> &CallsiteContext,
452 AllocationType AllocType, uint64_t TotalSize);
453
454 /// Matches all callsite metadata (or summary) to the nodes created for
455 /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
456 /// inlining performed on those callsite instructions.
457 void updateStackNodes();
458
459 /// Update graph to conservatively handle any callsite stack nodes that
460 /// target multiple different callee functions.
461 void handleCallsitesWithMultipleTargets();
462
463 /// Save lists of calls with MemProf metadata in each function, for faster
464 /// iteration.
465 MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
466
467 /// Records the function each call is located in.
469
470 /// Map from callsite node to the enclosing caller function.
471 std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
472
473private:
474 using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;
475
476 using CallContextInfo = std::tuple<CallTy, std::vector<uint64_t>,
477 const FuncTy *, DenseSet<uint32_t>>;
478
479 /// Assigns the given Node to calls at or inlined into the location with
480 /// the Node's stack id, after post order traversing and processing its
481 /// caller nodes. Uses the call information recorded in the given
482 /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
483 /// as needed. Called by updateStackNodes which sets up the given
484 /// StackIdToMatchingCalls map.
485 void assignStackNodesPostOrder(
486 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
487 DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls,
488 DenseMap<CallInfo, CallInfo> &CallToMatchingCall);
489
490 /// Duplicates the given set of context ids, updating the provided
491 /// map from each original id with the newly generated context ids,
492 /// and returning the new duplicated id set.
493 DenseSet<uint32_t> duplicateContextIds(
494 const DenseSet<uint32_t> &StackSequenceContextIds,
495 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
496
497 /// Propagates all duplicated context ids across the graph.
498 void propagateDuplicateContextIds(
499 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
500
501 /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
502 /// else to its callers. Also updates OrigNode's edges to remove any context
503 /// ids moved to the newly created edge.
504 void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
505 bool TowardsCallee,
506 DenseSet<uint32_t> RemainingContextIds);
507
508 /// Get the stack id corresponding to the given Id or Index (for IR this will
509 /// return itself, for a summary index this will return the id recorded in the
510 /// index for that stack id index value).
511 uint64_t getStackId(uint64_t IdOrIndex) const {
512 return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
513 }
514
515 /// Returns true if the given call targets the callee of the given edge, or if
516 /// we were able to identify the call chain through intermediate tail calls.
517 /// In the latter case new context nodes are added to the graph for the
518 /// identified tail calls, and their synthesized nodes are added to
519 /// TailCallToContextNodeMap. The EdgeIter is updated in the latter case for
520 /// the updated edges and to prepare it for an increment in the caller.
521 bool
522 calleesMatch(CallTy Call, EdgeIter &EI,
523 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);
524
525 /// Returns true if the given call targets the given function, or if we were
526 /// able to identify the call chain through intermediate tail calls (in which
527 /// case FoundCalleeChain will be populated).
528 bool calleeMatchesFunc(
529 CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
530 std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
531 return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
532 Call, Func, CallerFunc, FoundCalleeChain);
533 }
534
535 /// Get a list of nodes corresponding to the stack ids in the given
536 /// callsite's context.
537 std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
538 return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
539 Call);
540 }
541
542 /// Get the last stack id in the context for callsite.
543 uint64_t getLastStackId(CallTy Call) {
544 return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
545 }
546
547 /// Update the allocation call to record type of allocated memory.
548 void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
549 AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
550 static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
551 }
552
553 /// Update non-allocation call to invoke (possibly cloned) function
554 /// CalleeFunc.
555 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
556 static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
557 }
558
559 /// Clone the given function for the given callsite, recording the mapping of
560 /// all of the function's tracked calls to their new versions in the CallMap.
561 /// Assigns new clones to clone number CloneNo.
562 FuncInfo cloneFunctionForCallsite(
563 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
564 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
565 return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
566 Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
567 }
568
569 /// Gets a label to use in the dot graph for the given call clone in the given
570 /// function.
571 std::string getLabel(const FuncTy *Func, const CallTy Call,
572 unsigned CloneNo) const {
573 return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
574 }
575
576 /// Helpers to find the node corresponding to the given call or stackid.
577 ContextNode *getNodeForInst(const CallInfo &C);
578 ContextNode *getNodeForAlloc(const CallInfo &C);
579 ContextNode *getNodeForStackId(uint64_t StackId);
580
581 /// Computes the alloc type corresponding to the given context ids, by
582 /// unioning their recorded alloc types.
583 uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds);
584
585 /// Returns the allocation type of the intersection of the contexts of two
586 /// nodes (based on their provided context id sets), optimized for the case
587 /// when Node1Ids is smaller than Node2Ids.
588 uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
589 const DenseSet<uint32_t> &Node2Ids);
590
591 /// Returns the allocation type of the intersection of the contexts of two
592 /// nodes (based on their provided context id sets).
593 uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
594 const DenseSet<uint32_t> &Node2Ids);
595
596 /// Create a clone of Edge's callee and move Edge to that new callee node,
597 /// performing the necessary context id and allocation type updates.
598 /// If callee's caller edge iterator is supplied, it is updated when removing
599 /// the edge from that list. If ContextIdsToMove is non-empty, only that
600 /// subset of Edge's ids are moved to an edge to the new callee.
601 ContextNode *
602 moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
603 EdgeIter *CallerEdgeI = nullptr,
604 DenseSet<uint32_t> ContextIdsToMove = {});
605
606 /// Change the callee of Edge to existing callee clone NewCallee, performing
607 /// the necessary context id and allocation type updates.
608 /// If callee's caller edge iterator is supplied, it is updated when removing
609 /// the edge from that list. If ContextIdsToMove is non-empty, only that
610 /// subset of Edge's ids are moved to an edge to the new callee.
611 void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
612 ContextNode *NewCallee,
613 EdgeIter *CallerEdgeI = nullptr,
614 bool NewClone = false,
615 DenseSet<uint32_t> ContextIdsToMove = {});
616
617 /// Recursively perform cloning on the graph for the given Node and its
618 /// callers, in order to uniquely identify the allocation behavior of an
619 /// allocation given its context. The context ids of the allocation being
620 /// processed are given in AllocContextIds.
621 void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
622 const DenseSet<uint32_t> &AllocContextIds);
623
624 /// Map from each context ID to the AllocationType assigned to that context.
625 DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;
626
627 /// Map from each contextID to the profiled aggregate allocation size,
628 /// optionally populated when requested (via MemProfReportHintedSizes).
629 DenseMap<uint32_t, uint64_t> ContextIdToTotalSize;
630
631 /// Identifies the context node created for a stack id when adding the MIB
632 /// contexts to the graph. This is used to locate the context nodes when
633 /// trying to assign the corresponding callsites with those stack ids to these
634 /// nodes.
635 DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;
636
637 /// Maps to track the calls to their corresponding nodes in the graph.
638 MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
639 MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;
640
641 /// Owner of all ContextNode unique_ptrs.
642 std::vector<std::unique_ptr<ContextNode>> NodeOwner;
643
644 /// Perform sanity checks on graph when requested.
645 void check() const;
646
647 /// Keeps track of the last unique context id assigned.
648 unsigned int LastContextId = 0;
649};
650
651template <typename DerivedCCG, typename FuncTy, typename CallTy>
652using ContextNode =
653 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
654template <typename DerivedCCG, typename FuncTy, typename CallTy>
655using ContextEdge =
656 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
657template <typename DerivedCCG, typename FuncTy, typename CallTy>
658using FuncInfo =
659 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
660template <typename DerivedCCG, typename FuncTy, typename CallTy>
661using CallInfo =
662 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;
663
664/// CRTP derived class for graphs built from IR (regular LTO).
665class ModuleCallsiteContextGraph
666 : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
667 Instruction *> {
668public:
669 ModuleCallsiteContextGraph(
670 Module &M,
671 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
672
673private:
674 friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
675 Instruction *>;
676
677 uint64_t getStackId(uint64_t IdOrIndex) const;
678 bool calleeMatchesFunc(
679 Instruction *Call, const Function *Func, const Function *CallerFunc,
680 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
681 bool findProfiledCalleeThroughTailCalls(
682 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
683 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
684 bool &FoundMultipleCalleeChains);
685 uint64_t getLastStackId(Instruction *Call);
686 std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
687 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
688 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
689 CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
690 Instruction *>::FuncInfo
691 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
692 std::map<CallInfo, CallInfo> &CallMap,
693 std::vector<CallInfo> &CallsWithMetadataInFunc,
694 unsigned CloneNo);
695 std::string getLabel(const Function *Func, const Instruction *Call,
696 unsigned CloneNo) const;
697
698 const Module &Mod;
699 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
700};
701
702/// Represents a call in the summary index graph, which can either be an
703/// allocation or an interior callsite node in an allocation's context.
704/// Holds a pointer to the corresponding data structure in the index.
705struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
706 IndexCall() : PointerUnion() {}
707 IndexCall(std::nullptr_t) : IndexCall() {}
708 IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
709 IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
710 IndexCall(PointerUnion PT) : PointerUnion(PT) {}
711
712 IndexCall *operator->() { return this; }
713
714 PointerUnion<CallsiteInfo *, AllocInfo *> getBase() const { return *this; }
715
716 void print(raw_ostream &OS) const {
717 if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(getBase())) {
718 OS << *AI;
719 } else {
720 auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(getBase());
721 assert(CI);
722 OS << *CI;
723 }
724 }
725};
726
727/// CRTP derived class for graphs built from summary index (ThinLTO).
728class IndexCallsiteContextGraph
729 : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
730 IndexCall> {
731public:
732 IndexCallsiteContextGraph(
733 ModuleSummaryIndex &Index,
734 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
735 isPrevailing);
736
737 ~IndexCallsiteContextGraph() {
738 // Now that we are done with the graph it is safe to add the new
739 // CallsiteInfo structs to the function summary vectors. The graph nodes
740 // point into locations within these vectors, so we don't want to add them
741 // any earlier.
742 for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
743 auto *FS = I.first;
744 for (auto &Callsite : I.second)
745 FS->addCallsite(*Callsite.second);
746 }
747 }
748
749private:
750 friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
751 IndexCall>;
752
753 uint64_t getStackId(uint64_t IdOrIndex) const;
754 bool calleeMatchesFunc(
755 IndexCall &Call, const FunctionSummary *Func,
756 const FunctionSummary *CallerFunc,
757 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
758 bool findProfiledCalleeThroughTailCalls(
759 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
760 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
761 bool &FoundMultipleCalleeChains);
762 uint64_t getLastStackId(IndexCall &Call);
763 std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
764 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
765 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
766 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
767 IndexCall>::FuncInfo
768 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
769 std::map<CallInfo, CallInfo> &CallMap,
770 std::vector<CallInfo> &CallsWithMetadataInFunc,
771 unsigned CloneNo);
772 std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
773 unsigned CloneNo) const;
774
775 // Saves mapping from function summaries containing memprof records back to
776 // their ValueInfo, for use in checking and debugging.
777 std::map<const FunctionSummary *, ValueInfo> FSToVIMap;
778
778
779 const ModuleSummaryIndex &Index;
780 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
781 isPrevailing;
782
783 // Saves/owns the callsite info structures synthesized for missing tail call
784 // frames that we discover while building the graph.
785 // It maps from the summary of the function making the tail call, to a map
786 // of callee ValueInfo to corresponding synthesized callsite info.
787 std::unordered_map<FunctionSummary *,
788 std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
789 FunctionCalleesToSynthesizedCallsiteInfos;
790};
791} // namespace
792
793namespace llvm {
794template <>
795struct DenseMapInfo<typename CallsiteContextGraph<
796 ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
797 : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
798template <>
799struct DenseMapInfo<typename CallsiteContextGraph<
800 IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
801 : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
802template <>
803struct DenseMapInfo<IndexCall>
804 : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
805} // end namespace llvm
806
807namespace {
808
809struct FieldSeparator {
810 bool Skip = true;
811 const char *Sep;
812
813 FieldSeparator(const char *Sep = ", ") : Sep(Sep) {}
814};
815
816raw_ostream &operator<<(raw_ostream &OS, FieldSeparator &FS) {
817 if (FS.Skip) {
818 FS.Skip = false;
819 return OS;
820 }
821 return OS << FS.Sep;
822}
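// Usage sketch (illustrative): "FieldSeparator FS; OS << FS << A << FS << B;"
// prints A and B separated by ", " with no leading separator.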
823
824// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
825// type we should actually use on the corresponding allocation.
826// If we can't clone a node that has NotCold+Cold alloc type, we will fall
827// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
828// from NotCold.
829AllocationType allocTypeToUse(uint8_t AllocTypes) {
830 assert(AllocTypes != (uint8_t)AllocationType::None);
831 if (AllocTypes ==
832 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
833 return AllocationType::NotCold;
834 else
835 return (AllocationType)AllocTypes;
836}
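// E.g. (illustrative): allocTypeToUse(Cold) == Cold, while
// allocTypeToUse(NotCold | Cold) == NotCold, since an allocation that is
// still ambiguous is left not-cold rather than cloned further.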
837
838// Helper to check if the alloc types for all edges recorded in the
839// InAllocTypes vector match the alloc types for all edges in the Edges
840// vector.
841template <typename DerivedCCG, typename FuncTy, typename CallTy>
842bool allocTypesMatch(
843 const std::vector<uint8_t> &InAllocTypes,
844 const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
845 &Edges) {
846 return std::equal(
847 InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(),
848 [](const uint8_t &l,
849 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
850 // Can share if one of the edges is None type - don't
851 // care about the type along that edge as it doesn't
852 // exist for those context ids.
853 if (l == (uint8_t)AllocationType::None ||
854 r->AllocTypes == (uint8_t)AllocationType::None)
855 return true;
856 return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
857 });
858}
859
860} // end anonymous namespace
861
862template <typename DerivedCCG, typename FuncTy, typename CallTy>
863typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
864CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
865 const CallInfo &C) {
866 ContextNode *Node = getNodeForAlloc(C);
867 if (Node)
868 return Node;
869
870 return NonAllocationCallToContextNodeMap.lookup(C);
871}
872
873template <typename DerivedCCG, typename FuncTy, typename CallTy>
874typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
875CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
876 const CallInfo &C) {
877 return AllocationCallToContextNodeMap.lookup(C);
878}
879
880template <typename DerivedCCG, typename FuncTy, typename CallTy>
881typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
882CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
883 uint64_t StackId) {
884 auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
885 if (StackEntryNode != StackEntryIdToContextNodeMap.end())
886 return StackEntryNode->second;
887 return nullptr;
888}
889
890template <typename DerivedCCG, typename FuncTy, typename CallTy>
891void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
892 addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
893 unsigned int ContextId) {
894 for (auto &Edge : CallerEdges) {
895 if (Edge->Caller == Caller) {
896 Edge->AllocTypes |= (uint8_t)AllocType;
897 Edge->getContextIds().insert(ContextId);
898 return;
899 }
900 }
901 std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
902 this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
903 CallerEdges.push_back(Edge);
904 Caller->CalleeEdges.push_back(Edge);
905}
906
907template <typename DerivedCCG, typename FuncTy, typename CallTy>
908void CallsiteContextGraph<
909 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
910 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
911 auto Edge = *EI;
912 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
913 assert(Edge->ContextIds.empty());
914 Edge->Callee->eraseCallerEdge(Edge.get());
915 EI = Node->CalleeEdges.erase(EI);
916 } else
917 ++EI;
918 }
919}
920
921template <typename DerivedCCG, typename FuncTy, typename CallTy>
922typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
923CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
924 findEdgeFromCallee(const ContextNode *Callee) {
925 for (const auto &Edge : CalleeEdges)
926 if (Edge->Callee == Callee)
927 return Edge.get();
928 return nullptr;
929}
930
931template <typename DerivedCCG, typename FuncTy, typename CallTy>
932typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
933CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
934 findEdgeFromCaller(const ContextNode *Caller) {
935 for (const auto &Edge : CallerEdges)
936 if (Edge->Caller == Caller)
937 return Edge.get();
938 return nullptr;
939}
940
941template <typename DerivedCCG, typename FuncTy, typename CallTy>
942void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
943 eraseCalleeEdge(const ContextEdge *Edge) {
944 auto EI = llvm::find_if(
945 CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
946 return CalleeEdge.get() == Edge;
947 });
948 assert(EI != CalleeEdges.end());
949 CalleeEdges.erase(EI);
950}
951
952template <typename DerivedCCG, typename FuncTy, typename CallTy>
953void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
954 eraseCallerEdge(const ContextEdge *Edge) {
955 auto EI = llvm::find_if(
956 CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
957 return CallerEdge.get() == Edge;
958 });
959 assert(EI != CallerEdges.end());
960 CallerEdges.erase(EI);
961}
962
963template <typename DerivedCCG, typename FuncTy, typename CallTy>
964uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
965 DenseSet<uint32_t> &ContextIds) {
966 uint8_t BothTypes =
967 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
968 uint8_t AllocType = (uint8_t)AllocationType::None;
969 for (auto Id : ContextIds) {
970 AllocType |= (uint8_t)ContextIdToAllocationType[Id];
971 // Bail early if alloc type reached both, no further refinement.
972 if (AllocType == BothTypes)
973 return AllocType;
974 }
975 return AllocType;
976}
977
978template <typename DerivedCCG, typename FuncTy, typename CallTy>
979uint8_t
980CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
981 const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
982 uint8_t BothTypes =
983 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
984 uint8_t AllocType = (uint8_t)AllocationType::None;
985 for (auto Id : Node1Ids) {
986 if (!Node2Ids.count(Id))
987 continue;
988 AllocType |= (uint8_t)ContextIdToAllocationType[Id];
989 // Bail early if alloc type reached both, no further refinement.
990 if (AllocType == BothTypes)
991 return AllocType;
992 }
993 return AllocType;
994}
995
996template <typename DerivedCCG, typename FuncTy, typename CallTy>
997uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
998 const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
999 if (Node1Ids.size() < Node2Ids.size())
1000 return intersectAllocTypesImpl(Node1Ids, Node2Ids);
1001 else
1002 return intersectAllocTypesImpl(Node2Ids, Node1Ids);
1003}
1004
1005template <typename DerivedCCG, typename FuncTy, typename CallTy>
1006typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1007CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
1008 CallInfo Call, const FuncTy *F) {
1009 assert(!getNodeForAlloc(Call));
1010 NodeOwner.push_back(
1011 std::make_unique<ContextNode>(/*IsAllocation=*/true, Call));
1012 ContextNode *AllocNode = NodeOwner.back().get();
1013 AllocationCallToContextNodeMap[Call] = AllocNode;
1014 NodeToCallingFunc[AllocNode] = F;
1015 // Use LastContextId as a unique id for MIB allocation nodes.
1016 AllocNode->OrigStackOrAllocId = LastContextId;
1017 // Alloc type should be updated as we add in the MIBs. We should assert
1018 // afterwards that it is not still None.
1019 AllocNode->AllocTypes = (uint8_t)AllocationType::None;
1020
1021 return AllocNode;
1022}
1023
1024static std::string getAllocTypeString(uint8_t AllocTypes) {
1025 if (!AllocTypes)
1026 return "None";
1027 std::string Str;
1028 if (AllocTypes & (uint8_t)AllocationType::NotCold)
1029 Str += "NotCold";
1030 if (AllocTypes & (uint8_t)AllocationType::Cold)
1031 Str += "Cold";
1032 return Str;
1033}
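// E.g. (illustrative): getAllocTypeString((uint8_t)AllocationType::NotCold |
//                                         (uint8_t)AllocationType::Cold)
// returns "NotColdCold".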
1034
1035template <typename DerivedCCG, typename FuncTy, typename CallTy>
1036template <class NodeT, class IteratorT>
1037void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
1038 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
1039 CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
1040 uint64_t TotalSize) {
1041 assert(!MemProfReportHintedSizes || TotalSize > 0);
1042 // Treat the hot alloc type as NotCold until the disambiguation for "hot"
1043 // is done.
1044 if (AllocType == AllocationType::Hot)
1045 AllocType = AllocationType::NotCold;
1046
1047 ContextIdToAllocationType[++LastContextId] = AllocType;
1048
1049 if (MemProfReportHintedSizes) {
1050 assert(TotalSize);
1051 ContextIdToTotalSize[LastContextId] = TotalSize;
1052 }
1053
1054 // Update alloc type and context ids for this MIB.
1055 AllocNode->AllocTypes |= (uint8_t)AllocType;
1056
1057 // Now add or update nodes for each stack id in alloc's context.
1058 // Later when processing the stack ids on non-alloc callsites we will adjust
1059 // for any inlining in the context.
1060 ContextNode *PrevNode = AllocNode;
1061 // Look for recursion (direct recursion should have been collapsed by
1062 // module summary analysis, here we should just be detecting mutual
1063 // recursion). Mark these nodes so we don't try to clone.
1064 SmallSet<uint64_t, 8> StackIdSet;
1065 // Skip any stack ids already on the allocation call itself (due to inlining).
1066 for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
1067 ContextIter != StackContext.end(); ++ContextIter) {
1068 auto StackId = getStackId(*ContextIter);
1069 ContextNode *StackNode = getNodeForStackId(StackId);
1070 if (!StackNode) {
1071 NodeOwner.push_back(
1072 std::make_unique<ContextNode>(/*IsAllocation=*/false));
1073 StackNode = NodeOwner.back().get();
1074 StackEntryIdToContextNodeMap[StackId] = StackNode;
1075 StackNode->OrigStackOrAllocId = StackId;
1076 }
1077 auto Ins = StackIdSet.insert(StackId);
1078 if (!Ins.second)
1079 StackNode->Recursive = true;
1080 StackNode->AllocTypes |= (uint8_t)AllocType;
1081 PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
1082 PrevNode = StackNode;
1083 }
1084}
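// Illustrative walk-through: for an MIB whose pruned stack is <f, g> above the
// allocation and whose type is Cold, a new context id N is assigned; nodes for
// f and g are created (or reused), and the alloc<-f and f<-g caller edges each
// gain context id N with the Cold bit OR'ed into their alloc types.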
1085
1086template <typename DerivedCCG, typename FuncTy, typename CallTy>
1087DenseSet<uint32_t>
1088CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
1089 const DenseSet<uint32_t> &StackSequenceContextIds,
1090 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1091 DenseSet<uint32_t> NewContextIds;
1092 for (auto OldId : StackSequenceContextIds) {
1093 NewContextIds.insert(++LastContextId);
1094 OldToNewContextIds[OldId].insert(LastContextId);
1095 assert(ContextIdToAllocationType.count(OldId));
1096 // The new context has the same allocation type as original.
1097 ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
1098 // For now set this to 0 so we don't duplicate sizes. Not clear how to divvy
1099 // up the size. Assume that if we are able to duplicate context ids that we
1100 // will be able to disambiguate all copies.
1101 ContextIdToTotalSize[LastContextId] = 0;
1102 }
1103 return NewContextIds;
1104}
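// For example (illustrative): duplicating {3, 7} when LastContextId == 10
// returns {11, 12}, with OldToNewContextIds mapping each of 3 and 7 to its
// newly created id, and each new id inheriting the allocation type of the id
// it was copied from.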
1105
1106template <typename DerivedCCG, typename FuncTy, typename CallTy>
1107void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1108 propagateDuplicateContextIds(
1109 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1110 // Build a set of duplicated context ids corresponding to the input id set.
1111 auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
1112 DenseSet<uint32_t> NewIds;
1113 for (auto Id : ContextIds)
1114 if (auto NewId = OldToNewContextIds.find(Id);
1115 NewId != OldToNewContextIds.end())
1116 NewIds.insert(NewId->second.begin(), NewId->second.end());
1117 return NewIds;
1118 };
1119
1120 // Recursively update context ids sets along caller edges.
1121 auto UpdateCallers = [&](ContextNode *Node,
1123 auto &&UpdateCallers) -> void {
1124 for (const auto &Edge : Node->CallerEdges) {
1125 auto Inserted = Visited.insert(Edge.get());
1126 if (!Inserted.second)
1127 continue;
1128 ContextNode *NextNode = Edge->Caller;
1129 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
1130 // Only need to recursively iterate to NextNode via this caller edge if
1131 // it resulted in any added ids to NextNode.
1132 if (!NewIdsToAdd.empty()) {
1133 Edge->getContextIds().insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
1134 UpdateCallers(NextNode, Visited, UpdateCallers);
1135 }
1136 }
1137 };
1138
1140 for (auto &Entry : AllocationCallToContextNodeMap) {
1141 auto *Node = Entry.second;
1142 UpdateCallers(Node, Visited, UpdateCallers);
1143 }
1144}
1145
1146template <typename DerivedCCG, typename FuncTy, typename CallTy>
1147void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
1148 ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee,
1149 // This must be passed by value to make a copy since it will be adjusted
1150 // as ids are moved.
1151 DenseSet<uint32_t> RemainingContextIds) {
1152 auto &OrigEdges =
1153 TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
1154 // Increment iterator in loop so that we can remove edges as needed.
1155 for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
1156 auto Edge = *EI;
1157 // Remove any matching context ids from Edge and return the set that were
1158 // found and removed; these become the new edge's context ids. Also update
1159 // the remaining (not found) ids.
1160 DenseSet<uint32_t> NewEdgeContextIds, NotFoundContextIds;
1161 set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
1162 NotFoundContextIds);
1163 RemainingContextIds.swap(NotFoundContextIds);
1164 // If no matching context ids for this edge, skip it.
1165 if (NewEdgeContextIds.empty()) {
1166 ++EI;
1167 continue;
1168 }
1169 if (TowardsCallee) {
1170 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1171 auto NewEdge = std::make_shared<ContextEdge>(
1172 Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds));
1173 NewNode->CalleeEdges.push_back(NewEdge);
1174 NewEdge->Callee->CallerEdges.push_back(NewEdge);
1175 } else {
1176 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1177 auto NewEdge = std::make_shared<ContextEdge>(
1178 NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds));
1179 NewNode->CallerEdges.push_back(NewEdge);
1180 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
1181 }
1182 // Remove old edge if context ids empty.
1183 if (Edge->getContextIds().empty()) {
1184 if (TowardsCallee) {
1185 Edge->Callee->eraseCallerEdge(Edge.get());
1186 EI = OrigNode->CalleeEdges.erase(EI);
1187 } else {
1188 Edge->Caller->eraseCalleeEdge(Edge.get());
1189 EI = OrigNode->CallerEdges.erase(EI);
1190 }
1191 continue;
1192 }
1193 ++EI;
1194 }
1195}
1196
1197template <typename DerivedCCG, typename FuncTy, typename CallTy>
1198static void checkEdge(
1199 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
1200 // Confirm that alloc type is not None and that we have at least one context
1201 // id.
1202 assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
1203 assert(!Edge->ContextIds.empty());
1204}
1205
1206template <typename DerivedCCG, typename FuncTy, typename CallTy>
1207static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
1208 bool CheckEdges = true) {
1209 if (Node->isRemoved())
1210 return;
1211#ifndef NDEBUG
1212 // Compute node's context ids once for use in asserts.
1213 auto NodeContextIds = Node->getContextIds();
1214#endif
1215 // Node's context ids should be the union of both its callee and caller edge
1216 // context ids.
1217 if (Node->CallerEdges.size()) {
1218 DenseSet<uint32_t> CallerEdgeContextIds(
1219 Node->CallerEdges.front()->ContextIds);
1220 for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) {
1221 if (CheckEdges)
1222 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1223 set_union(CallerEdgeContextIds, Edge->ContextIds);
1224 }
1225 // Node can have more context ids than callers if some contexts terminate at
1226 // node and some are longer.
1227 assert(NodeContextIds == CallerEdgeContextIds ||
1228 set_is_subset(CallerEdgeContextIds, NodeContextIds));
1229 }
1230 if (Node->CalleeEdges.size()) {
1231 DenseSet<uint32_t> CalleeEdgeContextIds(
1232 Node->CalleeEdges.front()->ContextIds);
1233 for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) {
1234 if (CheckEdges)
1235 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1236 set_union(CalleeEdgeContextIds, Edge->getContextIds());
1237 }
1238 assert(NodeContextIds == CalleeEdgeContextIds);
1239 }
1240}
1241
1242template <typename DerivedCCG, typename FuncTy, typename CallTy>
1243void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1244 assignStackNodesPostOrder(
1245 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
1246 DenseMap<uint64_t, std::vector<CallContextInfo>>
1247 &StackIdToMatchingCalls,
1248 DenseMap<CallInfo, CallInfo> &CallToMatchingCall) {
1249 auto Inserted = Visited.insert(Node);
1250 if (!Inserted.second)
1251 return;
1252 // Post order traversal. Iterate over a copy since we may add nodes and
1253 // therefore new callers during the recursive call, invalidating any
1254 // iterator over the original edge vector. We don't need to process these
1255 // new nodes as they were already processed on creation.
1256 auto CallerEdges = Node->CallerEdges;
1257 for (auto &Edge : CallerEdges) {
1258 // Skip any that have been removed during the recursion.
1259 if (!Edge)
1260 continue;
1261 assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls,
1262 CallToMatchingCall);
1263 }
1264
1265 // If this node's stack id is in the map, update the graph to contain new
1266 // nodes representing any inlining at interior callsites. Note we move the
1267 // associated context ids over to the new nodes.
1268
1269 // Ignore this node if it is for an allocation or we didn't record any
1270 // stack id lists ending at it.
1271 if (Node->IsAllocation ||
1272 !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
1273 return;
1274
1275 auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
1276 // Handle the simple case first. A single call with a single stack id.
1277 // In this case there is no need to create any new context nodes, simply
1278 // assign the context node for stack id to this Call.
1279 if (Calls.size() == 1) {
1280 auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
1281 if (Ids.size() == 1) {
1282 assert(SavedContextIds.empty());
1283 // It should be this Node
1284 assert(Node == getNodeForStackId(Ids[0]));
1285 if (Node->Recursive)
1286 return;
1287 Node->setCall(Call);
1288 NonAllocationCallToContextNodeMap[Call] = Node;
1289 NodeToCallingFunc[Node] = Func;
1290 return;
1291 }
1292 }
1293
1294 // Find the node for the last stack id, which should be the same
1295 // across all calls recorded for this id, and is this node's id.
1296 uint64_t LastId = Node->OrigStackOrAllocId;
1297 ContextNode *LastNode = getNodeForStackId(LastId);
1298 // We should only have kept stack ids that had nodes.
1299 assert(LastNode);
1300
1301 for (unsigned I = 0; I < Calls.size(); I++) {
1302 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1303 // Skip any for which we didn't assign any ids, these don't get a node in
1304 // the graph.
1305 if (SavedContextIds.empty()) {
1306 // If this call has a matching call (located in the same function and
1307 // having the same stack ids), simply add it to the context node created
1308 // for its matching call earlier. These can be treated the same through
1309 // cloning and get updated at the same time.
1310 if (!CallToMatchingCall.contains(Call))
1311 continue;
1312 auto MatchingCall = CallToMatchingCall[Call];
1313 assert(NonAllocationCallToContextNodeMap.contains(MatchingCall));
1314 NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back(
1315 Call);
1316 continue;
1317 }
1318
1319 assert(LastId == Ids.back());
1320
1321 ContextNode *FirstNode = getNodeForStackId(Ids[0]);
1322 assert(FirstNode);
1323
1324 // Recompute the context ids for this stack id sequence (the
1325 // intersection of the context ids of the corresponding nodes).
1326 // Start with the ids we saved in the map for this call, which could be
1327 // duplicated context ids. We have to recompute as we might have overlap
1328 // between the saved context ids for different last nodes, and
1329 // removed them already during the post order traversal.
1330 set_intersect(SavedContextIds, FirstNode->getContextIds());
1331 ContextNode *PrevNode = nullptr;
1332 for (auto Id : Ids) {
1333 ContextNode *CurNode = getNodeForStackId(Id);
1334 // We should only have kept stack ids that had nodes and weren't
1335 // recursive.
1336 assert(CurNode);
1337 assert(!CurNode->Recursive);
1338 if (!PrevNode) {
1339 PrevNode = CurNode;
1340 continue;
1341 }
1342 auto *Edge = CurNode->findEdgeFromCallee(PrevNode);
1343 if (!Edge) {
1344 SavedContextIds.clear();
1345 break;
1346 }
1347 PrevNode = CurNode;
1348 set_intersect(SavedContextIds, Edge->getContextIds());
1349
1350 // If we now have no context ids for clone, skip this call.
1351 if (SavedContextIds.empty())
1352 break;
1353 }
1354 if (SavedContextIds.empty())
1355 continue;
1356
1357 // Create new context node.
1358 NodeOwner.push_back(
1359 std::make_unique<ContextNode>(/*IsAllocation=*/false, Call));
1360 ContextNode *NewNode = NodeOwner.back().get();
1361 NodeToCallingFunc[NewNode] = Func;
1362 NonAllocationCallToContextNodeMap[Call] = NewNode;
1363 NewNode->AllocTypes = computeAllocType(SavedContextIds);
1364
1365 // Connect to callees of innermost stack frame in inlined call chain.
1366 // This updates context ids for FirstNode's callees to reflect those
1367 // moved to NewNode.
1368 connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true, SavedContextIds);
1369
1370 // Connect to callers of outermost stack frame in inlined call chain.
1371 // This updates context ids for LastNode's callers to reflect those
1372 // moved to NewNode.
1373 connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false, SavedContextIds);
1374
1375 // Now we need to remove context ids from edges/nodes between First and
1376 // Last Node.
1377 PrevNode = nullptr;
1378 for (auto Id : Ids) {
1379 ContextNode *CurNode = getNodeForStackId(Id);
1380 // We should only have kept stack ids that had nodes.
1381 assert(CurNode);
1382
1383 // Remove the context ids moved to NewNode from CurNode, and the
1384 // edge from the prior node.
1385 if (PrevNode) {
1386 auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
1387 assert(PrevEdge);
1388 set_subtract(PrevEdge->getContextIds(), SavedContextIds);
1389 if (PrevEdge->getContextIds().empty()) {
1390 PrevNode->eraseCallerEdge(PrevEdge);
1391 CurNode->eraseCalleeEdge(PrevEdge);
1392 }
1393 }
1394 // Since we update the edges from leaf to tail, only look at the callee
1395 // edges. This isn't an alloc node, so if there are no callee edges, the
1396 // alloc type is None.
1397 CurNode->AllocTypes = CurNode->CalleeEdges.empty()
1398 ? (uint8_t)AllocationType::None
1399 : CurNode->computeAllocType();
1400 PrevNode = CurNode;
1401 }
1402 if (VerifyNodes) {
1403 checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true);
1404 for (auto Id : Ids) {
1405 ContextNode *CurNode = getNodeForStackId(Id);
1406 // We should only have kept stack ids that had nodes.
1407 assert(CurNode);
1408 checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true);
1409 }
1410 }
1411 }
1412}
1413
1414template <typename DerivedCCG, typename FuncTy, typename CallTy>
1415void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
1416 // Map of stack id to all calls with that as the last (outermost caller)
1417 // callsite id that has a context node (some might not due to pruning
1418 // performed during matching of the allocation profile contexts).
1419 // The CallContextInfo contains the Call and a list of its stack ids with
1420 // ContextNodes, the function containing Call, and the set of context ids
1421 // the analysis will eventually identify for use in any new node created
1422 // for that callsite.
1423 DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
1424 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
1425 for (auto &Call : CallsWithMetadata) {
1426 // Ignore allocations, already handled.
1427 if (AllocationCallToContextNodeMap.count(Call))
1428 continue;
1429 auto StackIdsWithContextNodes =
1430 getStackIdsWithContextNodesForCall(Call.call());
1431 // If there were no nodes created for MIBs on allocs (maybe this was in
1432 // the unambiguous part of the MIB stack that was pruned), ignore.
1433 if (StackIdsWithContextNodes.empty())
1434 continue;
1435 // Otherwise, record this Call along with the list of ids for the last
1436 // (outermost caller) stack id with a node.
1437 StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
1438 {Call.call(), StackIdsWithContextNodes, Func, {}});
1439 }
1440 }
1441
1442 // First make a pass through all stack ids that correspond to a call,
1443 // as identified in the above loop. Compute the context ids corresponding to
1444 // each of these calls when they correspond to multiple stack ids due
1445 // to inlining. Perform any duplication of context ids required when
1446 // there is more than one call with the same stack ids. Their (possibly newly
1447 // duplicated) context ids are saved in the StackIdToMatchingCalls map.
1448 DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
1449 // Save a map from each call to any that are found to match it. I.e. located
1450 // in the same function and have the same (possibly pruned) stack ids. We use
1451 // this to avoid creating extra graph nodes as they can be treated the same.
1452 DenseMap<CallInfo, CallInfo> CallToMatchingCall;
1453 for (auto &It : StackIdToMatchingCalls) {
1454 auto &Calls = It.getSecond();
1455 // Skip single calls with a single stack id. These don't need a new node.
1456 if (Calls.size() == 1) {
1457 auto &Ids = std::get<1>(Calls[0]);
1458 if (Ids.size() == 1)
1459 continue;
1460 }
1461 // In order to do the best and maximal matching of inlined calls to context
1462 // node sequences we will sort the vectors of stack ids in descending order
1463 // of length, and within each length, lexicographically by stack id. The
1464 // latter is so that we can specially handle calls that have identical stack
1465 // id sequences (either due to cloning or artificially because of the MIB
1466 // context pruning).
1467 std::stable_sort(Calls.begin(), Calls.end(),
1468 [](const CallContextInfo &A, const CallContextInfo &B) {
1469 auto &IdsA = std::get<1>(A);
1470 auto &IdsB = std::get<1>(B);
1471 return IdsA.size() > IdsB.size() ||
1472 (IdsA.size() == IdsB.size() && IdsA < IdsB);
1473 });
1474
1475 // Find the node for the last stack id, which should be the same
1476 // across all calls recorded for this id, and is the id for this
1477 // entry in the StackIdToMatchingCalls map.
1478 uint64_t LastId = It.getFirst();
1479 ContextNode *LastNode = getNodeForStackId(LastId);
1480 // We should only have kept stack ids that had nodes.
1481 assert(LastNode);
1482
1483 if (LastNode->Recursive)
1484 continue;
1485
1486 // Initialize the context ids with the last node's. We will subsequently
1487 // refine the context ids by computing the intersection along all edges.
1488 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1489 assert(!LastNodeContextIds.empty());
1490
1491 // Map from function to the first call from the below list (with matching
1492 // stack ids) found in that function. Note that calls from different
1493 // functions can have the same stack ids because this is the list of stack
1494 // ids that had (possibly pruned) nodes after building the graph from the
1495 // allocation MIBs.
1496 DenseMap<const FuncTy *, CallInfo> FuncToCallMap;
1497
1498 for (unsigned I = 0; I < Calls.size(); I++) {
1499 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1500 assert(SavedContextIds.empty());
1501 assert(LastId == Ids.back());
1502
1503 // First compute the context ids for this stack id sequence (the
1504 // intersection of the context ids of the corresponding nodes).
1505 // Start with the remaining saved ids for the last node.
1506 assert(!LastNodeContextIds.empty());
1507 DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
1508
1509 ContextNode *PrevNode = LastNode;
1510 ContextNode *CurNode = LastNode;
1511 bool Skip = false;
1512
1513 // Iterate backwards through the stack Ids, starting after the last Id
1514 // in the list, which was handled once outside for all Calls.
1515 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1516 auto Id = *IdIter;
1517 CurNode = getNodeForStackId(Id);
1518 // We should only have kept stack ids that had nodes.
1519 assert(CurNode);
1520
1521 if (CurNode->Recursive) {
1522 Skip = true;
1523 break;
1524 }
1525
1526 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1527 // If there is no edge then the nodes belong to different MIB contexts,
1528 // and we should skip this inlined context sequence. For example, this
1529 // particular inlined context may include stack ids A->B, and we may
1530 // indeed have nodes for both A and B, but it is possible that they were
1531 // never profiled in sequence in a single MIB for any allocation (i.e.
1532 // we might have profiled an allocation that involves the callsite A,
1533 // but through a different one of its callee callsites, and we might
1534 // have profiled an allocation that involves callsite B, but reached
1535 // from a different caller callsite).
1536 if (!Edge) {
1537 Skip = true;
1538 break;
1539 }
1540 PrevNode = CurNode;
1541
1542 // Update the context ids, which is the intersection of the ids along
1543 // all edges in the sequence.
1544 set_intersect(StackSequenceContextIds, Edge->getContextIds());
1545
1546 // If we now have no context ids for the clone, skip this call.
1547 if (StackSequenceContextIds.empty()) {
1548 Skip = true;
1549 break;
1550 }
1551 }
1552 if (Skip)
1553 continue;
1554
1555 // If some of this call's stack ids did not have corresponding nodes (due
1556 // to pruning), don't include any context ids for contexts that extend
1557 // beyond these nodes. Otherwise we would be matching part of unrelated /
1558 // not fully matching stack contexts. To do this, subtract any context ids
1559 // found in caller nodes of the last node found above.
1560 if (Ids.back() != getLastStackId(Call)) {
1561 for (const auto &PE : LastNode->CallerEdges) {
1562 set_subtract(StackSequenceContextIds, PE->getContextIds());
1563 if (StackSequenceContextIds.empty())
1564 break;
1565 }
1566 // If we now have no context ids for the clone, skip this call.
1567 if (StackSequenceContextIds.empty())
1568 continue;
1569 }
1570
1571 const FuncTy *CallFunc = CallToFunc[Call];
1572
1573 // If the prior call had the same stack ids this map would not be empty.
1574 // Check if we already have a call that "matches" because it is located
1575 // in the same function.
1576 if (FuncToCallMap.contains(CallFunc)) {
1577 // Record the matching call found for this call, and skip it. We
1578 // will subsequently combine it into the same node.
1579 CallToMatchingCall[Call] = FuncToCallMap[CallFunc];
1580 continue;
1581 }
1582
1583 // Check if the next set of stack ids is the same (since the Calls vector
1584 // of tuples is sorted by the stack ids we can just look at the next one).
1585 bool DuplicateContextIds = false;
1586 if (I + 1 < Calls.size()) {
1587 auto NextIds = std::get<1>(Calls[I + 1]);
1588 DuplicateContextIds = Ids == NextIds;
1589 }
1590
1591 // If we don't have duplicate context ids, then we can assign all the
1592 // context ids computed for the original node sequence to this call.
1593 // If there are duplicate calls with the same stack ids then we synthesize
1594 // new context ids that are duplicates of the originals. These are
1595 // assigned to SavedContextIds, which is a reference into the map entry
1596 // for this call, allowing us to access these ids later on.
1597 OldToNewContextIds.reserve(OldToNewContextIds.size() +
1598 StackSequenceContextIds.size());
1599 SavedContextIds =
1600 DuplicateContextIds
1601 ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
1602 : StackSequenceContextIds;
1603 assert(!SavedContextIds.empty());
1604
1605 if (!DuplicateContextIds) {
1606 // Update saved last node's context ids to remove those that are
1607 // assigned to other calls, so that it is ready for the next call at
1608 // this stack id.
1609 set_subtract(LastNodeContextIds, StackSequenceContextIds);
1610 if (LastNodeContextIds.empty())
1611 break;
1612 // No longer possibly in a sequence of calls with duplicate stack ids,
1613 // clear the map.
1614 FuncToCallMap.clear();
1615 } else
1616 // Record the call with its function, so we can locate it the next time
1617 // we find a call from this function when processing the calls with the
1618 // same stack ids.
1619 FuncToCallMap[CallFunc] = Call;
1620 }
1621 }
1622
1623 // Propagate the duplicate context ids over the graph.
1624 propagateDuplicateContextIds(OldToNewContextIds);
1625
1626 if (VerifyCCG)
1627 check();
1628
1629 // Now perform a post-order traversal over the graph, starting with the
1630 // allocation nodes, essentially processing nodes from callers to callees.
1631 // For any that contains an id in the map, update the graph to contain new
1632 // nodes representing any inlining at interior callsites. Note we move the
1633 // associated context ids over to the new nodes.
1634 DenseSet<const ContextNode *> Visited;
1635 for (auto &Entry : AllocationCallToContextNodeMap)
1636 assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls,
1637 CallToMatchingCall);
1638 if (VerifyCCG)
1639 check();
1640}
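For reference, a minimal sketch of the ordering produced by the stable_sort in updateStackNodes above, re-stated over plain std::vector rather than the CallContextInfo tuples: longer stack-id sequences sort first, and equal-length sequences compare lexicographically so identical ones end up adjacent:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  std::vector<std::vector<uint64_t>> Ids = {{7, 8}, {1, 2, 3}, {7, 8}, {4, 9}};
  std::stable_sort(Ids.begin(), Ids.end(),
                   [](const std::vector<uint64_t> &A,
                      const std::vector<uint64_t> &B) {
                     return A.size() > B.size() ||
                            (A.size() == B.size() && A < B);
                   });
  // Longest first, then lexicographic; the duplicate {7, 8} entries are now
  // adjacent, which is what the duplicate-context-id handling relies on.
  std::vector<std::vector<uint64_t>> Expected = {{1, 2, 3}, {4, 9}, {7, 8}, {7, 8}};
  assert(Ids == Expected);
  return 0;
}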
1641
1642uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
1643 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
1644 Call->getMetadata(LLVMContext::MD_callsite));
1645 return CallsiteContext.back();
1646}
1647
1648uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
1649 assert(isa<CallsiteInfo *>(Call.getBase()));
1650 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
1651 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
1652 // Need to convert index into stack id.
1653 return Index.getStackIdAtIndex(CallsiteContext.back());
1654}
1655
1656static const std::string MemProfCloneSuffix = ".memprof.";
1657
1658static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
1659 // We use CloneNo == 0 to refer to the original version, which doesn't get
1660 // renamed with a suffix.
1661 if (!CloneNo)
1662 return Base.str();
1663 return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
1664}
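As a quick illustration of the naming scheme above, a hypothetical standalone re-statement (std::string instead of Twine) behaves as follows:

#include <cassert>
#include <string>

// Illustration only: clone 0 keeps the original name; clone N gets the
// ".memprof.N" suffix, matching getMemProfFuncName above.
static std::string exampleCloneName(const std::string &Base, unsigned CloneNo) {
  return CloneNo ? Base + ".memprof." + std::to_string(CloneNo) : Base;
}

int main() {
  assert(exampleCloneName("foo", 0) == "foo");
  assert(exampleCloneName("foo", 2) == "foo.memprof.2");
  return 0;
}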
1665
1666std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
1667 const Instruction *Call,
1668 unsigned CloneNo) const {
1669 return (Twine(Call->getFunction()->getName()) + " -> " +
1670 cast<CallBase>(Call)->getCalledFunction()->getName())
1671 .str();
1672}
1673
1674std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
1675 const IndexCall &Call,
1676 unsigned CloneNo) const {
1677 auto VI = FSToVIMap.find(Func);
1678 assert(VI != FSToVIMap.end());
1679 if (isa<AllocInfo *>(Call.getBase()))
1680 return (VI->second.name() + " -> alloc").str();
1681 else {
1682 auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call.getBase());
1683 return (VI->second.name() + " -> " +
1684 getMemProfFuncName(Callsite->Callee.name(),
1685 Callsite->Clones[CloneNo]))
1686 .str();
1687 }
1688}
1689
1690std::vector<uint64_t>
1691ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
1692 Instruction *Call) {
1693 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
1694 Call->getMetadata(LLVMContext::MD_callsite));
1695 return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
1696 CallsiteContext);
1697}
1698
1699std::vector<uint64_t>
1700IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
1701 assert(isa<CallsiteInfo *>(Call.getBase()));
1702 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
1703 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
1704 return getStackIdsWithContextNodes<CallsiteInfo,
1705 SmallVector<unsigned>::const_iterator>(
1706 CallsiteContext);
1707}
1708
1709template <typename DerivedCCG, typename FuncTy, typename CallTy>
1710template <class NodeT, class IteratorT>
1711std::vector<uint64_t>
1712CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
1713 CallStack<NodeT, IteratorT> &CallsiteContext) {
1714 std::vector<uint64_t> StackIds;
1715 for (auto IdOrIndex : CallsiteContext) {
1716 auto StackId = getStackId(IdOrIndex);
1717 ContextNode *Node = getNodeForStackId(StackId);
1718 if (!Node)
1719 break;
1720 StackIds.push_back(StackId);
1721 }
1722 return StackIds;
1723}
1724
1725ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
1726 Module &M,
1727 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
1728 : Mod(M), OREGetter(OREGetter) {
1729 for (auto &F : M) {
1730 std::vector<CallInfo> CallsWithMetadata;
1731 for (auto &BB : F) {
1732 for (auto &I : BB) {
1733 if (!isa<CallBase>(I))
1734 continue;
1735 if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
1736 CallsWithMetadata.push_back(&I);
1737 CallToFunc[&I] = &F;
1738 auto *AllocNode = addAllocNode(&I, &F);
1739 auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
1740 assert(CallsiteMD);
1741 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
1742 // Add all of the MIBs and their stack nodes.
1743 for (auto &MDOp : MemProfMD->operands()) {
1744 auto *MIBMD = cast<const MDNode>(MDOp);
1745 MDNode *StackNode = getMIBStackNode(MIBMD);
1746 assert(StackNode);
1747 CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
1748 addStackNodesForMIB<MDNode, MDNode::op_iterator>(
1749 AllocNode, StackContext, CallsiteContext,
1750 getMIBAllocType(MIBMD), getMIBTotalSize(MIBMD));
1751 }
1752 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
1753 // Memprof and callsite metadata on memory allocations no longer
1754 // needed.
1755 I.setMetadata(LLVMContext::MD_memprof, nullptr);
1756 I.setMetadata(LLVMContext::MD_callsite, nullptr);
1757 }
1758 // For callsite metadata, add to list for this function for later use.
1759 else if (I.getMetadata(LLVMContext::MD_callsite)) {
1760 CallsWithMetadata.push_back(&I);
1761 CallToFunc[&I] = &F;
1762 }
1763 }
1764 }
1765 if (!CallsWithMetadata.empty())
1766 FuncToCallsWithMetadata[&F] = CallsWithMetadata;
1767 }
1768
1769 if (DumpCCG) {
1770 dbgs() << "CCG before updating call stack chains:\n";
1771 dbgs() << *this;
1772 }
1773
1774 if (ExportToDot)
1775 exportToDot("prestackupdate");
1776
1777 updateStackNodes();
1778
1779 handleCallsitesWithMultipleTargets();
1780
1781 // Strip off remaining callsite metadata, no longer needed.
1782 for (auto &FuncEntry : FuncToCallsWithMetadata)
1783 for (auto &Call : FuncEntry.second)
1784 Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
1785}
1786
1787IndexCallsiteContextGraph::IndexCallsiteContextGraph(
1788 ModuleSummaryIndex &Index,
1789 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
1790 isPrevailing)
1791 : Index(Index), isPrevailing(isPrevailing) {
1792 for (auto &I : Index) {
1793 auto VI = Index.getValueInfo(I);
1794 for (auto &S : VI.getSummaryList()) {
1795 // We should only add the prevailing nodes. Otherwise we may try to clone
1796 // in a weak copy that won't be linked (and may be different than the
1797 // prevailing version).
1798 // We only keep the memprof summary on the prevailing copy now when
1799 // building the combined index, as a space optimization; however, don't
1800 // rely on this optimization. The linker doesn't resolve local linkage
1801 // values, so don't check whether those are prevailing.
1802 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
1803 !isPrevailing(VI.getGUID(), S.get()))
1804 continue;
1805 auto *FS = dyn_cast<FunctionSummary>(S.get());
1806 if (!FS)
1807 continue;
1808 std::vector<CallInfo> CallsWithMetadata;
1809 if (!FS->allocs().empty()) {
1810 for (auto &AN : FS->mutableAllocs()) {
1811 // This can happen because of recursion elimination handling that
1812 // currently exists in ModuleSummaryAnalysis. Skip these for now.
1813 // We still added them to the summary because we need to be able to
1814 // correlate properly in applyImport in the backends.
1815 if (AN.MIBs.empty())
1816 continue;
1817 IndexCall AllocCall(&AN);
1818 CallsWithMetadata.push_back(AllocCall);
1819 CallToFunc[AllocCall] = FS;
1820 auto *AllocNode = addAllocNode(AllocCall, FS);
1821 // Pass an empty CallStack to the CallsiteContext (second)
1822 // parameter, since for ThinLTO we already collapsed out the inlined
1823 // stack ids on the allocation call during ModuleSummaryAnalysis.
1824 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
1825 EmptyContext;
1826 unsigned I = 0;
1827 assert(AN.TotalSizes.empty() ||
1828 AN.TotalSizes.size() == AN.MIBs.size());
1829 // Now add all of the MIBs and their stack nodes.
1830 for (auto &MIB : AN.MIBs) {
1831 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
1832 StackContext(&MIB);
1833 uint64_t TotalSize = 0;
1834 if (!AN.TotalSizes.empty())
1835 TotalSize = AN.TotalSizes[I];
1836 addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
1837 AllocNode, StackContext, EmptyContext, MIB.AllocType,
1838 TotalSize);
1839 I++;
1840 }
1841 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
1842 // Initialize version 0 on the summary alloc node to the current alloc
1843 // type, unless it has both types, in which case make it default, so
1844 // that if we aren't able to clone, the original version always ends up
1845 // with the default allocation behavior.
1846 AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
1847 }
1848 }
1849 // For callsite metadata, add to list for this function for later use.
1850 if (!FS->callsites().empty())
1851 for (auto &SN : FS->mutableCallsites()) {
1852 IndexCall StackNodeCall(&SN);
1853 CallsWithMetadata.push_back(StackNodeCall);
1854 CallToFunc[StackNodeCall] = FS;
1855 }
1856
1857 if (!CallsWithMetadata.empty())
1858 FuncToCallsWithMetadata[FS] = CallsWithMetadata;
1859
1860 if (!FS->allocs().empty() || !FS->callsites().empty())
1861 FSToVIMap[FS] = VI;
1862 }
1863 }
1864
1865 if (DumpCCG) {
1866 dbgs() << "CCG before updating call stack chains:\n";
1867 dbgs() << *this;
1868 }
1869
1870 if (ExportToDot)
1871 exportToDot("prestackupdate");
1872
1873 updateStackNodes();
1874
1875 handleCallsitesWithMultipleTargets();
1876}
1877
1878template <typename DerivedCCG, typename FuncTy, typename CallTy>
1879void CallsiteContextGraph<DerivedCCG, FuncTy,
1880 CallTy>::handleCallsitesWithMultipleTargets() {
1881 // Look for and work around callsites that call multiple functions.
1882 // This can happen for indirect calls, which need better handling, and in
1883 // rarer cases (e.g. macro expansion).
1884 // TODO: To fix this for indirect calls we will want to perform speculative
1885 // devirtualization using either the normal PGO info with ICP, or using the
1886 // information in the profiled MemProf contexts. We can do this prior to
1887 // this transformation for regular LTO, and for ThinLTO we can simulate that
1888 // effect in the summary and perform the actual speculative devirtualization
1889 // while cloning in the ThinLTO backend.
1890
1891 // Keep track of the new nodes synthesized for discovered tail calls missing
1892 // from the profiled contexts.
1893 MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;
1894
1895 for (auto &Entry : NonAllocationCallToContextNodeMap) {
1896 auto *Node = Entry.second;
1897 assert(Node->Clones.empty());
1898 // Check all node callees and see if in the same function.
1899 auto Call = Node->Call.call();
1900 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();
1901 ++EI) {
1902 auto Edge = *EI;
1903 if (!Edge->Callee->hasCall())
1904 continue;
1905 assert(NodeToCallingFunc.count(Edge->Callee));
1906 // Check if the called function matches that of the callee node.
1907 if (calleesMatch(Call, EI, TailCallToContextNodeMap))
1908 continue;
1909 RemovedEdgesWithMismatchedCallees++;
1910 // Work around by setting Node to have a null call, so it gets
1911 // skipped during cloning. Otherwise assignFunctions will assert
1912 // because its data structures are not designed to handle this case.
1913 Node->setCall(CallInfo());
1914 break;
1915 }
1916 }
1917
1918 // Remove all mismatched nodes identified in the above loop from the node map
1919 // (checking whether they have a null call which is set above). For a
1920 // MapVector like NonAllocationCallToContextNodeMap it is much more efficient
1921 // to do the removal via remove_if than by individually erasing entries above.
1922 NonAllocationCallToContextNodeMap.remove_if(
1923 [](const auto &it) { return !it.second->hasCall(); });
1924
1925 // Add the new nodes after the above loop so that the iteration is not
1926 // invalidated.
1927 for (auto &[Call, Node] : TailCallToContextNodeMap)
1928 NonAllocationCallToContextNodeMap[Call] = Node;
1929}
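The bulk removal above can be pictured with a small sketch; the container and predicate below are hypothetical stand-ins for NonAllocationCallToContextNodeMap and the null-call check, showing a single-pass removal rather than erasing entries one at a time:

#include <algorithm>
#include <cassert>
#include <string>
#include <utility>
#include <vector>

int main() {
  std::vector<std::pair<std::string, bool>> NodeMap = {
      {"a", true}, {"b", false}, {"c", true}}; // (call, hasCall)
  NodeMap.erase(std::remove_if(NodeMap.begin(), NodeMap.end(),
                               [](const auto &It) { return !It.second; }),
                NodeMap.end());
  assert(NodeMap.size() == 2 && NodeMap[0].first == "a" &&
         NodeMap[1].first == "c");
  return 0;
}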
1930
1931uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
1932 // In the Module (IR) case this is already the Id.
1933 return IdOrIndex;
1934}
1935
1936uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
1937 // In the Index case this is an index into the stack id list in the summary
1938 // index, convert it to an Id.
1939 return Index.getStackIdAtIndex(IdOrIndex);
1940}
1941
1942template <typename DerivedCCG, typename FuncTy, typename CallTy>
1943bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
1944 CallTy Call, EdgeIter &EI,
1945 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {
1946 auto Edge = *EI;
1947 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
1948 const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];
1949 // Will be populated in order of callee to caller if we find a chain of tail
1950 // calls between the profiled caller and callee.
1951 std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;
1952 if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc,
1953 FoundCalleeChain))
1954 return false;
1955
1956 // The usual case where the profiled callee matches that of the IR/summary.
1957 if (FoundCalleeChain.empty())
1958 return true;
1959
1960 auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {
1961 auto *CurEdge = Callee->findEdgeFromCaller(Caller);
1962 // If there is already an edge between these nodes, simply update it and
1963 // return.
1964 if (CurEdge) {
1965 CurEdge->ContextIds.insert(Edge->ContextIds.begin(),
1966 Edge->ContextIds.end());
1967 CurEdge->AllocTypes |= Edge->AllocTypes;
1968 return;
1969 }
1970 // Otherwise, create a new edge and insert it into the caller and callee
1971 // lists.
1972 auto NewEdge = std::make_shared<ContextEdge>(
1973 Callee, Caller, Edge->AllocTypes, Edge->ContextIds);
1974 Callee->CallerEdges.push_back(NewEdge);
1975 if (Caller == Edge->Caller) {
1976 // If we are inserting the new edge into the current edge's caller, insert
1977 // the new edge before the current iterator position, and then increment
1978 // back to the current edge.
1979 EI = Caller->CalleeEdges.insert(EI, NewEdge);
1980 ++EI;
1981 assert(*EI == Edge &&
1982 "Iterator position not restored after insert and increment");
1983 } else
1984 Caller->CalleeEdges.push_back(NewEdge);
1985 };
1986
1987 // Create new nodes for each found callee and connect in between the profiled
1988 // caller and callee.
1989 auto *CurCalleeNode = Edge->Callee;
1990 for (auto &[NewCall, Func] : FoundCalleeChain) {
1991 ContextNode *NewNode = nullptr;
1992 // First check if we have already synthesized a node for this tail call.
1993 if (TailCallToContextNodeMap.count(NewCall)) {
1994 NewNode = TailCallToContextNodeMap[NewCall];
1995 NewNode->AllocTypes |= Edge->AllocTypes;
1996 } else {
1997 FuncToCallsWithMetadata[Func].push_back({NewCall});
1998 // Create Node and record node info.
1999 NodeOwner.push_back(
2000 std::make_unique<ContextNode>(/*IsAllocation=*/false, NewCall));
2001 NewNode = NodeOwner.back().get();
2002 NodeToCallingFunc[NewNode] = Func;
2003 TailCallToContextNodeMap[NewCall] = NewNode;
2004 NewNode->AllocTypes = Edge->AllocTypes;
2005 }
2006
2007 // Hook up node to its callee node
2008 AddEdge(NewNode, CurCalleeNode);
2009
2010 CurCalleeNode = NewNode;
2011 }
2012
2013 // Hook up edge's original caller to new callee node.
2014 AddEdge(Edge->Caller, CurCalleeNode);
2015
2016 // Remove old edge
2017 Edge->Callee->eraseCallerEdge(Edge.get());
2018 EI = Edge->Caller->CalleeEdges.erase(EI);
2019
2020 // To simplify the increment of EI in the caller, subtract one from EI.
2021 // In the final AddEdge call we would have either added a new callee edge,
2022 // to Edge->Caller, or found an existing one. Either way we are guaranteed
2023 // that there is at least one callee edge.
2024 assert(!Edge->Caller->CalleeEdges.empty());
2025 --EI;
2026
2027 return true;
2028}
2029
2030bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2031 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
2032 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
2033 bool &FoundMultipleCalleeChains) {
2034 // Stop recursive search if we have already explored the maximum specified
2035 // depth.
2036 if (Depth > TailCallSearchDepth)
2037 return false;
2038
2039 auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {
2040 FoundCalleeChain.push_back({Callsite, F});
2041 };
2042
2043 auto *CalleeFunc = dyn_cast<Function>(CurCallee);
2044 if (!CalleeFunc) {
2045 auto *Alias = dyn_cast<GlobalAlias>(CurCallee);
2046 assert(Alias);
2047 CalleeFunc = dyn_cast<Function>(Alias->getAliasee());
2048 assert(CalleeFunc);
2049 }
2050
2051 // Look for tail calls in this function, and check if they either call the
2052 // profiled callee directly, or indirectly (via a recursive search).
2053 // Only succeed if there is a single unique tail call chain found between the
2054 // profiled caller and callee, otherwise we could perform incorrect cloning.
2055 bool FoundSingleCalleeChain = false;
2056 for (auto &BB : *CalleeFunc) {
2057 for (auto &I : BB) {
2058 auto *CB = dyn_cast<CallBase>(&I);
2059 if (!CB || !CB->isTailCall())
2060 continue;
2061 auto *CalledValue = CB->getCalledOperand();
2062 auto *CalledFunction = CB->getCalledFunction();
2063 if (CalledValue && !CalledFunction) {
2064 CalledValue = CalledValue->stripPointerCasts();
2065 // Stripping pointer casts can reveal a called function.
2066 CalledFunction = dyn_cast<Function>(CalledValue);
2067 }
2068 // Check if this is an alias to a function. If so, get the
2069 // called aliasee for the checks below.
2070 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
2071 assert(!CalledFunction &&
2072 "Expected null called function in callsite for alias");
2073 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
2074 }
2075 if (!CalledFunction)
2076 continue;
2077 if (CalledFunction == ProfiledCallee) {
2078 if (FoundSingleCalleeChain) {
2079 FoundMultipleCalleeChains = true;
2080 return false;
2081 }
2082 FoundSingleCalleeChain = true;
2083 FoundProfiledCalleeCount++;
2084 FoundProfiledCalleeDepth += Depth;
2085 if (Depth > FoundProfiledCalleeMaxDepth)
2086 FoundProfiledCalleeMaxDepth = Depth;
2087 SaveCallsiteInfo(&I, CalleeFunc);
2088 } else if (findProfiledCalleeThroughTailCalls(
2089 ProfiledCallee, CalledFunction, Depth + 1,
2090 FoundCalleeChain, FoundMultipleCalleeChains)) {
2091 // findProfiledCalleeThroughTailCalls should not have returned
2092 // true if FoundMultipleCalleeChains.
2093 assert(!FoundMultipleCalleeChains);
2094 if (FoundSingleCalleeChain) {
2095 FoundMultipleCalleeChains = true;
2096 return false;
2097 }
2098 FoundSingleCalleeChain = true;
2099 SaveCallsiteInfo(&I, CalleeFunc);
2100 } else if (FoundMultipleCalleeChains)
2101 return false;
2102 }
2103 }
2104
2105 return FoundSingleCalleeChain;
2106}
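The "single unique chain" rule enforced above can be sketched over a toy call graph; the names and data structures here are hypothetical and only illustrate the search discipline, not the real ContextNode plumbing:

#include <cassert>
#include <map>
#include <string>
#include <vector>

using Graph = std::map<std::string, std::vector<std::string>>;

// Illustration only: succeed iff exactly one tail-call path from Cur reaches
// the profiled callee; Chain is filled callee-to-caller like FoundCalleeChain.
static bool findUniqueChain(const Graph &G, const std::string &Cur,
                            const std::string &Profiled,
                            std::vector<std::string> &Chain, bool &Multiple) {
  bool FoundSingle = false;
  auto It = G.find(Cur);
  if (It == G.end())
    return false;
  for (const std::string &Callee : It->second) {
    bool Hit = Callee == Profiled ||
               findUniqueChain(G, Callee, Profiled, Chain, Multiple);
    if (Multiple)
      return false;
    if (!Hit)
      continue;
    if (FoundSingle) { // a second distinct chain makes the match ambiguous
      Multiple = true;
      return false;
    }
    FoundSingle = true;
    Chain.push_back(Callee);
  }
  return FoundSingle;
}

int main() {
  Graph G = {{"B", {"C"}}, {"C", {"D"}}}; // B tail-calls C, C tail-calls D
  std::vector<std::string> Chain;
  bool Multiple = false;
  assert(findUniqueChain(G, "B", "D", Chain, Multiple)); // unique B->C->D
  assert(Chain.size() == 2 && !Multiple);
  return 0;
}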
2107
2108bool ModuleCallsiteContextGraph::calleeMatchesFunc(
2109 Instruction *Call, const Function *Func, const Function *CallerFunc,
2110 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
2111 auto *CB = dyn_cast<CallBase>(Call);
2112 if (!CB->getCalledOperand())
2113 return false;
2114 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2115 auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
2116 if (CalleeFunc == Func)
2117 return true;
2118 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2119 if (Alias && Alias->getAliasee() == Func)
2120 return true;
2121
2122 // Recursively search for the profiled callee through tail calls starting with
2123 // the actual Callee. The discovered tail call chain is saved in
2124 // FoundCalleeChain, and we will fix up the graph to include these callsites
2125 // after returning.
2126 // FIXME: We will currently redo the same recursive walk if we find the same
2127 // mismatched callee from another callsite. We can improve this with more
2128 // bookkeeping of the created chain of new nodes for each mismatch.
2129 unsigned Depth = 1;
2130 bool FoundMultipleCalleeChains = false;
2131 if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,
2132 FoundCalleeChain,
2133 FoundMultipleCalleeChains)) {
2134 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "
2135 << Func->getName() << " from " << CallerFunc->getName()
2136 << " that actually called " << CalleeVal->getName()
2137 << (FoundMultipleCalleeChains
2138 ? " (found multiple possible chains)"
2139 : "")
2140 << "\n");
2141 if (FoundMultipleCalleeChains)
2142 FoundProfiledCalleeNonUniquelyCount++;
2143 return false;
2144 }
2145
2146 return true;
2147}
2148
2149bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2150 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
2151 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
2152 bool &FoundMultipleCalleeChains) {
2153 // Stop recursive search if we have already explored the maximum specified
2154 // depth.
2155 if (Depth > TailCallSearchDepth)
2156 return false;
2157
2158 auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {
2159 // Make a CallsiteInfo for each discovered callee, if one hasn't already
2160 // been synthesized.
2161 if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||
2162 !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))
2163 // StackIds is empty (we don't have debug info available in the index for
2164 // these callsites)
2165 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
2166 std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());
2167 CallsiteInfo *NewCallsiteInfo =
2168 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
2169 FoundCalleeChain.push_back({NewCallsiteInfo, FS});
2170 };
2171
2172 // Look for tail calls in this function, and check if they either call the
2173 // profiled callee directly, or indirectly (via a recursive search).
2174 // Only succeed if there is a single unique tail call chain found between the
2175 // profiled caller and callee, otherwise we could perform incorrect cloning.
2176 bool FoundSingleCalleeChain = false;
2177 for (auto &S : CurCallee.getSummaryList()) {
2178 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2179 !isPrevailing(CurCallee.getGUID(), S.get()))
2180 continue;
2181 auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
2182 if (!FS)
2183 continue;
2184 auto FSVI = CurCallee;
2185 auto *AS = dyn_cast<AliasSummary>(S.get());
2186 if (AS)
2187 FSVI = AS->getAliaseeVI();
2188 for (auto &CallEdge : FS->calls()) {
2189 if (!CallEdge.second.hasTailCall())
2190 continue;
2191 if (CallEdge.first == ProfiledCallee) {
2192 if (FoundSingleCalleeChain) {
2193 FoundMultipleCalleeChains = true;
2194 return false;
2195 }
2196 FoundSingleCalleeChain = true;
2197 FoundProfiledCalleeCount++;
2198 FoundProfiledCalleeDepth += Depth;
2199 if (Depth > FoundProfiledCalleeMaxDepth)
2200 FoundProfiledCalleeMaxDepth = Depth;
2201 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2202 // Add FS to FSToVIMap in case it isn't already there.
2203 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2204 FSToVIMap[FS] = FSVI;
2205 } else if (findProfiledCalleeThroughTailCalls(
2206 ProfiledCallee, CallEdge.first, Depth + 1,
2207 FoundCalleeChain, FoundMultipleCalleeChains)) {
2208 // findProfiledCalleeThroughTailCalls should not have returned
2209 // true if FoundMultipleCalleeChains.
2210 assert(!FoundMultipleCalleeChains);
2211 if (FoundSingleCalleeChain) {
2212 FoundMultipleCalleeChains = true;
2213 return false;
2214 }
2215 FoundSingleCalleeChain = true;
2216 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2217 // Add FS to FSToVIMap in case it isn't already there.
2218 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2219 FSToVIMap[FS] = FSVI;
2220 } else if (FoundMultipleCalleeChains)
2221 return false;
2222 }
2223 }
2224
2225 return FoundSingleCalleeChain;
2226}
2227
2228bool IndexCallsiteContextGraph::calleeMatchesFunc(
2229 IndexCall &Call, const FunctionSummary *Func,
2230 const FunctionSummary *CallerFunc,
2231 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {
2232 ValueInfo Callee =
2233 dyn_cast_if_present<CallsiteInfo *>(Call.getBase())->Callee;
2234 // If there is no summary list then this is a call to an externally defined
2235 // symbol.
2236 AliasSummary *Alias =
2237 Callee.getSummaryList().empty()
2238 ? nullptr
2239 : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
2240 assert(FSToVIMap.count(Func));
2241 auto FuncVI = FSToVIMap[Func];
2242 if (Callee == FuncVI ||
2243 // If callee is an alias, check the aliasee, since only function
2244 // summary base objects will contain the stack node summaries and thus
2245 // get a context node.
2246 (Alias && Alias->getAliaseeVI() == FuncVI))
2247 return true;
2248
2249 // Recursively search for the profiled callee through tail calls starting with
2250 // the actual Callee. The discovered tail call chain is saved in
2251 // FoundCalleeChain, and we will fix up the graph to include these callsites
2252 // after returning.
2253 // FIXME: We will currently redo the same recursive walk if we find the same
2254 // mismatched callee from another callsite. We can improve this with more
2255 // bookkeeping of the created chain of new nodes for each mismatch.
2256 unsigned Depth = 1;
2257 bool FoundMultipleCalleeChains = false;
2258 if (!findProfiledCalleeThroughTailCalls(
2259 FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
2260 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI
2261 << " from " << FSToVIMap[CallerFunc]
2262 << " that actually called " << Callee
2263 << (FoundMultipleCalleeChains
2264 ? " (found multiple possible chains)"
2265 : "")
2266 << "\n");
2267 if (FoundMultipleCalleeChains)
2268 FoundProfiledCalleeNonUniquelyCount++;
2269 return false;
2270 }
2271
2272 return true;
2273}
2274
2275template <typename DerivedCCG, typename FuncTy, typename CallTy>
2276void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
2277 const {
2278 print(dbgs());
2279 dbgs() << "\n";
2280}
2281
2282template <typename DerivedCCG, typename FuncTy, typename CallTy>
2283void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
2284 raw_ostream &OS) const {
2285 OS << "Node " << this << "\n";
2286 OS << "\t";
2287 printCall(OS);
2288 if (Recursive)
2289 OS << " (recursive)";
2290 OS << "\n";
2291 if (!MatchingCalls.empty()) {
2292 OS << "\tMatchingCalls:\n";
2293 for (auto &MatchingCall : MatchingCalls) {
2294 OS << "\t";
2295 MatchingCall.print(OS);
2296 OS << "\n";
2297 }
2298 }
2299 OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
2300 OS << "\tContextIds:";
2301 // Make a copy of the computed context ids that we can sort for stability.
2302 auto ContextIds = getContextIds();
2303 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2304 std::sort(SortedIds.begin(), SortedIds.end());
2305 for (auto Id : SortedIds)
2306 OS << " " << Id;
2307 OS << "\n";
2308 OS << "\tCalleeEdges:\n";
2309 for (auto &Edge : CalleeEdges)
2310 OS << "\t\t" << *Edge << "\n";
2311 OS << "\tCallerEdges:\n";
2312 for (auto &Edge : CallerEdges)
2313 OS << "\t\t" << *Edge << "\n";
2314 if (!Clones.empty()) {
2315 OS << "\tClones: ";
2316 FieldSeparator FS;
2317 for (auto *Clone : Clones)
2318 OS << FS << Clone;
2319 OS << "\n";
2320 } else if (CloneOf) {
2321 OS << "\tClone of " << CloneOf << "\n";
2322 }
2323}
2324
2325template <typename DerivedCCG, typename FuncTy, typename CallTy>
2326void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
2327 const {
2328 print(dbgs());
2329 dbgs() << "\n";
2330}
2331
2332template <typename DerivedCCG, typename FuncTy, typename CallTy>
2333void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
2334 raw_ostream &OS) const {
2335 OS << "Edge from Callee " << Callee << " to Caller: " << Caller
2336 << " AllocTypes: " << getAllocTypeString(AllocTypes);
2337 OS << " ContextIds:";
2338 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2339 std::sort(SortedIds.begin(), SortedIds.end());
2340 for (auto Id : SortedIds)
2341 OS << " " << Id;
2342}
2343
2344template <typename DerivedCCG, typename FuncTy, typename CallTy>
2345void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
2346 print(dbgs());
2347}
2348
2349template <typename DerivedCCG, typename FuncTy, typename CallTy>
2350void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
2351 raw_ostream &OS) const {
2352 OS << "Callsite Context Graph:\n";
2353 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2354 for (const auto Node : nodes<GraphType>(this)) {
2355 if (Node->isRemoved())
2356 continue;
2357 Node->print(OS);
2358 OS << "\n";
2359 }
2360}
2361
2362template <typename DerivedCCG, typename FuncTy, typename CallTy>
2363void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
2364 raw_ostream &OS) const {
2365 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2366 for (const auto Node : nodes<GraphType>(this)) {
2367 if (Node->isRemoved())
2368 continue;
2369 if (!Node->IsAllocation)
2370 continue;
2371 DenseSet<uint32_t> ContextIds = Node->getContextIds();
2372 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2373 std::sort(SortedIds.begin(), SortedIds.end());
2374 for (auto Id : SortedIds) {
2375 auto SizeI = ContextIdToTotalSize.find(Id);
2376 assert(SizeI != ContextIdToTotalSize.end());
2377 auto TypeI = ContextIdToAllocationType.find(Id);
2378 assert(TypeI != ContextIdToAllocationType.end());
2379 OS << getAllocTypeString((uint8_t)TypeI->second) << " context " << Id
2380 << " with total size " << SizeI->second << " is "
2381 << getAllocTypeString(Node->AllocTypes) << " after cloning\n";
2382 }
2383 }
2384}
2385
2386template <typename DerivedCCG, typename FuncTy, typename CallTy>
2387void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
2388 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2389 for (const auto Node : nodes<GraphType>(this)) {
2390 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
2391 for (auto &Edge : Node->CallerEdges)
2392 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
2393 }
2394}
2395
2396template <typename DerivedCCG, typename FuncTy, typename CallTy>
2397struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
2398 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2399 using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;
2400
2401 using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
2402 static NodeRef getNode(const NodePtrTy &P) { return P.get(); }
2403
2404 using nodes_iterator =
2405 mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,
2406 decltype(&getNode)>;
2407
2408 static nodes_iterator nodes_begin(GraphType G) {
2409 return nodes_iterator(G->NodeOwner.begin(), &getNode);
2410 }
2411
2412 static nodes_iterator nodes_end(GraphType G) {
2413 return nodes_iterator(G->NodeOwner.end(), &getNode);
2414 }
2415
2416 static NodeRef getEntryNode(GraphType G) {
2417 return G->NodeOwner.begin()->get();
2418 }
2419
2420 using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
2421 static const ContextNode<DerivedCCG, FuncTy, CallTy> *
2422 GetCallee(const EdgePtrTy &P) {
2423 return P->Callee;
2424 }
2425
2426 using ChildIteratorType =
2427 mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
2428 DerivedCCG, FuncTy, CallTy>>>::const_iterator,
2429 decltype(&GetCallee)>;
2430
2431 static ChildIteratorType child_begin(NodeRef N) {
2432 return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
2433 }
2434
2435 static ChildIteratorType child_end(NodeRef N) {
2436 return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
2437 }
2438};
2439
2440template <typename DerivedCCG, typename FuncTy, typename CallTy>
2441struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
2442 : public DefaultDOTGraphTraits {
2443 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
2444
2445 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2446 using GTraits = GraphTraits<GraphType>;
2447 using NodeRef = typename GTraits::NodeRef;
2448 using ChildIteratorType = typename GTraits::ChildIteratorType;
2449
2450 static std::string getNodeLabel(NodeRef Node, GraphType G) {
2451 std::string LabelString =
2452 (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
2453 Twine(Node->OrigStackOrAllocId))
2454 .str();
2455 LabelString += "\n";
2456 if (Node->hasCall()) {
2457 auto Func = G->NodeToCallingFunc.find(Node);
2458 assert(Func != G->NodeToCallingFunc.end());
2459 LabelString +=
2460 G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
2461 } else {
2462 LabelString += "null call";
2463 if (Node->Recursive)
2464 LabelString += " (recursive)";
2465 else
2466 LabelString += " (external)";
2467 }
2468 return LabelString;
2469 }
2470
2471 static std::string getNodeAttributes(NodeRef Node, GraphType) {
2472 std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
2473 getContextIds(Node->getContextIds()) + "\"")
2474 .str();
2476 (Twine(",fillcolor=\"") + getColor(Node->AllocTypes) + "\"").str();
2477 AttributeString += ",style=\"filled\"";
2478 if (Node->CloneOf) {
2479 AttributeString += ",color=\"blue\"";
2480 AttributeString += ",style=\"filled,bold,dashed\"";
2481 } else
2482 AttributeString += ",style=\"filled\"";
2483 return AttributeString;
2484 }
2485
2486 static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
2487 GraphType) {
2488 auto &Edge = *(ChildIter.getCurrent());
2489 return (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" +
2490 Twine(",fillcolor=\"") + getColor(Edge->AllocTypes) + "\"")
2491 .str();
2492 }
2493
2494 // Since the NodeOwner list includes nodes that are no longer connected to
2495 // the graph, skip them here.
2496 static bool isNodeHidden(NodeRef Node, GraphType) {
2497 return Node->isRemoved();
2498 }
2499
2500private:
2501 static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
2502 std::string IdString = "ContextIds:";
2503 if (ContextIds.size() < 100) {
2504 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2505 std::sort(SortedIds.begin(), SortedIds.end());
2506 for (auto Id : SortedIds)
2507 IdString += (" " + Twine(Id)).str();
2508 } else {
2509 IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
2510 }
2511 return IdString;
2512 }
2513
2514 static std::string getColor(uint8_t AllocTypes) {
2515 if (AllocTypes == (uint8_t)AllocationType::NotCold)
2516 // Color "brown1" actually looks like a lighter red.
2517 return "brown1";
2518 if (AllocTypes == (uint8_t)AllocationType::Cold)
2519 return "cyan";
2520 if (AllocTypes ==
2521 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
2522 // Lighter purple.
2523 return "mediumorchid1";
2524 return "gray";
2525 }
2526
2527 static std::string getNodeId(NodeRef Node) {
2528 std::stringstream SStream;
2529 SStream << std::hex << "N0x" << (unsigned long long)Node;
2530 std::string Result = SStream.str();
2531 return Result;
2532 }
2533};
2534
2535template <typename DerivedCCG, typename FuncTy, typename CallTy>
2536void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
2537 std::string Label) const {
2538 WriteGraph(this, "", false, Label,
2539 DotFilePathPrefix + "ccg." + Label + ".dot");
2540}
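A tiny sketch of the resulting file name, assuming a prefix of "/tmp/" for the DotFilePathPrefix value used above (presumably an option defined earlier in this file; the prefix here is hypothetical):

#include <cassert>
#include <string>

int main() {
  std::string DotFilePathPrefix = "/tmp/"; // assumed prefix value
  std::string Label = "prestackupdate";    // label used above
  std::string Path = DotFilePathPrefix + "ccg." + Label + ".dot";
  assert(Path == "/tmp/ccg.prestackupdate.dot");
  return 0;
}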
2541
2542template <typename DerivedCCG, typename FuncTy, typename CallTy>
2543typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
2544CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
2545 const std::shared_ptr<ContextEdge> &Edge, EdgeIter *CallerEdgeI,
2546 DenseSet<uint32_t> ContextIdsToMove) {
2547 ContextNode *Node = Edge->Callee;
2548 NodeOwner.push_back(
2549 std::make_unique<ContextNode>(Node->IsAllocation, Node->Call));
2550 ContextNode *Clone = NodeOwner.back().get();
2551 Node->addClone(Clone);
2552 Clone->MatchingCalls = Node->MatchingCalls;
2553 assert(NodeToCallingFunc.count(Node));
2554 NodeToCallingFunc[Clone] = NodeToCallingFunc[Node];
2555 moveEdgeToExistingCalleeClone(Edge, Clone, CallerEdgeI, /*NewClone=*/true,
2556 ContextIdsToMove);
2557 return Clone;
2558}
2559
2560template <typename DerivedCCG, typename FuncTy, typename CallTy>
2561void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
2562 moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
2563 ContextNode *NewCallee, EdgeIter *CallerEdgeI,
2564 bool NewClone,
2565 DenseSet<uint32_t> ContextIdsToMove) {
2566 // NewCallee and Edge's current callee must be clones of the same original
2567 // node (Edge's current callee may be the original node too).
2568 assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
2569
2570 ContextNode *OldCallee = Edge->Callee;
2571
2572 // We might already have an edge to the new callee from earlier cloning for a
2573 // different allocation. If one exists we will reuse it.
2574 auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller);
2575
2576 // Callers will pass an empty ContextIdsToMove set when they want to move the
2577 // edge. Copy in Edge's ids for simplicity.
2578 if (ContextIdsToMove.empty())
2579 ContextIdsToMove = Edge->getContextIds();
2580
2581 // If we are moving all of Edge's ids, then just move the whole Edge.
2582 // Otherwise only move the specified subset, to a new edge if needed.
2583 if (Edge->getContextIds().size() == ContextIdsToMove.size()) {
2584 // Moving the whole Edge.
2585 if (CallerEdgeI)
2586 *CallerEdgeI = OldCallee->CallerEdges.erase(*CallerEdgeI);
2587 else
2588 OldCallee->eraseCallerEdge(Edge.get());
2589 if (ExistingEdgeToNewCallee) {
2590 // Since we already have an edge to NewCallee, simply move the ids
2591 // onto it, and remove the existing Edge.
2592 ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(),
2593 ContextIdsToMove.end());
2594 ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes;
2595 assert(Edge->ContextIds == ContextIdsToMove);
2596 Edge->ContextIds.clear();
2597 Edge->AllocTypes = (uint8_t)AllocationType::None;
2598 Edge->Caller->eraseCalleeEdge(Edge.get());
2599 } else {
2600 // Otherwise just reconnect Edge to NewCallee.
2601 Edge->Callee = NewCallee;
2602 NewCallee->CallerEdges.push_back(Edge);
2603 // Don't need to update Edge's context ids since we are simply
2604 // reconnecting it.
2605 }
2606 // In either case, need to update the alloc types on NewCallee.
2607 NewCallee->AllocTypes |= Edge->AllocTypes;
2608 } else {
2609 // Only moving a subset of Edge's ids.
2610 if (CallerEdgeI)
2611 ++CallerEdgeI;
2612 // Compute the alloc type of the subset of ids being moved.
2613 auto CallerEdgeAllocType = computeAllocType(ContextIdsToMove);
2614 if (ExistingEdgeToNewCallee) {
2615 // Since we already have an edge to NewCallee, simply move the ids
2616 // onto it.
2617 ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(),
2618 ContextIdsToMove.end());
2619 ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType;
2620 } else {
2621 // Otherwise, create a new edge to NewCallee for the ids being moved.
2622 auto NewEdge = std::make_shared<ContextEdge>(
2623 NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove);
2624 Edge->Caller->CalleeEdges.push_back(NewEdge);
2625 NewCallee->CallerEdges.push_back(NewEdge);
2626 }
2627 // In either case, need to update the alloc types on NewCallee, and remove
2628 // those ids and update the alloc type on the original Edge.
2629 NewCallee->AllocTypes |= CallerEdgeAllocType;
2630 set_subtract(Edge->ContextIds, ContextIdsToMove);
2631 Edge->AllocTypes = computeAllocType(Edge->ContextIds);
2632 }
2633 // Now walk the old callee node's callee edges and move Edge's context ids
2634 // over to the corresponding edge into the clone (which is created here if
2635 // this is a newly created clone).
2636 for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
2637 // The context ids moving to the new callee are the subset of this edge's
2638 // context ids and the context ids on the caller edge being moved.
2639 DenseSet<uint32_t> EdgeContextIdsToMove =
2640 set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove);
2641 set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
2642 OldCalleeEdge->AllocTypes =
2643 computeAllocType(OldCalleeEdge->getContextIds());
2644 if (!NewClone) {
2645 // Update context ids / alloc type on corresponding edge to NewCallee.
2646 // There is a chance this may not exist if we are reusing an existing
2647 // clone, specifically during function assignment, where we would have
2648 // removed none type edges after creating the clone. If we can't find
2649 // a corresponding edge there, fall through to the cloning below.
2650 if (auto *NewCalleeEdge =
2651 NewCallee->findEdgeFromCallee(OldCalleeEdge->Callee)) {
2652 NewCalleeEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),
2653 EdgeContextIdsToMove.end());
2654 NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
2655 continue;
2656 }
2657 }
2658 auto NewEdge = std::make_shared<ContextEdge>(
2659 OldCalleeEdge->Callee, NewCallee,
2660 computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove);
2661 NewCallee->CalleeEdges.push_back(NewEdge);
2662 NewEdge->Callee->CallerEdges.push_back(NewEdge);
2663 }
2664 // Recompute the node alloc type now that its callee edges have been
2665 // updated (since we will compute from those edges).
2666 OldCallee->AllocTypes = OldCallee->computeAllocType();
2667 // OldCallee alloc type should be None iff its context id set is now empty.
2668 assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
2669 OldCallee->emptyContextIds());
2670 if (VerifyCCG) {
2671 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
2672 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
2673 for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
2674 checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
2675 /*CheckEdges=*/false);
2676 for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
2677 checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
2678 /*CheckEdges=*/false);
2679 }
2680}
2681
2682template <typename DerivedCCG, typename FuncTy, typename CallTy>
2683void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
2684 recursivelyRemoveNoneTypeCalleeEdges(
2685 ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
2686 auto Inserted = Visited.insert(Node);
2687 if (!Inserted.second)
2688 return;
2689
2690 removeNoneTypeCalleeEdges(Node);
2691
2692 for (auto *Clone : Node->Clones)
2693 recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited);
2694
2695 // The recursive call may remove some of this Node's caller edges.
2696 // Iterate over a copy and skip any that were removed.
2697 auto CallerEdges = Node->CallerEdges;
2698 for (auto &Edge : CallerEdges) {
2699 // Skip any that have been removed by an earlier recursive call.
2700 if (Edge->Callee == nullptr && Edge->Caller == nullptr) {
2701 assert(!is_contained(Node->CallerEdges, Edge));
2702 continue;
2703 }
2704 recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited);
2705 }
2706}
2707
2708template <typename DerivedCCG, typename FuncTy, typename CallTy>
2709void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
2710 DenseSet<const ContextNode *> Visited;
2711 for (auto &Entry : AllocationCallToContextNodeMap) {
2712 Visited.clear();
2713 identifyClones(Entry.second, Visited, Entry.second->getContextIds());
2714 }
2715 Visited.clear();
2716 for (auto &Entry : AllocationCallToContextNodeMap)
2717 recursivelyRemoveNoneTypeCalleeEdges(Entry.second, Visited);
2718 if (VerifyCCG)
2719 check();
2720}
2721
2722 // Helper function to check whether an AllocType is cold, notcold, or both.
2723 bool checkColdOrNotCold(uint8_t AllocType) {
2724 return (AllocType == (uint8_t)AllocationType::Cold) ||
2725 (AllocType == (uint8_t)AllocationType::NotCold) ||
2726 (AllocType ==
2727 ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
2728}
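To see how the two single-bit alloc types combine into the "both" value tested above, here is a minimal sketch; the numeric values (NotCold = 1, Cold = 2) are an assumption for illustration, and the real enum lives in the MemProf support headers:

#include <cassert>
#include <cstdint>

enum class ExAllocationType : uint8_t { None = 0, NotCold = 1, Cold = 2 };

static bool exampleColdOrNotCold(uint8_t T) {
  return T == (uint8_t)ExAllocationType::Cold ||
         T == (uint8_t)ExAllocationType::NotCold ||
         T == ((uint8_t)ExAllocationType::Cold |
               (uint8_t)ExAllocationType::NotCold);
}

int main() {
  uint8_t Both =
      (uint8_t)ExAllocationType::Cold | (uint8_t)ExAllocationType::NotCold;
  assert(exampleColdOrNotCold(Both));                             // both bits
  assert(!exampleColdOrNotCold((uint8_t)ExAllocationType::None)); // no bits
  return 0;
}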
2729
2730template <typename DerivedCCG, typename FuncTy, typename CallTy>
2731void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
2732 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
2733 const DenseSet<uint32_t> &AllocContextIds) {
2734 if (VerifyNodes)
2735 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
2736 assert(!Node->CloneOf);
2737
2738 // If Node has a null call, then either it wasn't found in the module (regular
2739 // LTO) or summary index (ThinLTO), or there were other conditions blocking
2740 // cloning (e.g. recursion, calls multiple targets, etc).
2741 // Do this here so that we don't try to recursively clone callers below, which
2742 // isn't useful at least for this node.
2743 if (!Node->hasCall())
2744 return;
2745
2746#ifndef NDEBUG
2747 auto Insert =
2748#endif
2749 Visited.insert(Node);
2750 // We should not have visited this node yet.
2751 assert(Insert.second);
2752 // The recursive call to identifyClones may delete the current edge from the
2753 // CallerEdges vector. Make a copy and iterate on that, simpler than passing
2754 // in an iterator and having recursive call erase from it. Other edges may
2755 // also get removed during the recursion, which will have null Callee and
2756 // Caller pointers (and are deleted later), so we skip those below.
2757 {
2758 auto CallerEdges = Node->CallerEdges;
2759 for (auto &Edge : CallerEdges) {
2760 // Skip any that have been removed by an earlier recursive call.
2761 if (Edge->Callee == nullptr && Edge->Caller == nullptr) {
2762 assert(!llvm::count(Node->CallerEdges, Edge));
2763 continue;
2764 }
2765 // Ignore any caller we previously visited via another edge.
2766 if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
2767 identifyClones(Edge->Caller, Visited, AllocContextIds);
2768 }
2769 }
2770 }
2771
2772 // Check if we reached an unambiguous call or have only a single caller.
2773 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
2774 return;
2775
2776 // We need to clone.
2777
2778 // Try to keep the original version as alloc type NotCold. This will make
2779 // cases with indirect calls or any other situation with an unknown call to
2780 // the original function get the default behavior. We do this by sorting the
2781 // CallerEdges of the Node we will clone by alloc type.
2782 //
2783 // Give NotCold edge the lowest sort priority so those edges are at the end of
2784 // the caller edges vector, and stay on the original version (since the below
2785 // code clones greedily until it finds all remaining edges have the same type
2786 // and leaves the remaining ones on the original Node).
2787 //
2788 // We shouldn't actually have any None type edges, so the sorting priority for
2789 // that is arbitrary, and we assert in that case below.
2790 const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
2791 /*Cold*/ 1,
2792 /*NotColdCold*/ 2};
2793 std::stable_sort(Node->CallerEdges.begin(), Node->CallerEdges.end(),
2794 [&](const std::shared_ptr<ContextEdge> &A,
2795 const std::shared_ptr<ContextEdge> &B) {
2796 // Nodes with non-empty context ids should be sorted before
2797 // those with empty context ids.
2798 if (A->ContextIds.empty())
2799 // Either B ContextIds are non-empty (in which case we
2800 // should return false because B < A), or B ContextIds
2801 // are empty, in which case they are equal, and we should
2802 // maintain the original relative ordering.
2803 return false;
2804 if (B->ContextIds.empty())
2805 return true;
2806
2807 if (A->AllocTypes == B->AllocTypes)
2808 // Use the first context id for each edge as a
2809 // tie-breaker.
2810 return *A->ContextIds.begin() < *B->ContextIds.begin();
2811 return AllocTypeCloningPriority[A->AllocTypes] <
2812 AllocTypeCloningPriority[B->AllocTypes];
2813 });
2814
2815 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2816
2817 // Iterate until we find no more opportunities for disambiguating the alloc
2818 // types via cloning. In most cases this loop will terminate once the Node
2819 // has a single allocation type, in which case no more cloning is needed.
2820 // We need to be able to remove Edge from CallerEdges, so need to adjust
2821 // iterator inside the loop.
2822 for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
2823 auto CallerEdge = *EI;
2824
2825 // See if cloning the prior caller edge left this node with a single alloc
2826 // type or a single caller. In that case no more cloning of Node is needed.
2827 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
2828 break;
2829
2830 // Only need to process the ids along this edge pertaining to the given
2831 // allocation.
2832 auto CallerEdgeContextsForAlloc =
2833 set_intersection(CallerEdge->getContextIds(), AllocContextIds);
2834 if (CallerEdgeContextsForAlloc.empty()) {
2835 ++EI;
2836 continue;
2837 }
2838 auto CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
2839
2840 // Compute the node callee edge alloc types corresponding to the context ids
2841 // for this caller edge.
2842 std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
2843 CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
2844 for (auto &CalleeEdge : Node->CalleeEdges)
2845 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
2846 CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
2847
2848 // Don't clone if doing so will not disambiguate any alloc types amongst
2849 // caller edges (including the callee edges that would be cloned).
2850 // Otherwise we will simply move all edges to the clone.
2851 //
2852 // First check if by cloning we will disambiguate the caller allocation
2853 // type from node's allocation type. Query allocTypeToUse so that we don't
2854 // bother cloning to distinguish NotCold+Cold from NotCold. Note that
2855 // neither of these should be None type.
2856 //
2857 // Then check if by cloning node at least one of the callee edges will be
2858 // disambiguated by splitting out different context ids.
2859 assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
2860 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2861 if (allocTypeToUse(CallerAllocTypeForAlloc) ==
2862 allocTypeToUse(Node->AllocTypes) &&
2863 allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
2864 CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
2865 ++EI;
2866 continue;
2867 }
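A small standalone sketch of the first half of the guard above (the caller-vs-node comparison; the real guard additionally requires the callee-edge alloc types to match via allocTypesMatch). It assumes the NotCold = 1, Cold = 2 bit encoding and that allocTypeToUse collapses the mixed NotCold|Cold state to NotCold, as the comment above describes; names are hypothetical.

#include <cassert>
#include <cstdint>

namespace {
constexpr uint8_t NotCold = 1, Cold = 2;
// Stand-in for allocTypeToUse under the stated assumption.
uint8_t allocTypeToUseSketch(uint8_t Types) {
  return Types == (NotCold | Cold) ? NotCold : Types;
}
} // namespace

int main() {
  uint8_t NodeTypes = NotCold | Cold; // node still carries both kinds of contexts
  // A caller edge that is itself mixed offers no disambiguation: skip cloning.
  assert(allocTypeToUseSketch(NotCold | Cold) == allocTypeToUseSketch(NodeTypes));
  // A purely cold caller edge can be split out: clone.
  assert(allocTypeToUseSketch(Cold) != allocTypeToUseSketch(NodeTypes));
  return 0;
}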
2868
2869 // First see if we can use an existing clone. Check each clone and its
2870 // callee edges for matching alloc types.
2871 ContextNode *Clone = nullptr;
2872 for (auto *CurClone : Node->Clones) {
2873 if (allocTypeToUse(CurClone->AllocTypes) !=
2874 allocTypeToUse(CallerAllocTypeForAlloc))
2875 continue;
2876
2877 if (!allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
2878 CalleeEdgeAllocTypesForCallerEdge, CurClone->CalleeEdges))
2879 continue;
2880 Clone = CurClone;
2881 break;
2882 }
2883
2884 // The edge iterator is adjusted when we move the CallerEdge to the clone.
2885 if (Clone)
2886 moveEdgeToExistingCalleeClone(CallerEdge, Clone, &EI, /*NewClone=*/false,
2887 CallerEdgeContextsForAlloc);
2888 else
2889 Clone =
2890 moveEdgeToNewCalleeClone(CallerEdge, &EI, CallerEdgeContextsForAlloc);
2891
2892 assert(EI == Node->CallerEdges.end() ||
2893 Node->AllocTypes != (uint8_t)AllocationType::None);
2894 // Sanity check that no alloc types on clone or its edges are None.
2895 assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
2896 }
2897
2898 // We should still have some context ids on the original Node.
2899 assert(!Node->emptyContextIds());
2900
2901 // Sanity check that no alloc types on node or edges are None.
2902 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2903
2904 if (VerifyNodes)
2905 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
2906}
2907
2908void ModuleCallsiteContextGraph::updateAllocationCall(
2909 CallInfo &Call, AllocationType AllocType) {
2910 std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
2911 auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
2912 "memprof", AllocTypeString);
2913 cast<CallBase>(Call.call())->addFnAttr(A);
2914 OREGetter(Call.call()->getFunction())
2915 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
2916 << ore::NV("AllocationCall", Call.call()) << " in clone "
2917 << ore::NV("Caller", Call.call()->getFunction())
2918 << " marked with memprof allocation attribute "
2919 << ore::NV("Attribute", AllocTypeString));
2920}
2921
2922void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
2923 AllocationType AllocType) {
2924 auto *AI = Call.call().dyn_cast<AllocInfo *>();
2925 assert(AI);
2926 assert(AI->Versions.size() > Call.cloneNo());
2927 AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
2928}
2929
2930void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
2931 FuncInfo CalleeFunc) {
2932 if (CalleeFunc.cloneNo() > 0)
2933 cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
2934 OREGetter(CallerCall.call()->getFunction())
2935 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
2936 << ore::NV("Call", CallerCall.call()) << " in clone "
2937 << ore::NV("Caller", CallerCall.call()->getFunction())
2938 << " assigned to call function clone "
2939 << ore::NV("Callee", CalleeFunc.func()));
2940}
2941
2942void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
2943 FuncInfo CalleeFunc) {
2944 auto *CI = CallerCall.call().dyn_cast<CallsiteInfo *>();
2945 assert(CI &&
2946 "Caller cannot be an allocation which should not have profiled calls");
2947 assert(CI->Clones.size() > CallerCall.cloneNo());
2948 CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();
2949}
2950
2951CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
2952 Instruction *>::FuncInfo
2953ModuleCallsiteContextGraph::cloneFunctionForCallsite(
2954 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
2955 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
2956 // Use existing LLVM facilities for cloning and obtaining Call in clone
2957 ValueToValueMapTy VMap;
2958 auto *NewFunc = CloneFunction(Func.func(), VMap);
2959 std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
2960 assert(!Func.func()->getParent()->getFunction(Name));
2961 NewFunc->setName(Name);
2962 for (auto &Inst : CallsWithMetadataInFunc) {
2963 // This map always has the initial version in it.
2964 assert(Inst.cloneNo() == 0);
2965 CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
2966 }
2967 OREGetter(Func.func())
2968 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
2969 << "created clone " << ore::NV("NewFunction", NewFunc));
2970 return {NewFunc, CloneNo};
2971}
2972
2973CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
2974 IndexCall>::FuncInfo
2975IndexCallsiteContextGraph::cloneFunctionForCallsite(
2976 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
2977 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
2978 // Check how many clones we have of Call (and therefore function).
2979 // The next clone number is the current size of versions array.
2980 // Confirm this matches the CloneNo provided by the caller, which is based on
2981 // the number of function clones we have.
2982 assert(CloneNo ==
2983 (Call.call().is<AllocInfo *>()
2984 ? Call.call().dyn_cast<AllocInfo *>()->Versions.size()
2985 : Call.call().dyn_cast<CallsiteInfo *>()->Clones.size()));
2986 // Walk all the instructions in this function. Create a new version for
2987 // each (by adding an entry to the Versions/Clones summary array), and copy
2988 // over the version being called for the function clone being cloned here.
2989 // Additionally, add an entry to the CallMap for the new function clone,
2990 // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
2991 // to the new call clone.
2992 for (auto &Inst : CallsWithMetadataInFunc) {
2993 // This map always has the initial version in it.
2994 assert(Inst.cloneNo() == 0);
2995 if (auto *AI = Inst.call().dyn_cast<AllocInfo *>()) {
2996 assert(AI->Versions.size() == CloneNo);
2997 // We assign the allocation type later (in updateAllocationCall), just add
2998 // an entry for it here.
2999 AI->Versions.push_back(0);
3000 } else {
3001 auto *CI = Inst.call().dyn_cast<CallsiteInfo *>();
3002 assert(CI && CI->Clones.size() == CloneNo);
3003 // We assign the clone number later (in updateCall), just add an entry for
3004 // it here.
3005 CI->Clones.push_back(0);
3006 }
3007 CallMap[Inst] = {Inst.call(), CloneNo};
3008 }
3009 return {Func.func(), CloneNo};
3010}
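An illustrative aside with simplified stand-in types (not the real AllocInfo / CallsiteInfo summary records): for the index-based graph, "cloning" a function only appends one slot per summary record, which updateAllocationCall / updateCall fill in later. The sketch reproduces the invariant asserted above, that the next clone number equals the current size of the Versions/Clones array.

#include <cassert>
#include <cstdint>
#include <vector>

struct SketchAllocInfo { std::vector<uint8_t> Versions{0}; };   // slot 0 = original copy
struct SketchCallsiteInfo { std::vector<unsigned> Clones{0}; }; // slot 0 = original copy

int main() {
  SketchAllocInfo AI;
  SketchCallsiteInfo CI;
  for (unsigned CloneNo = 1; CloneNo <= 2; ++CloneNo) {
    assert(AI.Versions.size() == CloneNo && CI.Clones.size() == CloneNo);
    AI.Versions.push_back(0); // allocation type assigned later
    CI.Clones.push_back(0);   // callee clone number assigned later
  }
  assert(AI.Versions.size() == 3 && CI.Clones.size() == 3);
  return 0;
}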
3011
3012// This method assigns cloned callsites to functions, cloning the functions as
3013// needed. The assignment is greedy and proceeds roughly as follows:
3014//
3015// For each function Func:
3016// For each call with graph Node having clones:
3017// Initialize ClonesWorklist to Node and its clones
3018// Initialize NodeCloneCount to 0
3019// While ClonesWorklist is not empty:
3020// Clone = pop front ClonesWorklist
3021// NodeCloneCount++
3022// If Func has been cloned less than NodeCloneCount times:
3023// If NodeCloneCount is 1:
3024// Assign Clone to original Func
3025// Continue
3026// Create a new function clone
3027// If other callers not assigned to call a function clone yet:
3028// Assign them to call new function clone
3029// Continue
3030// Assign any other caller calling the cloned version to new clone
3031//
3032// For each caller of Clone:
3033// If caller is assigned to call a specific function clone:
3034// If we cannot assign Clone to that function clone:
3035// Create new callsite Clone NewClone
3036// Add NewClone to ClonesWorklist
3037// Continue
3038// Assign Clone to existing caller's called function clone
3039// Else:
3040// If Clone not already assigned to a function clone:
3041// Assign to first function clone without assignment
3042// Assign caller to selected function clone
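The following standalone sketch (hypothetical, simplified types; the real implementation is the template below) distills the core greedy rule from the pseudocode above: a new function clone is created only when a callsite clone is popped off the worklist and fewer function clones exist than callsite clones seen so far. The ".memprof." clone-name suffix is assumed here for illustration, mirroring the MemProfCloneSuffix naming used by getMemProfFuncName.

#include <cassert>
#include <deque>
#include <string>
#include <vector>

int main() {
  std::deque<int> ClonesWorklist = {0, 1, 2}; // original callsite node + two clones
  std::vector<std::string> FuncClones;        // function clones created so far
  unsigned NodeCloneCount = 0;
  while (!ClonesWorklist.empty()) {
    ClonesWorklist.pop_front();
    ++NodeCloneCount;
    // Clone the function only when we have more callsite clones than function
    // clones; the first callsite copy is assigned to the original function.
    if (FuncClones.size() < NodeCloneCount)
      FuncClones.push_back(NodeCloneCount == 1
                               ? "foo"
                               : "foo.memprof." + std::to_string(NodeCloneCount - 1));
  }
  assert(FuncClones.size() == 3); // foo, foo.memprof.1, foo.memprof.2
  return 0;
}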
3043template <typename DerivedCCG, typename FuncTy, typename CallTy>
3044bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
3045 bool Changed = false;
3046
3047 // Keep track of the assignment of nodes (callsites) to function clones they
3048 // call.
3049 DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
3050
3051 // Update caller node to call function version CalleeFunc, by recording the
3052 // assignment in CallsiteToCalleeFuncCloneMap.
3053 auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
3054 const FuncInfo &CalleeFunc) {
3055 assert(Caller->hasCall());
3056 CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
3057 };
3058
3059 // Walk all functions for which we saw calls with memprof metadata, and handle
3060 // cloning for each of its calls.
3061 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
3062 FuncInfo OrigFunc(Func);
3063 // Map from each clone of OrigFunc to a map of remappings of each call of
3064 // interest (from original uncloned call to the corresponding cloned call in
3065 // that function clone).
3066 std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap;
3067 for (auto &Call : CallsWithMetadata) {
3068 ContextNode *Node = getNodeForInst(Call);
3069 // Skip call if we do not have a node for it (all uses of its stack ids
3070 // were either on inlined chains or pruned from the MIBs), or if we did
3071 // not create any clones for it.
3072 if (!Node || Node->Clones.empty())
3073 continue;
3074 assert(Node->hasCall() &&
3075 "Not having a call should have prevented cloning");
3076
3077 // Track the assignment of function clones to clones of the current
3078 // callsite Node being handled.
3079 std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
3080
3081 // Assign callsite version CallsiteClone to function version FuncClone,
3082 // and also assign (possibly cloned) Call to CallsiteClone.
3083 auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
3084 CallInfo &Call,
3085 ContextNode *CallsiteClone,
3086 bool IsAlloc) {
3087 // Record the clone of callsite node assigned to this function clone.
3088 FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
3089
3090 assert(FuncClonesToCallMap.count(FuncClone));
3091 std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone];
3092 CallInfo CallClone(Call);
3093 if (CallMap.count(Call))
3094 CallClone = CallMap[Call];
3095 CallsiteClone->setCall(CallClone);
3096 // Need to do the same for all matching calls.
3097 for (auto &MatchingCall : Node->MatchingCalls) {
3098 CallInfo CallClone(MatchingCall);
3099 if (CallMap.count(MatchingCall))
3100 CallClone = CallMap[MatchingCall];
3101 // Updates the call in the list.
3102 MatchingCall = CallClone;
3103 }
3104 };
3105
3106 // Keep track of the clones of callsite Node that need to be assigned to
3107 // function clones. This list may be expanded in the loop body below if we
3108 // find additional cloning is required.
3109 std::deque<ContextNode *> ClonesWorklist;
3110 // Ignore original Node if we moved all of its contexts to clones.
3111 if (!Node->emptyContextIds())
3112 ClonesWorklist.push_back(Node);
3113 ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(),
3114 Node->Clones.end());
3115
3116 // Now walk through all of the clones of this callsite Node that we need,
3117 // and determine the assignment to a corresponding clone of the current
3118 // function (creating new function clones as needed).
3119 unsigned NodeCloneCount = 0;
3120 while (!ClonesWorklist.empty()) {
3121 ContextNode *Clone = ClonesWorklist.front();
3122 ClonesWorklist.pop_front();
3123 NodeCloneCount++;
3124 if (VerifyNodes)
3125 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
3126
3127 // Need to create a new function clone if we have more callsite clones
3128 // than existing function clones, which would have been assigned to an
3129 // earlier clone in the list (we assign callsite clones to function
3130 // clones greedily).
3131 if (FuncClonesToCallMap.size() < NodeCloneCount) {
3132 // If this is the first callsite copy, assign to original function.
3133 if (NodeCloneCount == 1) {
3134 // Since FuncClonesToCallMap is empty in this case, no clones have
3135 // been created for this function yet, and no callers should have
3136 // been assigned a function clone for this callee node yet.
3137 assert(llvm::none_of(
3138 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
3139 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
3140 }));
3141 // Initialize with empty call map, assign Clone to original function
3142 // and its callers, and skip to the next clone.
3143 FuncClonesToCallMap[OrigFunc] = {};
3144 AssignCallsiteCloneToFuncClone(
3145 OrigFunc, Call, Clone,
3146 AllocationCallToContextNodeMap.count(Call));
3147 for (auto &CE : Clone->CallerEdges) {
3148 // Ignore any caller that does not have a recorded callsite Call.
3149 if (!CE->Caller->hasCall())
3150 continue;
3151 RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
3152 }
3153 continue;
3154 }
3155
3156 // First locate which copy of OrigFunc to clone again. If a caller
3157 // of this callsite clone was already assigned to call a particular
3158 // function clone, we need to redirect all of those callers to the
3159 // new function clone, and update their other callees within this
3160 // function.
3161 FuncInfo PreviousAssignedFuncClone;
3162 auto EI = llvm::find_if(
3163 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
3164 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
3165 });
3166 bool CallerAssignedToCloneOfFunc = false;
3167 if (EI != Clone->CallerEdges.end()) {
3168 const std::shared_ptr<ContextEdge> &Edge = *EI;
3169 PreviousAssignedFuncClone =
3170 CallsiteToCalleeFuncCloneMap[Edge->Caller];
3171 CallerAssignedToCloneOfFunc = true;
3172 }
3173
3174 // Clone function and save it along with the CallInfo map created
3175 // during cloning in the FuncClonesToCallMap.
3176 std::map<CallInfo, CallInfo> NewCallMap;
3177 unsigned CloneNo = FuncClonesToCallMap.size();
3178 assert(CloneNo > 0 && "Clone 0 is the original function, which "
3179 "should already exist in the map");
3180 FuncInfo NewFuncClone = cloneFunctionForCallsite(
3181 OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
3182 FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap));
3183 FunctionClonesAnalysis++;
3184 Changed = true;
3185
3186 // If no caller callsites were already assigned to a clone of this
3187 // function, we can simply assign this clone to the new func clone
3188 // and update all callers to it, then skip to the next clone.
3189 if (!CallerAssignedToCloneOfFunc) {
3190 AssignCallsiteCloneToFuncClone(
3191 NewFuncClone, Call, Clone,
3192 AllocationCallToContextNodeMap.count(Call));
3193 for (auto &CE : Clone->CallerEdges) {
3194 // Ignore any caller that does not have a recorded callsite Call.
3195 if (!CE->Caller->hasCall())
3196 continue;
3197 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
3198 }
3199 continue;
3200 }
3201
3202 // We may need to do additional node cloning in this case.
3203 // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
3204 // that were previously assigned to call PreviousAssignedFuncClone,
3205 // to record that they now call NewFuncClone.
3206 for (auto CE : Clone->CallerEdges) {
3207 // Skip any that have been removed on an earlier iteration.
3208 if (!CE)
3209 continue;
3210 // Ignore any caller that does not have a recorded callsite Call.
3211 if (!CE->Caller->hasCall())
3212 continue;
3213
3214 if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
3215 // We subsequently fall through to later handling that
3216 // will perform any additional cloning required for
3217 // callers that were calling other function clones.
3218 CallsiteToCalleeFuncCloneMap[CE->Caller] !=
3219 PreviousAssignedFuncClone)
3220 continue;
3221
3222 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
3223
3224 // If we are cloning a function that was already assigned to some
3225 // callers, then essentially we are creating new callsite clones
3226 // of the other callsites in that function that are reached by those
3227 // callers. Clone the other callees of the current callsite's caller
3228 // that were already assigned to PreviousAssignedFuncClone
3229 // accordingly. This is important since we subsequently update the
3230 // calls from the nodes in the graph and their assignments to callee
3231 // functions recorded in CallsiteToCalleeFuncCloneMap.
3232 for (auto CalleeEdge : CE->Caller->CalleeEdges) {
3233 // Skip any that have been removed on an earlier iteration when
3234 // cleaning up newly None type callee edges.
3235 if (!CalleeEdge)
3236 continue;
3237 ContextNode *Callee = CalleeEdge->Callee;
3238 // Skip the current callsite, we are looking for other
3239 // callsites Caller calls, as well as any that does not have a
3240 // recorded callsite Call.
3241 if (Callee == Clone || !Callee->hasCall())
3242 continue;
3243 ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge);
3244 removeNoneTypeCalleeEdges(NewClone);
3245 // Moving the edge may have resulted in some none type
3246 // callee edges on the original Callee.
3247 removeNoneTypeCalleeEdges(Callee);
3248 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
3249 // If the Callee node was already assigned to call a specific
3250 // function version, make sure its new clone is assigned to call
3251 // that same function clone.
3252 if (CallsiteToCalleeFuncCloneMap.count(Callee))
3253 RecordCalleeFuncOfCallsite(
3254 NewClone, CallsiteToCalleeFuncCloneMap[Callee]);
3255 // Update NewClone with the new Call clone of this callsite's Call
3256 // created for the new function clone created earlier.
3257 // Recall that we have already ensured when building the graph
3258 // that each caller can only call callsites within the same
3259 // function, so we are guaranteed that Callee Call is in the
3260 // current OrigFunc.
3261 // CallMap is set up as indexed by original Call at clone 0.
3262 CallInfo OrigCall(Callee->getOrigNode()->Call);
3263 OrigCall.setCloneNo(0);
3264 std::map<CallInfo, CallInfo> &CallMap =
3265 FuncClonesToCallMap[NewFuncClone];
3266 assert(CallMap.count(OrigCall));
3267 CallInfo NewCall(CallMap[OrigCall]);
3268 assert(NewCall);
3269 NewClone->setCall(NewCall);
3270 // Need to do the same for all matching calls.
3271 for (auto &MatchingCall : NewClone->MatchingCalls) {
3272 CallInfo OrigMatchingCall(MatchingCall);
3273 OrigMatchingCall.setCloneNo(0);
3274 assert(CallMap.count(OrigMatchingCall));
3275 CallInfo NewCall(CallMap[OrigMatchingCall]);
3276 assert(NewCall);
3277 // Updates the call in the list.
3278 MatchingCall = NewCall;
3279 }
3280 }
3281 }
3282 // Fall through to handling below to perform the recording of the
3283 // function for this callsite clone. This enables handling of cases
3284 // where the callers were assigned to different clones of a function.
3285 }
3286
3287 // See if we can use existing function clone. Walk through
3288 // all caller edges to see if any have already been assigned to
3289 // a clone of this callsite's function. If we can use it, do so. If not,
3290 // because that function clone is already assigned to a different clone
3291 // of this callsite, then we need to clone again.
3292 // Basically, this checking is needed to handle the case where different
3293 // caller functions/callsites may need versions of this function
3294 // containing different mixes of callsite clones across the different
3295 // callsites within the function. If that happens, we need to create
3296 // additional function clones to handle the various combinations.
3297 //
3298 // Keep track of any new clones of this callsite created by the
3299 // following loop, as well as any existing clone that we decided to
3300 // assign this clone to.
3301 std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
3302 FuncInfo FuncCloneAssignedToCurCallsiteClone;
3303 // We need to be able to remove Edge from CallerEdges, so need to adjust
3304 // iterator in the loop.
3305 for (auto EI = Clone->CallerEdges.begin();
3306 EI != Clone->CallerEdges.end();) {
3307 auto Edge = *EI;
3308 // Ignore any caller that does not have a recorded callsite Call.
3309 if (!Edge->Caller->hasCall()) {
3310 EI++;
3311 continue;
3312 }
3313 // If this caller already assigned to call a version of OrigFunc, need
3314 // to ensure we can assign this callsite clone to that function clone.
3315 if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
3316 FuncInfo FuncCloneCalledByCaller =
3317 CallsiteToCalleeFuncCloneMap[Edge->Caller];
3318 // First we need to confirm that this function clone is available
3319 // for use by this callsite node clone.
3320 //
3321 // While FuncCloneToCurNodeCloneMap is built only for this Node and
3322 // its callsite clones, one of those callsite clones X could have
3323 // been assigned to the same function clone called by Edge's caller
3324 // - if Edge's caller calls another callsite within Node's original
3325 // function, and that callsite has another caller reaching clone X.
3326 // We need to clone Node again in this case.
3327 if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
3328 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
3329 Clone) ||
3330 // Detect when we have multiple callers of this callsite that
3331 // have already been assigned to specific, and different, clones
3332 // of OrigFunc (due to other unrelated callsites in Func they
3333 // reach via call contexts). Is this Clone of callsite Node
3334 // assigned to a different clone of OrigFunc? If so, clone Node
3335 // again.
3336 (FuncCloneAssignedToCurCallsiteClone &&
3337 FuncCloneAssignedToCurCallsiteClone !=
3338 FuncCloneCalledByCaller)) {
3339 // We need to use a different newly created callsite clone, in
3340 // order to assign it to another new function clone on a
3341 // subsequent iteration over the Clones array (adjusted below).
3342 // Note we specifically do not reset the
3343 // CallsiteToCalleeFuncCloneMap entry for this caller, so that
3344 // when this new clone is processed later we know which version of
3345 // the function to copy (so that other callsite clones we have
3346 // assigned to that function clone are properly cloned over). See
3347 // comments in the function cloning handling earlier.
3348
3349 // Check if we already have cloned this callsite again while
3350 // walking through caller edges, for a caller calling the same
3351 // function clone. If so, we can move this edge to that new clone
3352 // rather than creating yet another new clone.
3353 if (FuncCloneToNewCallsiteCloneMap.count(
3354 FuncCloneCalledByCaller)) {
3355 ContextNode *NewClone =
3356 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
3357 moveEdgeToExistingCalleeClone(Edge, NewClone, &EI);
3358 // Cleanup any none type edges cloned over.
3359 removeNoneTypeCalleeEdges(NewClone);
3360 } else {
3361 // Create a new callsite clone.
3362 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge, &EI);
3363 removeNoneTypeCalleeEdges(NewClone);
3364 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
3365 NewClone;
3366 // Add to list of clones and process later.
3367 ClonesWorklist.push_back(NewClone);
3368 assert(EI == Clone->CallerEdges.end() ||
3369 Clone->AllocTypes != (uint8_t)AllocationType::None);
3370 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
3371 }
3372 // Moving the caller edge may have resulted in some none type
3373 // callee edges.
3374 removeNoneTypeCalleeEdges(Clone);
3375 // We will handle the newly created callsite clone in a subsequent
3376 // iteration over this Node's Clones. Continue here since we
3377 // already adjusted iterator EI while moving the edge.
3378 continue;
3379 }
3380
3381 // Otherwise, we can use the function clone already assigned to this
3382 // caller.
3383 if (!FuncCloneAssignedToCurCallsiteClone) {
3384 FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
3385 // Assign Clone to FuncCloneCalledByCaller
3386 AssignCallsiteCloneToFuncClone(
3387 FuncCloneCalledByCaller, Call, Clone,
3388 AllocationCallToContextNodeMap.count(Call));
3389 } else
3390 // Don't need to do anything - callsite is already calling this
3391 // function clone.
3392 assert(FuncCloneAssignedToCurCallsiteClone ==
3393 FuncCloneCalledByCaller);
3394
3395 } else {
3396 // We have not already assigned this caller to a version of
3397 // OrigFunc. Do the assignment now.
3398
3399 // First check if we have already assigned this callsite clone to a
3400 // clone of OrigFunc for another caller during this iteration over
3401 // its caller edges.
3402 if (!FuncCloneAssignedToCurCallsiteClone) {
3403 // Find first function in FuncClonesToCallMap without an assigned
3404 // clone of this callsite Node. We should always have one
3405 // available at this point due to the earlier cloning when the
3406 // FuncClonesToCallMap size was smaller than the clone number.
3407 for (auto &CF : FuncClonesToCallMap) {
3408 if (!FuncCloneToCurNodeCloneMap.count(CF.first)) {
3409 FuncCloneAssignedToCurCallsiteClone = CF.first;
3410 break;
3411 }
3412 }
3413 assert(FuncCloneAssignedToCurCallsiteClone);
3414 // Assign Clone to FuncCloneAssignedToCurCallsiteClone
3415 AssignCallsiteCloneToFuncClone(
3416 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
3417 AllocationCallToContextNodeMap.count(Call));
3418 } else
3419 assert(FuncCloneToCurNodeCloneMap
3420 [FuncCloneAssignedToCurCallsiteClone] == Clone);
3421 // Update callers to record function version called.
3422 RecordCalleeFuncOfCallsite(Edge->Caller,
3423 FuncCloneAssignedToCurCallsiteClone);
3424 }
3425
3426 EI++;
3427 }
3428 }
3429 if (VerifyCCG) {
3430 checkNode<DerivedCCG, FuncTy, CallTy>(Node);
3431 for (const auto &PE : Node->CalleeEdges)
3432 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
3433 for (const auto &CE : Node->CallerEdges)
3434 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
3435 for (auto *Clone : Node->Clones) {
3436 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
3437 for (const auto &PE : Clone->CalleeEdges)
3438 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
3439 for (const auto &CE : Clone->CallerEdges)
3440 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
3441 }
3442 }
3443 }
3444 }
3445
3446 auto UpdateCalls = [&](ContextNode *Node,
3447 DenseSet<const ContextNode *> &Visited,
3448 auto &&UpdateCalls) {
3449 auto Inserted = Visited.insert(Node);
3450 if (!Inserted.second)
3451 return;
3452
3453 for (auto *Clone : Node->Clones)
3454 UpdateCalls(Clone, Visited, UpdateCalls);
3455
3456 for (auto &Edge : Node->CallerEdges)
3457 UpdateCalls(Edge->Caller, Visited, UpdateCalls);
3458
3459 // Skip if either no call to update, or if we ended up with no context ids
3460 // (we moved all edges onto other clones).
3461 if (!Node->hasCall() || Node->emptyContextIds())
3462 return;
3463
3464 if (Node->IsAllocation) {
3465 updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes));
3466 assert(Node->MatchingCalls.empty());
3467 return;
3468 }
3469
3470 if (!CallsiteToCalleeFuncCloneMap.count(Node))
3471 return;
3472
3473 auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
3474 updateCall(Node->Call, CalleeFunc);
3475 // Update all the matching calls as well.
3476 for (auto &Call : Node->MatchingCalls)
3477 updateCall(Call, CalleeFunc);
3478 };
3479
3480 // Performs DFS traversal starting from allocation nodes to update calls to
3481 // reflect cloning decisions recorded earlier. For regular LTO this will
3482 // update the actual calls in the IR to call the appropriate function clone
3483 // (and add attributes to allocation calls), whereas for ThinLTO the decisions
3484 // are recorded in the summary entries.
3485 DenseSet<const ContextNode *> Visited;
3486 for (auto &Entry : AllocationCallToContextNodeMap)
3487 UpdateCalls(Entry.second, Visited, UpdateCalls);
3488
3489 return Changed;
3490}
3491
3492static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
3493 Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
3494 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
3495 &FuncToAliasMap) {
3496 // The first "clone" is the original copy, we should only call this if we
3497 // needed to create new clones.
3498 assert(NumClones > 1);
3499 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
3500 VMaps.reserve(NumClones - 1);
3501 FunctionsClonedThinBackend++;
3502 for (unsigned I = 1; I < NumClones; I++) {
3503 VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
3504 auto *NewF = CloneFunction(&F, *VMaps.back());
3505 FunctionClonesThinBackend++;
3506 // Strip memprof and callsite metadata from clone as they are no longer
3507 // needed.
3508 for (auto &BB : *NewF) {
3509 for (auto &Inst : BB) {
3510 Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
3511 Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
3512 }
3513 }
3514 std::string Name = getMemProfFuncName(F.getName(), I);
3515 auto *PrevF = M.getFunction(Name);
3516 if (PrevF) {
3517 // We might have created this when adjusting callsite in another
3518 // function. It should be a declaration.
3519 assert(PrevF->isDeclaration());
3520 NewF->takeName(PrevF);
3521 PrevF->replaceAllUsesWith(NewF);
3522 PrevF->eraseFromParent();
3523 } else
3524 NewF->setName(Name);
3525 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
3526 << "created clone " << ore::NV("NewFunction", NewF));
3527
3528 // Now handle aliases to this function, and clone those as well.
3529 if (!FuncToAliasMap.count(&F))
3530 continue;
3531 for (auto *A : FuncToAliasMap[&F]) {
3532 std::string Name = getMemProfFuncName(A->getName(), I);
3533 auto *PrevA = M.getNamedAlias(Name);
3534 auto *NewA = GlobalAlias::create(A->getValueType(),
3535 A->getType()->getPointerAddressSpace(),
3536 A->getLinkage(), Name, NewF);
3537 NewA->copyAttributesFrom(A);
3538 if (PrevA) {
3539 // We might have created this when adjusting callsite in another
3540 // function. It should be a declaration.
3541 assert(PrevA->isDeclaration());
3542 NewA->takeName(PrevA);
3543 PrevA->replaceAllUsesWith(NewA);
3544 PrevA->eraseFromParent();
3545 }
3546 }
3547 }
3548 return VMaps;
3549}
3550
3551// Locate the summary for F. This is complicated by the fact that it might
3552// have been internalized or promoted.
3553static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
3554 const ModuleSummaryIndex *ImportSummary) {
3555 // FIXME: Ideally we would retain the original GUID in some fashion on the
3556 // function (e.g. as metadata), but for now do our best to locate the
3557 // summary without that information.
3558 ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
3559 if (!TheFnVI)
3560 // See if theFn was internalized, by checking index directly with
3561 // original name (this avoids the name adjustment done by getGUID() for
3562 // internal symbols).
3563 TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(F.getName()));
3564 if (TheFnVI)
3565 return TheFnVI;
3566 // Now query with the original name before any promotion was performed.
3567 StringRef OrigName =
3568 ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
3569 std::string OrigId = GlobalValue::getGlobalIdentifier(
3570 OrigName, GlobalValue::InternalLinkage, M.getSourceFileName());
3571 TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(OrigId));
3572 if (TheFnVI)
3573 return TheFnVI;
3574 // Could be a promoted local imported from another module. We need to pass
3575 // down more info here to find the original module id. For now, try with
3576 // the OrigName which might have been stored in the OidGuidMap in the
3577 // index. This would not work if there were same-named locals in multiple
3578 // modules, however.
3579 auto OrigGUID =
3580 ImportSummary->getGUIDFromOriginalID(GlobalValue::getGUID(OrigName));
3581 if (OrigGUID)
3582 TheFnVI = ImportSummary->getValueInfo(OrigGUID);
3583 return TheFnVI;
3584}
3585
3586bool MemProfContextDisambiguation::applyImport(Module &M) {
3587 assert(ImportSummary);
3588 bool Changed = false;
3589
3590 auto IsMemProfClone = [](const Function &F) {
3591 return F.getName().contains(MemProfCloneSuffix);
3592 };
3593
3594 // We also need to clone any aliases that reference cloned functions, because
3595 // the modified callsites may invoke via the alias. Keep track of the aliases
3596 // for each function.
3597 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
3598 FuncToAliasMap;
3599 for (auto &A : M.aliases()) {
3600 auto *Aliasee = A.getAliaseeObject();
3601 if (auto *F = dyn_cast<Function>(Aliasee))
3602 FuncToAliasMap[F].insert(&A);
3603 }
3604
3605 for (auto &F : M) {
3606 if (F.isDeclaration() || IsMemProfClone(F))
3607 continue;
3608
3609 OptimizationRemarkEmitter ORE(&F);
3610
3611 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
3612 bool ClonesCreated = false;
3613 unsigned NumClonesCreated = 0;
3614 auto CloneFuncIfNeeded = [&](unsigned NumClones) {
3615 // We should at least have version 0 which is the original copy.
3616 assert(NumClones > 0);
3617 // If only one copy needed use original.
3618 if (NumClones == 1)
3619 return;
3620 // If we already performed cloning of this function, confirm that the
3621 // requested number of clones matches (the thin link should ensure the
3622 // number of clones for each constituent callsite is consistent within
3623 // each function), before returning.
3624 if (ClonesCreated) {
3625 assert(NumClonesCreated == NumClones);
3626 return;
3627 }
3628 VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap);
3629 // The first "clone" is the original copy, which doesn't have a VMap.
3630 assert(VMaps.size() == NumClones - 1);
3631 Changed = true;
3632 ClonesCreated = true;
3633 NumClonesCreated = NumClones;
3634 };
3635
3636 auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
3637 Function *CalledFunction) {
3638 // Perform cloning if not yet done.
3639 CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());
3640
3641 // Should have skipped indirect calls via mayHaveMemprofSummary.
3642 assert(CalledFunction);
3643 assert(!IsMemProfClone(*CalledFunction));
3644
3645 // Update the calls per the summary info.
3646 // Save orig name since it gets updated in the first iteration
3647 // below.
3648 auto CalleeOrigName = CalledFunction->getName();
3649 for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
3650 // Do nothing if this version calls the original version of its
3651 // callee.
3652 if (!StackNode.Clones[J])
3653 continue;
3654 auto NewF = M.getOrInsertFunction(
3655 getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
3656 CalledFunction->getFunctionType());
3657 CallBase *CBClone;
3658 // Copy 0 is the original function.
3659 if (!J)
3660 CBClone = CB;
3661 else
3662 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
3663 CBClone->setCalledFunction(NewF);
3664 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
3665 << ore::NV("Call", CBClone) << " in clone "
3666 << ore::NV("Caller", CBClone->getFunction())
3667 << " assigned to call function clone "
3668 << ore::NV("Callee", NewF.getCallee()));
3669 }
3670 };
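An illustrative aside with stand-in types showing the two indexing conventions the lambda above relies on: the summary clone number J selects the callee name (0 means the original callee), while the caller body for clone J > 0 is found through VMaps[J - 1], since no value map is created for the original copy. The helper name and ".memprof." suffix are assumptions mirroring getMemProfFuncName / MemProfCloneSuffix.

#include <cassert>
#include <string>
#include <vector>

static std::string sketchCloneName(const std::string &Base, unsigned CloneNo) {
  // Mirrors getMemProfFuncName with an assumed ".memprof." suffix.
  return CloneNo == 0 ? Base : Base + ".memprof." + std::to_string(CloneNo);
}

int main() {
  std::vector<int> VMaps = {101, 102};   // stand-ins for per-clone ValueToValueMapTy
  std::vector<unsigned> Clones = {0, 2}; // summary: callee clone called by each caller copy
  for (unsigned J = 0; J < Clones.size(); ++J) {
    std::string Callee = sketchCloneName("bar", Clones[J]); // 0 keeps the original callee
    int CallerBody = (J == 0) ? 100 : VMaps[J - 1];         // copy 0 is the original function
    (void)Callee;
    (void)CallerBody;
  }
  assert(sketchCloneName("bar", 2) == "bar.memprof.2");
  return 0;
}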
3671
3672 // Locate the summary for F.
3673 ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
3674 // If not found, this could be an imported local (see comment in
3675 // findValueInfoForFunc). Skip for now as it will be cloned in its original
3676 // module (where it would have been promoted to global scope so should
3677 // satisfy any reference in this module).
3678 if (!TheFnVI)
3679 continue;
3680
3681 auto *GVSummary =
3682 ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
3683 if (!GVSummary) {
3684 // Must have been imported, use the summary which matches the definition
3685 // (might be multiple if this was a linkonce_odr).
3686 auto SrcModuleMD = F.getMetadata("thinlto_src_module");
3687 assert(SrcModuleMD &&
3688 "enable-import-metadata is needed to emit thinlto_src_module");
3689 StringRef SrcModule =
3690 dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString();
3691 for (auto &GVS : TheFnVI.getSummaryList()) {
3692 if (GVS->modulePath() == SrcModule) {
3693 GVSummary = GVS.get();
3694 break;
3695 }
3696 }
3697 assert(GVSummary && GVSummary->modulePath() == SrcModule);
3698 }
3699
3700 // If this was an imported alias skip it as we won't have the function
3701 // summary, and it should be cloned in the original module.
3702 if (isa<AliasSummary>(GVSummary))
3703 continue;
3704
3705 auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
3706
3707 if (FS->allocs().empty() && FS->callsites().empty())
3708 continue;
3709
3710 auto SI = FS->callsites().begin();
3711 auto AI = FS->allocs().begin();
3712
3713 // To handle callsite infos synthesized for tail calls which have missing
3714 // frames in the profiled context, map callee VI to the synthesized callsite
3715 // info.
3716 DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite;
3717 // Iterate the callsites for this function in reverse, since we place all
3718 // those synthesized for tail calls at the end.
3719 for (auto CallsiteIt = FS->callsites().rbegin();
3720 CallsiteIt != FS->callsites().rend(); CallsiteIt++) {
3721 auto &Callsite = *CallsiteIt;
3722 // Stop as soon as we see a non-synthesized callsite info (see comment
3723 // above loop). All the entries added for discovered tail calls have empty
3724 // stack ids.
3725 if (!Callsite.StackIdIndices.empty())
3726 break;
3727 MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
3728 }
3729
3730 // Assume for now that the instructions are in the exact same order
3731 // as when the summary was created, but confirm this is correct by
3732 // matching the stack ids.
3733 for (auto &BB : F) {
3734 for (auto &I : BB) {
3735 auto *CB = dyn_cast<CallBase>(&I);
3736 // Same handling as when creating module summary.
3737 if (!mayHaveMemprofSummary(CB))
3738 continue;
3739
3740 auto *CalledValue = CB->getCalledOperand();
3741 auto *CalledFunction = CB->getCalledFunction();
3742 if (CalledValue && !CalledFunction) {
3743 CalledValue = CalledValue->stripPointerCasts();
3744 // Stripping pointer casts can reveal a called function.
3745 CalledFunction = dyn_cast<Function>(CalledValue);
3746 }
3747 // Check if this is an alias to a function. If so, get the
3748 // called aliasee for the checks below.
3749 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
3750 assert(!CalledFunction &&
3751 "Expected null called function in callsite for alias");
3752 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
3753 }
3754
3755 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
3756 I.getMetadata(LLVMContext::MD_callsite));
3757 auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
3758
3759 // Include allocs that were already assigned a memprof function
3760 // attribute in the statistics.
3761 if (CB->getAttributes().hasFnAttr("memprof")) {
3762 assert(!MemProfMD);
3763 CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
3764 ? AllocTypeColdThinBackend++
3765 : AllocTypeNotColdThinBackend++;
3766 OrigAllocsThinBackend++;
3767 AllocVersionsThinBackend++;
3768 if (!MaxAllocVersionsThinBackend)
3769 MaxAllocVersionsThinBackend = 1;
3770 // Remove any remaining callsite metadata and we can skip the rest of
3771 // the handling for this instruction, since no cloning needed.
3772 I.setMetadata(LLVMContext::MD_callsite, nullptr);
3773 continue;
3774 }
3775
3776 if (MemProfMD) {
3777 // Consult the next alloc node.
3778 assert(AI != FS->allocs().end());
3779 auto &AllocNode = *(AI++);
3780
3781 // Sanity check that the MIB stack ids match between the summary and
3782 // instruction metadata.
3783 auto MIBIter = AllocNode.MIBs.begin();
3784 for (auto &MDOp : MemProfMD->operands()) {
3785 assert(MIBIter != AllocNode.MIBs.end());
3786 LLVM_ATTRIBUTE_UNUSED auto StackIdIndexIter =
3787 MIBIter->StackIdIndices.begin();
3788 auto *MIBMD = cast<const MDNode>(MDOp);
3789 MDNode *StackMDNode = getMIBStackNode(MIBMD);
3790 assert(StackMDNode);
3791 CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
3792 auto ContextIterBegin =
3793 StackContext.beginAfterSharedPrefix(CallsiteContext);
3794 // Skip the checking on the first iteration.
3795 uint64_t LastStackContextId =
3796 (ContextIterBegin != StackContext.end() &&
3797 *ContextIterBegin == 0)
3798 ? 1
3799 : 0;
3800 for (auto ContextIter = ContextIterBegin;
3801 ContextIter != StackContext.end(); ++ContextIter) {
3802 // If this is a direct recursion, simply skip the duplicate
3803 // entries, to be consistent with how the summary ids were
3804 // generated during ModuleSummaryAnalysis.
3805 if (LastStackContextId == *ContextIter)
3806 continue;
3807 LastStackContextId = *ContextIter;
3808 assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
3809 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
3810 *ContextIter);
3811 StackIdIndexIter++;
3812 }
3813 MIBIter++;
3814 }
3815
3816 // Perform cloning if not yet done.
3817 CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size());
3818
3819 OrigAllocsThinBackend++;
3820 AllocVersionsThinBackend += AllocNode.Versions.size();
3821 if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
3822 MaxAllocVersionsThinBackend = AllocNode.Versions.size();
3823
3824 // If there is only one version that means we didn't end up
3825 // considering this function for cloning, and in that case the alloc
3826 // will still be none type or should have gotten the default NotCold.
3827 // Skip that after calling clone helper since that does some sanity
3828 // checks that confirm we haven't decided yet that we need cloning.
3829 if (AllocNode.Versions.size() == 1) {
3830 assert((AllocationType)AllocNode.Versions[0] ==
3831 AllocationType::NotCold ||
3832 (AllocationType)AllocNode.Versions[0] ==
3833 AllocationType::None);
3834 UnclonableAllocsThinBackend++;
3835 continue;
3836 }
3837
3838 // All versions should have a singular allocation type.
3839 assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
3840 return Type == ((uint8_t)AllocationType::NotCold |
3841 (uint8_t)AllocationType::Cold);
3842 }));
3843
3844 // Update the allocation types per the summary info.
3845 for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
3846 // Ignore any that didn't get an assigned allocation type.
3847 if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
3848 continue;
3849 AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
3850 AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
3851 : AllocTypeNotColdThinBackend++;
3852 std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
3853 auto A = llvm::Attribute::get(F.getContext(), "memprof",
3854 AllocTypeString);
3855 CallBase *CBClone;
3856 // Copy 0 is the original function.
3857 if (!J)
3858 CBClone = CB;
3859 else
3860 // Since VMaps are only created for new clones, we index with
3861 // clone J-1 (J==0 is the original clone and does not have a VMaps
3862 // entry).
3863 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
3864 CBClone->addFnAttr(A);
3865 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
3866 << ore::NV("AllocationCall", CBClone) << " in clone "
3867 << ore::NV("Caller", CBClone->getFunction())
3868 << " marked with memprof allocation attribute "
3869 << ore::NV("Attribute", AllocTypeString));
3870 }
3871 } else if (!CallsiteContext.empty()) {
3872 // Consult the next callsite node.
3873 assert(SI != FS->callsites().end());
3874 auto &StackNode = *(SI++);
3875
3876#ifndef NDEBUG
3877 // Sanity check that the stack ids match between the summary and
3878 // instruction metadata.
3879 auto StackIdIndexIter = StackNode.StackIdIndices.begin();
3880 for (auto StackId : CallsiteContext) {
3881 assert(StackIdIndexIter != StackNode.StackIdIndices.end());
3882 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
3883 StackId);
3884 StackIdIndexIter++;
3885 }
3886#endif
3887
3888 CloneCallsite(StackNode, CB, CalledFunction);
3889 } else if (CB->isTailCall()) {
3890 // Locate the synthesized callsite info for the callee VI, if any was
3891 // created, and use that for cloning.
3892 ValueInfo CalleeVI =
3893 findValueInfoForFunc(*CalledFunction, M, ImportSummary);
3894 if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
3895 auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
3896 assert(Callsite != MapTailCallCalleeVIToCallsite.end());
3897 CloneCallsite(Callsite->second, CB, CalledFunction);
3898 }
3899 }
3900 // Memprof and callsite metadata on memory allocations no longer needed.
3901 I.setMetadata(LLVMContext::MD_memprof, nullptr);
3902 I.setMetadata(LLVMContext::MD_callsite, nullptr);
3903 }
3904 }
3905 }
3906
3907 return Changed;
3908}
3909
3910template <typename DerivedCCG, typename FuncTy, typename CallTy>
3911bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
3912 if (DumpCCG) {
3913 dbgs() << "CCG before cloning:\n";
3914 dbgs() << *this;
3915 }
3916 if (ExportToDot)
3917 exportToDot("postbuild");
3918
3919 if (VerifyCCG) {
3920 check();
3921 }
3922
3923 identifyClones();
3924
3925 if (VerifyCCG) {
3926 check();
3927 }
3928
3929 if (DumpCCG) {
3930 dbgs() << "CCG after cloning:\n";
3931 dbgs() << *this;
3932 }
3933 if (ExportToDot)
3934 exportToDot("cloned");
3935
3936 bool Changed = assignFunctions();
3937
3938 if (DumpCCG) {
3939 dbgs() << "CCG after assigning function clones:\n";
3940 dbgs() << *this;
3941 }
3942 if (ExportToDot)
3943 exportToDot("clonefuncassign");
3944
3945 if (MemProfReportHintedSizes)
3946 printTotalSizes(errs());
3947
3948 return Changed;
3949}
3950
3951bool MemProfContextDisambiguation::processModule(
3952 Module &M,
3953 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
3954
3955 // If we have an import summary, then the cloning decisions were made during
3956 // the thin link on the index. Apply them and return.
3957 if (ImportSummary)
3958 return applyImport(M);
3959
3960 // TODO: If/when other types of memprof cloning are enabled beyond just for
3961 // hot and cold, we will need to change this to individually control the
3962 // AllocationType passed to addStackNodesForMIB during CCG construction.
3963 // Note that we specifically check this after applying imports above, so that
3964 // the option isn't needed to be passed to distributed ThinLTO backend
3965 // clang processes, which won't necessarily have visibility into the linker
3966 // dependences. Instead the information is communicated from the LTO link to
3967 // the backends via the combined summary index.
3968 if (!SupportsHotColdNew)
3969 return false;
3970
3971 ModuleCallsiteContextGraph CCG(M, OREGetter);
3972 return CCG.process();
3973}
3974
3975MemProfContextDisambiguation::MemProfContextDisambiguation(
3976 const ModuleSummaryIndex *Summary)
3977 : ImportSummary(Summary) {
3978 if (ImportSummary) {
3979 // The MemProfImportSummary should only be used for testing ThinLTO
3980 // distributed backend handling via opt, in which case we don't have a
3981 // summary from the pass pipeline.
3982 assert(MemProfImportSummary.empty());
3983 return;
3984 }
3985 if (MemProfImportSummary.empty())
3986 return;
3987
3988 auto ReadSummaryFile =
3989 errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
3990 if (!ReadSummaryFile) {
3991 logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
3992 "Error loading file '" + MemProfImportSummary +
3993 "': ");
3994 return;
3995 }
3996 auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
3997 if (!ImportSummaryForTestingOrErr) {
3998 logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
3999 "Error parsing file '" + MemProfImportSummary +
4000 "': ");
4001 return;
4002 }
4003 ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
4004 ImportSummary = ImportSummaryForTesting.get();
4005}
4006
4007PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
4008 ModuleAnalysisManager &AM) {
4009 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
4010 auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
4011 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
4012 };
4013 if (!processModule(M, OREGetter))
4014 return PreservedAnalyses::all();
4015 return PreservedAnalyses::none();
4016}
4017
4018void MemProfContextDisambiguation::run(
4019 ModuleSummaryIndex &Index,
4020 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
4021 isPrevailing) {
4022 // TODO: If/when other types of memprof cloning are enabled beyond just for
4023 // hot and cold, we will need to change this to individually control the
4024 // AllocationType passed to addStackNodesForMIB during CCG construction.
4025 // The index was set from the option, so these should be in sync.
4026 assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
4027 if (!SupportsHotColdNew)
4028 return;
4029
4030 IndexCallsiteContextGraph CCG(Index, isPrevailing);
4031 CCG.process();
4032}
aarch64 promote const
static msgpack::DocNode getNode(msgpack::DocNode DN, msgpack::Type Type, MCValue Val)
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
#define LLVM_ATTRIBUTE_UNUSED
Definition: Compiler.h:203
This file contains the declarations for the subclasses of Constant, which represent the different fla...
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
std::string Name
#define check(cond)
#define DEBUG_TYPE
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
This file implements a map that provides insertion order iteration.
static SmallVector< std::unique_ptr< ValueToValueMapTy >, 4 > createFunctionClones(Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE, std::map< const Function *, SmallPtrSet< const GlobalAlias *, 1 > > &FuncToAliasMap)
static cl::opt< unsigned > TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5), cl::Hidden, cl::desc("Max depth to recursively search for missing " "frames through tail calls."))
static cl::opt< bool > ExportToDot("memprof-export-to-dot", cl::init(false), cl::Hidden, cl::desc("Export graph to dot files."))
static void checkEdge(const std::shared_ptr< ContextEdge< DerivedCCG, FuncTy, CallTy > > &Edge)
bool checkColdOrNotCold(uint8_t AllocType)
static ValueInfo findValueInfoForFunc(const Function &F, const Module &M, const ModuleSummaryIndex *ImportSummary)
static std::string getAllocTypeString(uint8_t AllocTypes)
cl::opt< bool > MemProfReportHintedSizes
static cl::opt< bool > VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden, cl::desc("Perform verification checks on CallingContextGraph."))
static void checkNode(const ContextNode< DerivedCCG, FuncTy, CallTy > *Node, bool CheckEdges=true)
static std::string getMemProfFuncName(Twine Base, unsigned CloneNo)
static cl::opt< std::string > MemProfImportSummary("memprof-import-summary", cl::desc("Import summary to use for testing the ThinLTO backend via opt"), cl::Hidden)
#define DEBUG_TYPE
static const std::string MemProfCloneSuffix
static cl::opt< std::string > DotFilePathPrefix("memprof-dot-file-path-prefix", cl::init(""), cl::Hidden, cl::value_desc("filename"), cl::desc("Specify the path prefix of the MemProf dot files."))
static cl::opt< bool > VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden, cl::desc("Perform frequent verification checks on nodes."))
static cl::opt< bool > DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden, cl::desc("Dump CallingContextGraph to stdout after each stage."))
AllocType
cl::opt< bool > MemProfReportHintedSizes("memprof-report-hinted-sizes", cl::init(false), cl::Hidden, cl::desc("Report total allocation sizes of hinted allocations"))
This is the interface to build a ModuleSummaryIndex for a module.
ModuleSummaryIndex.h This file contains the declarations the classes that hold the module index and s...
Module.h This file contains the declarations for the Module class.
#define P(N)
Module * Mod
FunctionAnalysisManager FAM
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
raw_pwrite_stream & OS
This file defines generic set operations that may be used on set's of different types,...
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
void print(OutputBuffer &OB) const
DomTreeNode::const_iterator end() const
Definition: LoopUnroll.cpp:251
Alias summary information.
ValueInfo getAliaseeVI() const
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:405
static Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
Definition: Attributes.cpp:94
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1236
void addFnAttr(Attribute::AttrKind Kind)
Adds the attribute to the function.
Definition: InstrTypes.h:1574
void setCalledFunction(Function *Fn)
Sets the function called, including updating the function type.
Definition: InstrTypes.h:1504
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
unsigned size() const
Definition: DenseMap.h:99
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:151
iterator end()
Definition: DenseMap.h:84
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:145
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
void reserve(size_type NumEntries)
Grow the densemap so that it can contain at least NumEntries items before resizing again.
Definition: DenseMap.h:103
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
Function summary information to aid decisions and implementation of importing.
static GlobalAlias * create(Type *Ty, unsigned AddressSpace, LinkageTypes Linkage, const Twine &Name, Constant *Aliasee, Module *Parent)
If a parent module is specified, the alias is automatically inserted into the end of the specified mo...
Definition: Globals.cpp:544
Function and variable summary information to aid decisions and implementation of importing.
static bool isLocalLinkage(LinkageTypes Linkage)
Definition: GlobalValue.h:409
GUID getGUID() const
Return a 64-bit global unique ID constructed from global value name (i.e.
Definition: GlobalValue.h:595
std::string getGlobalIdentifier() const
Return the modified name for this global value suitable to be used as the key for a global lookup (e....
Definition: Globals.cpp:178
@ InternalLinkage
Rename collisions when linking (static functions).
Definition: GlobalValue.h:59
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
Definition: PassManager.h:563
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:70
Metadata node.
Definition: Metadata.h:1067
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
size_type count(const KeyT &Key) const
Definition: MapVector.h:165
std::pair< KeyT, ValueT > & back()
Definition: MapVector.h:85
MemProfContextDisambiguation(const ModuleSummaryIndex *Summary=nullptr)
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
static ErrorOr< std::unique_ptr< MemoryBuffer > > getFile(const Twine &Filename, bool IsText=false, bool RequiresNullTerminator=true, bool IsVolatile=false, std::optional< Align > Alignment=std::nullopt)
Open the specified file as a MemoryBuffer, returning a new MemoryBuffer if successful,...
Class to hold module path string table and global value map, and encapsulate methods for operating on...
static StringRef getOriginalNameBeforePromote(StringRef Name)
Helper to obtain the unpromoted name for a global value (or the original name if not promoted).
ValueInfo getValueInfo(const GlobalValueSummaryMapTy::value_type &R) const
Return a ValueInfo for the index value_type (convenient when iterating index).
uint64_t getStackIdAtIndex(unsigned Index) const
GlobalValueSummary * findSummaryInModule(ValueInfo VI, StringRef ModuleId) const
Find the summary for ValueInfo VI in module ModuleId, or nullptr if not found.
GlobalValue::GUID getGUIDFromOriginalID(GlobalValue::GUID OriginalID) const
Return the GUID for OriginalId in the OidGuidMap.
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
Definition: PointerUnion.h:118
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition: Analysis.h:114
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:479
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
size_t size() const
Definition: SmallVector.h:91
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
void reserve(size_type N)
Definition: SmallVector.h:676
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
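A short sketch of the SmallVector calls listed above (reserve, emplace_back, size); the inline element count and values are illustrative:

#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// SmallVector keeps the first few elements inline (here 4) and only heap
// allocates beyond that.
void smallVectorDemo() {
  SmallVector<unsigned, 4> CloneNos;
  CloneNos.reserve(8);          // one allocation for the expected element count
  for (unsigned I = 0; I < 8; ++I)
    CloneNos.emplace_back(I);   // construct elements in place
  errs() << "tracking " << CloneNos.size() << " clone numbers\n";
}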
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
LLVM Value Representation.
Definition: Value.h:74
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
void reserve(size_t Size)
Grow the DenseSet so that it can contain at least Size items before resizing again.
Definition: DenseSet.h:90
size_type size() const
Definition: DenseSet.h:81
void swap(DenseSetImpl &RHS)
Definition: DenseSet.h:105
bool erase(const ValueT &V)
Definition: DenseSet.h:101
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:97
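A minimal sketch of the DenseSet operations above (reserve, insert, erase, count, size, swap); the context ids are invented for the example:

#include "llvm/ADT/DenseSet.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Track a small set of context ids.
void denseSetDemo() {
  DenseSet<uint32_t> ContextIds;
  ContextIds.reserve(4);
  ContextIds.insert(1);
  ContextIds.insert(2);
  ContextIds.erase(1);              // returns true if the value was present
  if (ContextIds.count(2))          // 1 if present, 0 otherwise
    errs() << "have " << ContextIds.size() << " context id(s)\n";
  DenseSet<uint32_t> Other;
  ContextIds.swap(Other);           // exchange contents with another set
}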
An efficient, type-erasing, non-owning reference to a callable.
Helper class to iterate through stack ids in both metadata (memprof MIB and callsite) and the corresp...
CallStackIterator end() const
CallStackIterator beginAfterSharedPrefix(CallStack &Other)
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
StringRef AttributeString(unsigned Attribute)
Definition: Dwarf.cpp:72
@ Entry
Definition: COFF.h:811
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ CE
Windows NT (Windows on ARM)
@ FS
Definition: X86.h:210
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
AllocationType getMIBAllocType(const MDNode *MIB)
Returns the allocation type from an MIB metadata node.
uint64_t getMIBTotalSize(const MDNode *MIB)
Returns the total size from an MIB metadata node, or 0 if it was not recorded.
bool hasSingleAllocType(uint8_t AllocTypes)
True if the AllocTypes bitmask contains just a single type.
std::string getAllocTypeAttributeString(AllocationType Type)
Returns the string to use in attributes with the given type.
MDNode * getMIBStackNode(const MDNode *MIB)
Returns the stack node from an MIB metadata node.
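A sketch using the memprof helpers listed above to inspect the !memprof metadata on an allocation call; printMIBAllocTypes is a hypothetical helper, and looking the metadata up by the "memprof" kind string is an assumption of this example:

#include "llvm/Analysis/MemoryProfileInfo.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Metadata.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;
using namespace llvm::memprof;

// Walk each MIB (memory info block) node attached to an allocation call and
// print its context depth and allocation type string.
void printMIBAllocTypes(const CallBase &AllocCall) {
  const MDNode *MemProfMD = AllocCall.getMetadata("memprof");
  if (!MemProfMD)
    return;
  for (const MDOperand &Op : MemProfMD->operands()) {
    const MDNode *MIB = dyn_cast<MDNode>(Op);
    if (!MIB)
      continue;
    const MDNode *StackNode = getMIBStackNode(MIB); // the MIB's call stack
    auto Type = getMIBAllocType(MIB);               // e.g. cold vs. not cold
    errs() << "depth " << StackNode->getNumOperands() << " -> "
           << getAllocTypeAttributeString(Type) << "\n";
  }
}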
NodeAddr< FuncNode * > Func
Definition: RDFGraph.h:393
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
void logAllUnhandledErrors(Error E, raw_ostream &OS, Twine ErrorBanner={})
Log all errors (if any) in E to OS.
Definition: Error.cpp:65
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B. Identical to set_intersection, except that it works on set<>...
Definition: SetOperations.h:58
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
void set_subtract(S1Ty &S1, const S2Ty &S2)
set_subtract(A, B) - Compute A := A - B
raw_ostream & WriteGraph(raw_ostream &O, const GraphType &G, bool ShortNames=false, const Twine &Title="")
Definition: GraphWriter.h:359
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr)
Expected< std::unique_ptr< ModuleSummaryIndex > > getModuleSummaryIndex(MemoryBufferRef Buffer)
Parse the specified bitcode buffer, returning the module summary index.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736
bool set_union(S1Ty &S1, const S2Ty &S2)
set_union(A, B) - Compute A := A u B, return whether A changed.
Definition: SetOperations.h:43
cl::opt< bool > SupportsHotColdNew
Indicate we are linking with an allocator that supports hot/cold operator new interfaces.
S1Ty set_intersection(const S1Ty &S1, const S2Ty &S2)
set_intersection(A, B) - Return A ^ B
Definition: SetOperations.h:83
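A small sketch of the SetOperations.h helpers referenced above (set_intersect, set_intersection, set_union, set_subtract, set_is_subset) applied to DenseSet; the element values are illustrative:

#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SetOperations.h"
#include <cassert>

using namespace llvm;

// In-place and copying set algebra over set-like containers.
void setOpsDemo() {
  DenseSet<uint32_t> A = {1, 2, 3};
  DenseSet<uint32_t> B = {2, 3, 4};

  DenseSet<uint32_t> Common = set_intersection(A, B); // {2, 3}, A unchanged
  assert(set_is_subset(Common, A) && set_is_subset(Common, B));

  set_intersect(A, B); // A := A ^ B -> {2, 3}
  set_union(A, B);     // A := A u B -> {2, 3, 4}, returns true if A changed
  set_subtract(A, B);  // A := A - B -> {}
}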
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
cl::opt< bool > EnableMemProfContextDisambiguation
Enable MemProf context disambiguation for thin link.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1914
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
Definition: APFixedPoint.h:293
Expected< T > errorOrToExpected(ErrorOr< T > &&EO)
Convert an ErrorOr<T> to an Expected<T>.
Definition: Error.h:1226
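A sketch combining MemoryBuffer::getFile, errorOrToExpected, getModuleSummaryIndex, and logAllUnhandledErrors to read a summary index from a bitcode file; loadSummaryIndex is a hypothetical helper and error banners are made up for the example:

#include "llvm/ADT/Twine.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/raw_ostream.h"
#include <memory>

using namespace llvm;

// Read a bitcode file and parse its module summary index, reporting any
// failure on stderr and returning nullptr.
std::unique_ptr<ModuleSummaryIndex> loadSummaryIndex(const Twine &Path) {
  Expected<std::unique_ptr<MemoryBuffer>> BufferOrErr =
      errorOrToExpected(MemoryBuffer::getFile(Path));
  if (!BufferOrErr) {
    logAllUnhandledErrors(BufferOrErr.takeError(), errs(),
                          "Error reading summary file: ");
    return nullptr;
  }
  Expected<std::unique_ptr<ModuleSummaryIndex>> IndexOrErr =
      getModuleSummaryIndex(**BufferOrErr);
  if (!IndexOrErr) {
    logAllUnhandledErrors(IndexOrErr.takeError(), errs(),
                          "Error parsing summary index: ");
    return nullptr;
  }
  return std::move(*IndexOrErr);
}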
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1849
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
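A short sketch of the STLExtras.h range helpers referenced above (none_of, find_if, is_contained, count, drop_begin); the data is illustrative:

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include <cassert>

using namespace llvm;

// Range-based wrappers over the usual <algorithm> calls.
void stlExtrasDemo() {
  SmallVector<int> Vals = {8, 1, 1, 4};

  assert(is_contained(Vals, 4));   // membership test over the whole range
  assert(count(Vals, 1) == 2);     // std::count over the whole range
  assert(none_of(Vals, [](int V) { return V < 0; }));

  auto It = find_if(Vals, [](int V) { return V > 5; });
  assert(It != Vals.end() && *It == 8);

  // drop_begin skips the first N (default 1) elements of the range.
  int SumTail = 0;
  for (int V : drop_begin(Vals))
    SumTail += V;
  assert(SumTail == 6);
}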
Function * CloneFunction(Function *F, ValueToValueMapTy &VMap, ClonedCodeInfo *CodeInfo=nullptr)
Return a copy of the specified function and add it to that function's module.
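A sketch of CloneFunction usage in the spirit of the ".memprof" function clones this pass creates; the helper name and the exact clone-name suffix are illustrative assumptions:

#include "llvm/ADT/Twine.h"
#include "llvm/IR/Function.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"

using namespace llvm;

// Create a numbered clone of F in the same module and give it a
// distinguishing suffix.
Function *makeMemProfClone(Function &F, unsigned CloneNo) {
  ValueToValueMapTy VMap;
  Function *Clone = CloneFunction(&F, VMap); // added to F's module
  Clone->setName(F.getName() + ".memprof." + Twine(CloneNo));
  return Clone;
}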
bool mayHaveMemprofSummary(const CallBase *CB)
Returns true if the instruction could have memprof metadata, used to ensure consistency between summa...
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
#define N
static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter, GraphType)
static const ContextNode< DerivedCCG, FuncTy, CallTy > * GetCallee(const EdgePtrTy &P)
std::shared_ptr< ContextEdge< DerivedCCG, FuncTy, CallTy > > EdgePtrTy
std::unique_ptr< ContextNode< DerivedCCG, FuncTy, CallTy > > NodePtrTy
Summary of memprof metadata on allocations.
SmallVector< uint8_t > Versions
Summary of memprof callsite metadata.
SmallVector< unsigned > Clones
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
An information struct used to provide DenseMap with the various necessary components for a given valu...
Definition: DenseMapInfo.h:52
Used in the streaming interface as the general argument type.
typename GraphType::UnknownGraphTypeError NodeRef
Definition: GraphTraits.h:80
Struct that holds a reference to a particular GUID in a global value summary.
ArrayRef< std::unique_ptr< GlobalValueSummary > > getSummaryList() const
GlobalValue::GUID getGUID() const