1//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements support for context disambiguation of allocation
10// calls for profile guided heap optimization. Specifically, it uses Memprof
11// profiles which indicate context specific allocation behavior (currently
12// distinguishing cold vs hot memory allocations). Cloning is performed to
13// expose the cold allocation call contexts, and the allocation calls are
14// subsequently annotated with an attribute for later transformation.
15//
16// The transformations can be performed either directly on IR (regular LTO), or
17// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
18// Both types of LTO operate on the same base graph representation, which
19// uses CRTP to support either IR or Index formats.
20//
21//===----------------------------------------------------------------------===//
22
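// Illustrative example (not from this file): conceptually, given source like
//
//   char *makeBuf() { return new char[1 << 20]; }   // allocation call
//   void keep()     { GlobalPtr = makeBuf(); }      // profiled context: rarely accessed -> cold
//   void use()      { consume(makeBuf()); }         // profiled context: not cold
//
// the profile records one MIB per distinct call context of the `new`. This
// pass clones makeBuf() (e.g. into makeBuf.memprof.1, using the clone suffix
// defined later in this file), redirects the cold context's caller to the
// clone, and annotates each version's allocation call so a later pass or a
// hot/cold-aware allocator can act on it. GlobalPtr and consume() are
// hypothetical names used only for this sketch.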
24#include "llvm/ADT/DenseMap.h"
25#include "llvm/ADT/DenseSet.h"
26#include "llvm/ADT/MapVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
36#include "llvm/IR/Constants.h"
38#include "llvm/IR/Module.h"
40#include "llvm/Pass.h"
45#include "llvm/Transforms/IPO.h"
47#include <sstream>
48#include <unordered_map>
49#include <vector>
50using namespace llvm;
51using namespace llvm::memprof;
52
53#define DEBUG_TYPE "memprof-context-disambiguation"
54
55STATISTIC(FunctionClonesAnalysis,
56 "Number of function clones created during whole program analysis");
57STATISTIC(FunctionClonesThinBackend,
58 "Number of function clones created during ThinLTO backend");
59STATISTIC(FunctionsClonedThinBackend,
60 "Number of functions that had clones created during ThinLTO backend");
61STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
62 "cloned) during whole program analysis");
63STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
64 "during whole program analysis");
65STATISTIC(AllocTypeNotColdThinBackend,
66 "Number of not cold static allocations (possibly cloned) during "
67 "ThinLTO backend");
68STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
69 "(possibly cloned) during ThinLTO backend");
70STATISTIC(OrigAllocsThinBackend,
71 "Number of original (not cloned) allocations with memprof profiles "
72 "during ThinLTO backend");
73STATISTIC(
74 AllocVersionsThinBackend,
75 "Number of allocation versions (including clones) during ThinLTO backend");
76STATISTIC(MaxAllocVersionsThinBackend,
77 "Maximum number of allocation versions created for an original "
78 "allocation during ThinLTO backend");
79STATISTIC(UnclonableAllocsThinBackend,
80 "Number of unclonable ambigous allocations during ThinLTO backend");
81STATISTIC(RemovedEdgesWithMismatchedCallees,
82 "Number of edges removed due to mismatched callees (profiled vs IR)");
83STATISTIC(FoundProfiledCalleeCount,
84 "Number of profiled callees found via tail calls");
85STATISTIC(FoundProfiledCalleeDepth,
86 "Aggregate depth of profiled callees found via tail calls");
87STATISTIC(FoundProfiledCalleeMaxDepth,
88 "Maximum depth of profiled callees found via tail calls");
89STATISTIC(FoundProfiledCalleeNonUniquelyCount,
90 "Number of profiled callees found via multiple tail call chains");
91
93 "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
94 cl::value_desc("filename"),
95 cl::desc("Specify the path prefix of the MemProf dot files."));
96
97static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
98 cl::Hidden,
99 cl::desc("Export graph to dot files."));
100
101static cl::opt<bool>
102 DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
103 cl::desc("Dump CallingContextGraph to stdout after each stage."));
104
105static cl::opt<bool>
106 VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
107 cl::desc("Perform verification checks on CallingContextGraph."));
108
109static cl::opt<bool>
110 VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
111 cl::desc("Perform frequent verification checks on nodes."));
112
114 "memprof-import-summary",
115 cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
116 cl::Hidden);
117
118static cl::opt<unsigned>
119 TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
120 cl::Hidden,
121 cl::desc("Max depth to recursively search for missing "
122 "frames through tail calls."));
123
124namespace llvm {
126 "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
127 cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));
128
129// Indicate we are linking with an allocator that supports hot/cold operator
130// new interfaces.
132 "supports-hot-cold-new", cl::init(false), cl::Hidden,
133 cl::desc("Linking with hot/cold operator new interfaces"));
134} // namespace llvm
135
136namespace {
137/// CRTP base for graphs built from either IR or ThinLTO summary index.
138///
139/// The graph represents the call contexts in all memprof metadata on allocation
140/// calls, with nodes for the allocations themselves, as well as for the calls
141/// in each context. The graph is initially built from the allocation memprof
142/// metadata (or summary) MIBs. It is then updated to match calls with callsite
143/// metadata onto the nodes, updating it to reflect any inlining performed on
144/// those calls.
145///
146/// Each MIB (representing an allocation's call context with allocation
147/// behavior) is assigned a unique context id during the graph build. The edges
148/// and nodes in the graph are decorated with the context ids they carry. This
149/// is used to correctly update the graph when cloning is performed so that we
150/// can uniquify the context for a single (possibly cloned) allocation.
151template <typename DerivedCCG, typename FuncTy, typename CallTy>
152class CallsiteContextGraph {
153public:
154 CallsiteContextGraph() = default;
155 CallsiteContextGraph(const CallsiteContextGraph &) = default;
156 CallsiteContextGraph(CallsiteContextGraph &&) = default;
157
158 /// Main entry point to perform analysis and transformations on graph.
159 bool process();
160
161 /// Perform cloning on the graph necessary to uniquely identify the allocation
162 /// behavior of an allocation based on its context.
163 void identifyClones();
164
165 /// Assign callsite clones to functions, cloning functions as needed to
166 /// accommodate the combinations of their callsite clones reached by callers.
167 /// For regular LTO this clones functions and callsites in the IR, but for
168 /// ThinLTO the cloning decisions are noted in the summaries and later applied
169 /// in applyImport.
170 bool assignFunctions();
171
172 void dump() const;
173 void print(raw_ostream &OS) const;
174
175 friend raw_ostream &operator<<(raw_ostream &OS,
176 const CallsiteContextGraph &CCG) {
177 CCG.print(OS);
178 return OS;
179 }
180
181 friend struct GraphTraits<
182 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
183 friend struct DOTGraphTraits<
184 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
185
186 void exportToDot(std::string Label) const;
187
188 /// Represents a function clone via FuncTy pointer and clone number pair.
189 struct FuncInfo final
190 : public std::pair<FuncTy *, unsigned /*Clone number*/> {
191 using Base = std::pair<FuncTy *, unsigned>;
192 FuncInfo(const Base &B) : Base(B) {}
193 FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
194 explicit operator bool() const { return this->first != nullptr; }
195 FuncTy *func() const { return this->first; }
196 unsigned cloneNo() const { return this->second; }
197 };
198
199 /// Represents a callsite clone via CallTy and clone number pair.
200 struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
201 using Base = std::pair<CallTy, unsigned>;
202 CallInfo(const Base &B) : Base(B) {}
203 CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
204 : Base(Call, CloneNo) {}
205 explicit operator bool() const { return (bool)this->first; }
206 CallTy call() const { return this->first; }
207 unsigned cloneNo() const { return this->second; }
208 void setCloneNo(unsigned N) { this->second = N; }
209 void print(raw_ostream &OS) const {
210 if (!operator bool()) {
211 assert(!cloneNo());
212 OS << "null Call";
213 return;
214 }
215 call()->print(OS);
216 OS << "\t(clone " << cloneNo() << ")";
217 }
218 void dump() const {
219 print(dbgs());
220 dbgs() << "\n";
221 }
222 friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
223 Call.print(OS);
224 return OS;
225 }
226 };
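// Illustrative use of the pairs above (hypothetical values): clone number 0
// always refers to the original function or call.
//
//   FuncInfo OrigF(F);        // (F, 0): the original function
//   FuncInfo CloneF(F1, 1);   // hypothetical first clone of F
//   CallInfo CI(I, 2);        // call I belongs to clone #2 of its function
//   if (CI)                   // non-null call
//     CI.print(dbgs());       // prints the call followed by "\t(clone 2)"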
227
228 struct ContextEdge;
229
230 /// Node in the Callsite Context Graph
231 struct ContextNode {
232 // Keep this for now since, in the IR case where we have an Instruction*,
233 // whether the call is an allocation is not as immediately discoverable. Used
234 // for printing richer information when dumping the graph.
235 bool IsAllocation;
236
237 // Keeps track of when the Call was reset to null because there was
238 // recursion.
239 bool Recursive = false;
240
241 // The corresponding allocation or interior call.
242 CallInfo Call;
243
244 // For alloc nodes this is a unique id assigned when constructed, and for
245 // callsite stack nodes it is the original stack id when the node is
246 // constructed from the memprof MIB metadata on the alloc nodes. Note that
247 // this is only used when matching callsite metadata onto the stack nodes
248 // created when processing the allocation memprof MIBs, and for labeling
249 // nodes in the dot graph. Therefore we don't bother to assign a value for
250 // clones.
251 uint64_t OrigStackOrAllocId = 0;
252
253 // This will be formed by ORing together the AllocationType enum values
254 // for contexts including this node.
255 uint8_t AllocTypes = 0;
256
257 // Edges to all callees in the profiled call stacks.
258 // TODO: Should this be a map (from Callee node) for more efficient lookup?
259 std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;
260
261 // Edges to all callers in the profiled call stacks.
262 // TODO: Should this be a map (from Caller node) for more efficient lookup?
263 std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
264
265 // The set of IDs for contexts including this node.
266 DenseSet<uint32_t> ContextIds;
267
268 // List of clones of this ContextNode, initially empty.
269 std::vector<ContextNode *> Clones;
270
271 // If a clone, points to the original uncloned node.
272 ContextNode *CloneOf = nullptr;
273
274 ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}
275
276 ContextNode(bool IsAllocation, CallInfo C)
277 : IsAllocation(IsAllocation), Call(C) {}
278
279 void addClone(ContextNode *Clone) {
280 if (CloneOf) {
281 CloneOf->Clones.push_back(Clone);
282 Clone->CloneOf = CloneOf;
283 } else {
284 Clones.push_back(Clone);
285 assert(!Clone->CloneOf);
286 Clone->CloneOf = this;
287 }
288 }
289
290 ContextNode *getOrigNode() {
291 if (!CloneOf)
292 return this;
293 return CloneOf;
294 }
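// Example of the clone bookkeeping above (illustrative): clones are always
// recorded on, and point back to, the original node, even when a clone is
// itself cloned.
//
//   Orig->addClone(C1);                  // Orig->Clones == {C1}, C1->CloneOf == Orig
//   C1->addClone(C2);                    // C2 appended to Orig->Clones, C2->CloneOf == Orig
//   assert(C2->getOrigNode() == Orig);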
295
296 void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
297 unsigned int ContextId);
298
299 ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
300 ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
301 void eraseCalleeEdge(const ContextEdge *Edge);
302 void eraseCallerEdge(const ContextEdge *Edge);
303
304 void setCall(CallInfo C) { Call = C; }
305
306 bool hasCall() const { return (bool)Call.call(); }
307
308 void printCall(raw_ostream &OS) const { Call.print(OS); }
309
310 // True if this node was effectively removed from the graph, in which case
311 // its context id set, caller edges, and callee edges should all be empty.
312 bool isRemoved() const {
313 // Note that we can have non-empty context ids with empty caller and
314 // callee edges if the graph ends up with a single node.
315 if (ContextIds.empty())
316 assert(CalleeEdges.empty() && CallerEdges.empty() &&
317 "Context ids empty but at least one of callee and caller edges "
318 "were not!");
319 return ContextIds.empty();
320 }
321
322 void dump() const;
323 void print(raw_ostream &OS) const;
324
325 friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
326 Node.print(OS);
327 return OS;
328 }
329 };
330
331 /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
332 /// callee.
333 struct ContextEdge {
334 ContextNode *Callee;
335 ContextNode *Caller;
336
337 // This will be formed by ORing together the AllocationType enum values
338 // for contexts including this edge.
339 uint8_t AllocTypes = 0;
340
341 // The set of IDs for contexts including this edge.
342 DenseSet<uint32_t> ContextIds;
343
344 ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
345 DenseSet<uint32_t> ContextIds)
346 : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
347 ContextIds(ContextIds) {}
348
349 DenseSet<uint32_t> &getContextIds() { return ContextIds; }
350
351 void dump() const;
352 void print(raw_ostream &OS) const;
353
354 friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
355 Edge.print(OS);
356 return OS;
357 }
358 };
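// The AllocTypes members above (on both nodes and edges) are plain bitmasks
// over the AllocationType enum. For example:
//
//   uint8_t T = (uint8_t)AllocationType::NotCold;
//   T |= (uint8_t)AllocationType::Cold;   // contexts of both kinds pass through here
//   bool Ambiguous =
//       T == ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold);
//
// Ambiguous nodes and edges are what the cloning below tries to split apart.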
359
360 /// Helpers to remove callee edges that have allocation type None (due to not
361 /// carrying any context ids) after transformations.
362 void removeNoneTypeCalleeEdges(ContextNode *Node);
363 void
364 recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
365 DenseSet<const ContextNode *> &Visited);
366
367protected:
368 /// Get a list of nodes corresponding to the stack ids in the given callsite
369 /// context.
370 template <class NodeT, class IteratorT>
371 std::vector<uint64_t>
372 getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);
373
374 /// Adds nodes for the given allocation and any stack ids on its memprof MIB
375 /// metadata (or summary).
376 ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);
377
378 /// Adds nodes for the given MIB stack ids.
379 template <class NodeT, class IteratorT>
380 void addStackNodesForMIB(ContextNode *AllocNode,
381 CallStack<NodeT, IteratorT> &StackContext,
382 CallStack<NodeT, IteratorT> &CallsiteContext,
383 AllocationType AllocType);
384
385 /// Matches all callsite metadata (or summary) to the nodes created for
386 /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
387 /// inlining performed on those callsite instructions.
388 void updateStackNodes();
389
390 /// Update graph to conservatively handle any callsite stack nodes that target
391 /// multiple different callee target functions.
392 void handleCallsitesWithMultipleTargets();
393
394 /// Save lists of calls with MemProf metadata in each function, for faster
395 /// iteration.
396 MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
397
398 /// Map from callsite node to the enclosing caller function.
399 std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
400
401private:
402 using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;
403
404 using CallContextInfo = std::tuple<CallTy, std::vector<uint64_t>,
405 const FuncTy *, DenseSet<uint32_t>>;
406
407 /// Assigns the given Node to calls at or inlined into the location with
408 /// the Node's stack id, after post order traversing and processing its
409 /// caller nodes. Uses the call information recorded in the given
410 /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
411 /// as needed. Called by updateStackNodes which sets up the given
412 /// StackIdToMatchingCalls map.
413 void assignStackNodesPostOrder(
414 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
415 DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls);
416
417 /// Duplicates the given set of context ids, updating the provided
418 /// map from each original id with the newly generated context ids,
419 /// and returning the new duplicated id set.
420 DenseSet<uint32_t> duplicateContextIds(
421 const DenseSet<uint32_t> &StackSequenceContextIds,
422 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
423
424 /// Propagates all duplicated context ids across the graph.
425 void propagateDuplicateContextIds(
426 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
427
428 /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
429 /// else to its callers. Also updates OrigNode's edges to remove any context
430 /// ids moved to the newly created edge.
431 void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
432 bool TowardsCallee);
433
434 /// Get the stack id corresponding to the given Id or Index (for IR this will
435 /// return itself, for a summary index this will return the id recorded in the
436 /// index for that stack id index value).
437 uint64_t getStackId(uint64_t IdOrIndex) const {
438 return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
439 }
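// The static_cast dispatch above is the CRTP idiom used for every IR-vs-index
// specific hook in this class: calls resolve at compile time with no virtual
// dispatch. A minimal standalone sketch of the same pattern (hypothetical
// types, not part of this file):
//
//   template <typename Derived> struct GraphBase {
//     uint64_t stackId(uint64_t Id) const {
//       return static_cast<const Derived *>(this)->stackId(Id);
//     }
//   };
//   struct IRGraph : GraphBase<IRGraph> {
//     uint64_t stackId(uint64_t Id) const { return Id; } // IR: id maps to itself
//   };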
440
441 /// Returns true if the given call targets the callee of the given edge, or if
442 /// we were able to identify the call chain through intermediate tail calls.
443 /// In the latter case new context nodes are added to the graph for the
444 /// identified tail calls, and their synthesized nodes are added to
445 /// TailCallToContextNodeMap. The EdgeIter is updated in either case to the
446 /// next element after the input position (either incremented or updated after
447 /// removing the old edge).
448 bool
449 calleesMatch(CallTy Call, EdgeIter &EI,
450 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);
451
452 /// Returns true if the given call targets the given function, or if we were
453 /// able to identify the call chain through intermediate tail calls (in which
454 /// case FoundCalleeChain will be populated).
455 bool calleeMatchesFunc(
456 CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
457 std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
458 return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
459 Call, Func, CallerFunc, FoundCalleeChain);
460 }
461
462 /// Get a list of nodes corresponding to the stack ids in the given
463 /// callsite's context.
464 std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
465 return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
466 Call);
467 }
468
469 /// Get the last stack id in the context for the callsite.
470 uint64_t getLastStackId(CallTy Call) {
471 return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
472 }
473
474 /// Update the allocation call to record type of allocated memory.
475 void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
476 AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
477 static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
478 }
479
480 /// Update non-allocation call to invoke (possibly cloned) function
481 /// CalleeFunc.
482 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
483 static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
484 }
485
486 /// Clone the given function for the given callsite, recording the mapping of
487 /// all of the function's tracked calls to their new versions in the CallMap.
488 /// Assigns new clones to clone number CloneNo.
489 FuncInfo cloneFunctionForCallsite(
490 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
491 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
492 return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
493 Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
494 }
495
496 /// Gets a label to use in the dot graph for the given call clone in the given
497 /// function.
498 std::string getLabel(const FuncTy *Func, const CallTy Call,
499 unsigned CloneNo) const {
500 return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
501 }
502
503 /// Helpers to find the node corresponding to the given call or stackid.
504 ContextNode *getNodeForInst(const CallInfo &C);
505 ContextNode *getNodeForAlloc(const CallInfo &C);
506 ContextNode *getNodeForStackId(uint64_t StackId);
507
508 /// Removes the node information recorded for the given call.
509 void unsetNodeForInst(const CallInfo &C);
510
511 /// Computes the alloc type corresponding to the given context ids, by
512 /// unioning their recorded alloc types.
513 uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds);
514
515 /// Returns the allocation type of the intersection of the contexts of two
516 /// nodes (based on their provided context id sets), optimized for the case
517 /// when Node1Ids is smaller than Node2Ids.
518 uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
519 const DenseSet<uint32_t> &Node2Ids);
520
521 /// Returns the allocation type of the intersection of the contexts of two
522 /// nodes (based on their provided context id sets).
523 uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
524 const DenseSet<uint32_t> &Node2Ids);
525
526 /// Create a clone of Edge's callee and move Edge to that new callee node,
527 /// performing the necessary context id and allocation type updates.
528 /// If callee's caller edge iterator is supplied, it is updated when removing
529 /// the edge from that list. If ContextIdsToMove is non-empty, only that
530 /// subset of Edge's ids are moved to an edge to the new callee.
531 ContextNode *
532 moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
533 EdgeIter *CallerEdgeI = nullptr,
534 DenseSet<uint32_t> ContextIdsToMove = {});
535
536 /// Change the callee of Edge to existing callee clone NewCallee, performing
537 /// the necessary context id and allocation type updates.
538 /// If callee's caller edge iterator is supplied, it is updated when removing
539 /// the edge from that list. If ContextIdsToMove is non-empty, only that
540 /// subset of Edge's ids are moved to an edge to the new callee.
541 void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
542 ContextNode *NewCallee,
543 EdgeIter *CallerEdgeI = nullptr,
544 bool NewClone = false,
545 DenseSet<uint32_t> ContextIdsToMove = {});
546
547 /// Recursively perform cloning on the graph for the given Node and its
548 /// callers, in order to uniquely identify the allocation behavior of an
549 /// allocation given its context. The context ids of the allocation being
550 /// processed are given in AllocContextIds.
551 void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
552 const DenseSet<uint32_t> &AllocContextIds);
553
554 /// Map from each context ID to the AllocationType assigned to that context.
555 std::map<uint32_t, AllocationType> ContextIdToAllocationType;
556
557 /// Identifies the context node created for a stack id when adding the MIB
558 /// contexts to the graph. This is used to locate the context nodes when
559 /// trying to assign the corresponding callsites with those stack ids to these
560 /// nodes.
561 std::map<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;
562
563 /// Maps to track the calls to their corresponding nodes in the graph.
564 MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
565 MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;
566
567 /// Owner of all ContextNode unique_ptrs.
568 std::vector<std::unique_ptr<ContextNode>> NodeOwner;
569
570 /// Perform sanity checks on graph when requested.
571 void check() const;
572
573 /// Keeps track of the last unique context id assigned.
574 unsigned int LastContextId = 0;
575};
576
577template <typename DerivedCCG, typename FuncTy, typename CallTy>
578using ContextNode =
579 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
580template <typename DerivedCCG, typename FuncTy, typename CallTy>
581using ContextEdge =
582 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
583template <typename DerivedCCG, typename FuncTy, typename CallTy>
584using FuncInfo =
585 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
586template <typename DerivedCCG, typename FuncTy, typename CallTy>
587using CallInfo =
588 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;
589
590/// CRTP derived class for graphs built from IR (regular LTO).
591class ModuleCallsiteContextGraph
592 : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
593 Instruction *> {
594public:
595 ModuleCallsiteContextGraph(
596 Module &M,
597 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
598
599private:
600 friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
601 Instruction *>;
602
603 uint64_t getStackId(uint64_t IdOrIndex) const;
604 bool calleeMatchesFunc(
605 Instruction *Call, const Function *Func, const Function *CallerFunc,
606 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
607 bool findProfiledCalleeThroughTailCalls(
608 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
609 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
610 bool &FoundMultipleCalleeChains);
611 uint64_t getLastStackId(Instruction *Call);
612 std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
613 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
614 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
615 CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
616 Instruction *>::FuncInfo
617 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
618 std::map<CallInfo, CallInfo> &CallMap,
619 std::vector<CallInfo> &CallsWithMetadataInFunc,
620 unsigned CloneNo);
621 std::string getLabel(const Function *Func, const Instruction *Call,
622 unsigned CloneNo) const;
623
624 const Module &Mod;
625 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
626};
627
628/// Represents a call in the summary index graph, which can either be an
629/// allocation or an interior callsite node in an allocation's context.
630/// Holds a pointer to the corresponding data structure in the index.
631struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
632 IndexCall() : PointerUnion() {}
633 IndexCall(std::nullptr_t) : IndexCall() {}
634 IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
635 IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
636 IndexCall(PointerUnion PT) : PointerUnion(PT) {}
637
638 IndexCall *operator->() { return this; }
639
640 PointerUnion<CallsiteInfo *, AllocInfo *> getBase() const { return *this; }
641
642 void print(raw_ostream &OS) const {
643 if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(getBase())) {
644 OS << *AI;
645 } else {
646 auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(getBase());
647 assert(CI);
648 OS << *CI;
649 }
650 }
651};
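// IndexCall lets the index-based graph treat allocation records and interior
// callsite records uniformly. Illustrative use (hypothetical pointer value):
//
//   IndexCall IC(SomeCallsiteInfoPtr);
//   if (auto *CI = dyn_cast_if_present<CallsiteInfo *>(IC.getBase()))
//     /* interior stack frame from a summary callsite record */;
//   else if (auto *AI = dyn_cast_if_present<AllocInfo *>(IC.getBase()))
//     /* allocation record carrying its MIB contexts */;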
652
653/// CRTP derived class for graphs built from summary index (ThinLTO).
654class IndexCallsiteContextGraph
655 : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
656 IndexCall> {
657public:
658 IndexCallsiteContextGraph(
659 ModuleSummaryIndex &Index,
660 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
661 isPrevailing);
662
663 ~IndexCallsiteContextGraph() {
664 // Now that we are done with the graph it is safe to add the new
665 // CallsiteInfo structs to the function summary vectors. The graph nodes
666 // point into locations within these vectors, so we don't want to add them
667 // any earlier.
668 for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
669 auto *FS = I.first;
670 for (auto &Callsite : I.second)
671 FS->addCallsite(*Callsite.second);
672 }
673 }
674
675private:
676 friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
677 IndexCall>;
678
679 uint64_t getStackId(uint64_t IdOrIndex) const;
680 bool calleeMatchesFunc(
681 IndexCall &Call, const FunctionSummary *Func,
682 const FunctionSummary *CallerFunc,
683 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
684 bool findProfiledCalleeThroughTailCalls(
685 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
686 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
687 bool &FoundMultipleCalleeChains);
688 uint64_t getLastStackId(IndexCall &Call);
689 std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
690 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
691 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
692 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
693 IndexCall>::FuncInfo
694 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
695 std::map<CallInfo, CallInfo> &CallMap,
696 std::vector<CallInfo> &CallsWithMetadataInFunc,
697 unsigned CloneNo);
698 std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
699 unsigned CloneNo) const;
700
701 // Saves the mapping from function summaries containing memprof records back
702 // to their VI, for use in checking and debugging.
703 std::map<const FunctionSummary *, ValueInfo> FSToVIMap;
704
705 const ModuleSummaryIndex &Index;
706 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
707 isPrevailing;
708
709 // Saves/owns the callsite info structures synthesized for missing tail call
710 // frames that we discover while building the graph.
711 // It maps from the summary of the function making the tail call, to a map
712 // of callee ValueInfo to corresponding synthesized callsite info.
713 std::unordered_map<FunctionSummary *,
714 std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
715 FunctionCalleesToSynthesizedCallsiteInfos;
716};
717} // namespace
718
719namespace llvm {
720template <>
721struct DenseMapInfo<typename CallsiteContextGraph<
722 ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
723 : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
724template <>
725struct DenseMapInfo<typename CallsiteContextGraph<
726 IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
727 : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
728template <>
729struct DenseMapInfo<IndexCall>
730 : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
731} // end namespace llvm
732
733namespace {
734
735struct FieldSeparator {
736 bool Skip = true;
737 const char *Sep;
738
739 FieldSeparator(const char *Sep = ", ") : Sep(Sep) {}
740};
741
742raw_ostream &operator<<(raw_ostream &OS, FieldSeparator &FS) {
743 if (FS.Skip) {
744 FS.Skip = false;
745 return OS;
746 }
747 return OS << FS.Sep;
748}
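// Typical use of FieldSeparator when printing comma-separated lists
// (illustrative):
//
//   FieldSeparator FS;
//   for (uint32_t Id : ContextIds)
//     OS << FS << Id;   // emits "1, 2, 3" with no leading separator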
749
750// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
751// type we should actually use on the corresponding allocation.
752// If we can't clone a node that has NotCold+Cold alloc type, we will fall
753// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
754// from NotCold.
755AllocationType allocTypeToUse(uint8_t AllocTypes) {
756 assert(AllocTypes != (uint8_t)AllocationType::None);
757 if (AllocTypes ==
758 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
759 return AllocationType::NotCold;
760 else
761 return (AllocationType)AllocTypes;
762}
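// For example, with the enum values used above:
//   allocTypeToUse((uint8_t)AllocationType::Cold) == AllocationType::Cold
//   allocTypeToUse((uint8_t)AllocationType::NotCold |
//                  (uint8_t)AllocationType::Cold)  == AllocationType::NotCold
// i.e. a still-ambiguous NotCold+Cold context falls back to NotCold behavior.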
763
764// Helper to check if the alloc types for all edges recorded in the
765// InAllocTypes vector match the alloc types for all edges in the Edges
766// vector.
767template <typename DerivedCCG, typename FuncTy, typename CallTy>
768bool allocTypesMatch(
769 const std::vector<uint8_t> &InAllocTypes,
770 const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
771 &Edges) {
772 return std::equal(
773 InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(),
774 [](const uint8_t &l,
775 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
776 // Can share if one of the edges is None type - don't
777 // care about the type along that edge as it doesn't
778 // exist for those context ids.
779 if (l == (uint8_t)AllocationType::None ||
780 r->AllocTypes == (uint8_t)AllocationType::None)
781 return true;
782 return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
783 });
784}
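// Note: the three-iterator std::equal overload above only walks
// InAllocTypes.size() elements, so callers are expected to pass an Edges
// vector of at least that length, built in the same order.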
785
786} // end anonymous namespace
787
788template <typename DerivedCCG, typename FuncTy, typename CallTy>
789typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
790CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
791 const CallInfo &C) {
792 ContextNode *Node = getNodeForAlloc(C);
793 if (Node)
794 return Node;
795
796 return NonAllocationCallToContextNodeMap.lookup(C);
797}
798
799template <typename DerivedCCG, typename FuncTy, typename CallTy>
800typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
801CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
802 const CallInfo &C) {
803 return AllocationCallToContextNodeMap.lookup(C);
804}
805
806template <typename DerivedCCG, typename FuncTy, typename CallTy>
807typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
808CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
809 uint64_t StackId) {
810 auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
811 if (StackEntryNode != StackEntryIdToContextNodeMap.end())
812 return StackEntryNode->second;
813 return nullptr;
814}
815
816template <typename DerivedCCG, typename FuncTy, typename CallTy>
817void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::unsetNodeForInst(
818 const CallInfo &C) {
819 AllocationCallToContextNodeMap.erase(C) ||
820 NonAllocationCallToContextNodeMap.erase(C);
821 assert(!AllocationCallToContextNodeMap.count(C) &&
822 !NonAllocationCallToContextNodeMap.count(C));
823}
824
825template <typename DerivedCCG, typename FuncTy, typename CallTy>
826void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
827 addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
828 unsigned int ContextId) {
829 for (auto &Edge : CallerEdges) {
830 if (Edge->Caller == Caller) {
831 Edge->AllocTypes |= (uint8_t)AllocType;
832 Edge->getContextIds().insert(ContextId);
833 return;
834 }
835 }
836 std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
837 this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
838 CallerEdges.push_back(Edge);
839 Caller->CalleeEdges.push_back(Edge);
840}
841
842template <typename DerivedCCG, typename FuncTy, typename CallTy>
843void CallsiteContextGraph<
844 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
845 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
846 auto Edge = *EI;
847 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
848 assert(Edge->ContextIds.empty());
849 Edge->Callee->eraseCallerEdge(Edge.get());
850 EI = Node->CalleeEdges.erase(EI);
851 } else
852 ++EI;
853 }
854}
855
856template <typename DerivedCCG, typename FuncTy, typename CallTy>
857typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
858CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
859 findEdgeFromCallee(const ContextNode *Callee) {
860 for (const auto &Edge : CalleeEdges)
861 if (Edge->Callee == Callee)
862 return Edge.get();
863 return nullptr;
864}
865
866template <typename DerivedCCG, typename FuncTy, typename CallTy>
867typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
868CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
869 findEdgeFromCaller(const ContextNode *Caller) {
870 for (const auto &Edge : CallerEdges)
871 if (Edge->Caller == Caller)
872 return Edge.get();
873 return nullptr;
874}
875
876template <typename DerivedCCG, typename FuncTy, typename CallTy>
877void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
878 eraseCalleeEdge(const ContextEdge *Edge) {
879 auto EI = llvm::find_if(
880 CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
881 return CalleeEdge.get() == Edge;
882 });
883 assert(EI != CalleeEdges.end());
884 CalleeEdges.erase(EI);
885}
886
887template <typename DerivedCCG, typename FuncTy, typename CallTy>
888void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
889 eraseCallerEdge(const ContextEdge *Edge) {
890 auto EI = llvm::find_if(
891 CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
892 return CallerEdge.get() == Edge;
893 });
894 assert(EI != CallerEdges.end());
895 CallerEdges.erase(EI);
896}
897
898template <typename DerivedCCG, typename FuncTy, typename CallTy>
899uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
900 DenseSet<uint32_t> &ContextIds) {
901 uint8_t BothTypes =
902 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
903 uint8_t AllocType = (uint8_t)AllocationType::None;
904 for (auto Id : ContextIds) {
905 AllocType |= (uint8_t)ContextIdToAllocationType[Id];
906 // Bail early if alloc type reached both, no further refinement.
907 if (AllocType == BothTypes)
908 return AllocType;
909 }
910 return AllocType;
911}
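// For example, if ContextIds == {1, 2} with context 1 recorded as Cold and
// context 2 as NotCold, the result is the Cold|NotCold bitmask; allocTypeToUse
// later collapses that ambiguous value back to NotCold if cloning cannot
// split it.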
912
913template <typename DerivedCCG, typename FuncTy, typename CallTy>
914uint8_t
915CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
916 const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
917 uint8_t BothTypes =
918 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
919 uint8_t AllocType = (uint8_t)AllocationType::None;
920 for (auto Id : Node1Ids) {
921 if (!Node2Ids.count(Id))
922 continue;
923 AllocType |= (uint8_t)ContextIdToAllocationType[Id];
924 // Bail early if alloc type reached both, no further refinement.
925 if (AllocType == BothTypes)
926 return AllocType;
927 }
928 return AllocType;
929}
930
931template <typename DerivedCCG, typename FuncTy, typename CallTy>
932uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
933 const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
934 if (Node1Ids.size() < Node2Ids.size())
935 return intersectAllocTypesImpl(Node1Ids, Node2Ids);
936 else
937 return intersectAllocTypesImpl(Node2Ids, Node1Ids);
938}
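// (Iterating the smaller id set and probing the larger one keeps the cost of
// the intersection proportional to the smaller set's size.)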
939
940template <typename DerivedCCG, typename FuncTy, typename CallTy>
941typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
942CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
943 CallInfo Call, const FuncTy *F) {
944 assert(!getNodeForAlloc(Call));
945 NodeOwner.push_back(
946 std::make_unique<ContextNode>(/*IsAllocation=*/true, Call));
947 ContextNode *AllocNode = NodeOwner.back().get();
948 AllocationCallToContextNodeMap[Call] = AllocNode;
949 NodeToCallingFunc[AllocNode] = F;
950 // Use LastContextId as a unique id for MIB allocation nodes.
951 AllocNode->OrigStackOrAllocId = LastContextId;
952 // Alloc type should be updated as we add in the MIBs. We should assert
953 // afterwards that it is not still None.
954 AllocNode->AllocTypes = (uint8_t)AllocationType::None;
955
956 return AllocNode;
957}
958
959template <typename DerivedCCG, typename FuncTy, typename CallTy>
960template <class NodeT, class IteratorT>
961void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
962 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
963 CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType) {
964 // Treat the hot alloc type as NotCold until the disambiguation for "hot"
965 // is done.
966 if (AllocType == AllocationType::Hot)
967 AllocType = AllocationType::NotCold;
968
969 ContextIdToAllocationType[++LastContextId] = AllocType;
970
971 // Update alloc type and context ids for this MIB.
972 AllocNode->AllocTypes |= (uint8_t)AllocType;
973 AllocNode->ContextIds.insert(LastContextId);
974
975 // Now add or update nodes for each stack id in alloc's context.
976 // Later when processing the stack ids on non-alloc callsites we will adjust
977 // for any inlining in the context.
978 ContextNode *PrevNode = AllocNode;
979 // Look for recursion (direct recursion should have been collapsed by
980 // module summary analysis; here we should just be detecting mutual
981 // recursion). Mark these nodes so we don't try to clone them.
982 SmallSet<uint64_t, 8> StackIdSet;
983 // Skip any on the allocation call (inlining).
984 for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
985 ContextIter != StackContext.end(); ++ContextIter) {
986 auto StackId = getStackId(*ContextIter);
987 ContextNode *StackNode = getNodeForStackId(StackId);
988 if (!StackNode) {
989 NodeOwner.push_back(
990 std::make_unique<ContextNode>(/*IsAllocation=*/false));
991 StackNode = NodeOwner.back().get();
992 StackEntryIdToContextNodeMap[StackId] = StackNode;
993 StackNode->OrigStackOrAllocId = StackId;
994 }
995 auto Ins = StackIdSet.insert(StackId);
996 if (!Ins.second)
997 StackNode->Recursive = true;
998 StackNode->ContextIds.insert(LastContextId);
999 StackNode->AllocTypes |= (uint8_t)AllocType;
1000 PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
1001 PrevNode = StackNode;
1002 }
1003}
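// Illustrative walk-through of the above (hypothetical stack ids): for an MIB
// whose pruned context is
//   alloc (id 1) <- foo (id 2) <- bar (id 3)
// a fresh context id is assigned, nodes for foo and bar are created or reused,
// all three nodes get that context id and the MIB's alloc type OR'd in, and
// caller edges alloc->foo and foo->bar are added (or updated) carrying the new
// id.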
1004
1005template <typename DerivedCCG, typename FuncTy, typename CallTy>
1006DenseSet<uint32_t>
1007CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
1008 const DenseSet<uint32_t> &StackSequenceContextIds,
1009 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1010 DenseSet<uint32_t> NewContextIds;
1011 for (auto OldId : StackSequenceContextIds) {
1012 NewContextIds.insert(++LastContextId);
1013 OldToNewContextIds[OldId].insert(LastContextId);
1014 assert(ContextIdToAllocationType.count(OldId));
1015 // The new context has the same allocation type as original.
1016 ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
1017 }
1018 return NewContextIds;
1019}
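// For example, duplicating {5, 7} might return {12, 13} (the next unused
// LastContextId values), record OldToNewContextIds[5] = {12} and
// OldToNewContextIds[7] = {13}, and give the new ids the same allocation types
// as the originals.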
1020
1021template <typename DerivedCCG, typename FuncTy, typename CallTy>
1022void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1023 propagateDuplicateContextIds(
1024 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1025 // Build a set of duplicated context ids corresponding to the input id set.
1026 auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
1027 DenseSet<uint32_t> NewIds;
1028 for (auto Id : ContextIds)
1029 if (auto NewId = OldToNewContextIds.find(Id);
1030 NewId != OldToNewContextIds.end())
1031 NewIds.insert(NewId->second.begin(), NewId->second.end());
1032 return NewIds;
1033 };
1034
1035 // Recursively update context ids sets along caller edges.
1036 auto UpdateCallers = [&](ContextNode *Node,
1037 DenseSet<const ContextEdge *> &Visited,
1038 auto &&UpdateCallers) -> void {
1039 for (const auto &Edge : Node->CallerEdges) {
1040 auto Inserted = Visited.insert(Edge.get());
1041 if (!Inserted.second)
1042 continue;
1043 ContextNode *NextNode = Edge->Caller;
1044 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
1045 // Only need to recursively iterate to NextNode via this caller edge if
1046 // it resulted in any added ids to NextNode.
1047 if (!NewIdsToAdd.empty()) {
1048 Edge->getContextIds().insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
1049 NextNode->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
1050 UpdateCallers(NextNode, Visited, UpdateCallers);
1051 }
1052 }
1053 };
1054
1055 DenseSet<const ContextEdge *> Visited;
1056 for (auto &Entry : AllocationCallToContextNodeMap) {
1057 auto *Node = Entry.second;
1058 // Update ids on the allocation nodes before calling the recursive
1059 // update along caller edges, since this simplifies the logic during
1060 // that traversal.
1061 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Node->ContextIds);
1062 Node->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
1063 UpdateCallers(Node, Visited, UpdateCallers);
1064 }
1065}
1066
1067template <typename DerivedCCG, typename FuncTy, typename CallTy>
1068void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
1069 ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee) {
1070 // Make a copy of the context ids, since this will be adjusted below as they
1071 // are moved.
1072 DenseSet<uint32_t> RemainingContextIds = NewNode->ContextIds;
1073 auto &OrigEdges =
1074 TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
1075 // Increment iterator in loop so that we can remove edges as needed.
1076 for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
1077 auto Edge = *EI;
1078 // Remove any matching context ids from Edge and return the set that were
1079 // found and removed; these become the new edge's context ids. Also update
1080 // the remaining (not found) ids.
1081 DenseSet<uint32_t> NewEdgeContextIds, NotFoundContextIds;
1082 set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
1083 NotFoundContextIds);
1084 RemainingContextIds.swap(NotFoundContextIds);
1085 // If no matching context ids for this edge, skip it.
1086 if (NewEdgeContextIds.empty()) {
1087 ++EI;
1088 continue;
1089 }
1090 if (TowardsCallee) {
1091 auto NewEdge = std::make_shared<ContextEdge>(
1092 Edge->Callee, NewNode, computeAllocType(NewEdgeContextIds),
1093 NewEdgeContextIds);
1094 NewNode->CalleeEdges.push_back(NewEdge);
1095 NewEdge->Callee->CallerEdges.push_back(NewEdge);
1096 } else {
1097 auto NewEdge = std::make_shared<ContextEdge>(
1098 NewNode, Edge->Caller, computeAllocType(NewEdgeContextIds),
1099 NewEdgeContextIds);
1100 NewNode->CallerEdges.push_back(NewEdge);
1101 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
1102 }
1103 // Remove old edge if context ids empty.
1104 if (Edge->getContextIds().empty()) {
1105 if (TowardsCallee) {
1106 Edge->Callee->eraseCallerEdge(Edge.get());
1107 EI = OrigNode->CalleeEdges.erase(EI);
1108 } else {
1109 Edge->Caller->eraseCalleeEdge(Edge.get());
1110 EI = OrigNode->CallerEdges.erase(EI);
1111 }
1112 continue;
1113 }
1114 ++EI;
1115 }
1116}
1117
1118template <typename DerivedCCG, typename FuncTy, typename CallTy>
1119void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1120 assignStackNodesPostOrder(ContextNode *Node,
1121 DenseSet<const ContextNode *> &Visited,
1122 DenseMap<uint64_t, std::vector<CallContextInfo>>
1123 &StackIdToMatchingCalls) {
1124 auto Inserted = Visited.insert(Node);
1125 if (!Inserted.second)
1126 return;
1127 // Post order traversal. Iterate over a copy since we may add nodes and
1128 // therefore new callers during the recursive call, invalidating any
1129 // iterator over the original edge vector. We don't need to process these
1130 // new nodes as they were already processed on creation.
1131 auto CallerEdges = Node->CallerEdges;
1132 for (auto &Edge : CallerEdges) {
1133 // Skip any that have been removed during the recursion.
1134 if (!Edge)
1135 continue;
1136 assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls);
1137 }
1138
1139 // If this node's stack id is in the map, update the graph to contain new
1140 // nodes representing any inlining at interior callsites. Note we move the
1141 // associated context ids over to the new nodes.
1142
1143 // Ignore this node if it is for an allocation or we didn't record any
1144 // stack id lists ending at it.
1145 if (Node->IsAllocation ||
1146 !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
1147 return;
1148
1149 auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
1150 // Handle the simple case first. A single call with a single stack id.
1151 // In this case there is no need to create any new context nodes, simply
1152 // assign the context node for stack id to this Call.
1153 if (Calls.size() == 1) {
1154 auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
1155 if (Ids.size() == 1) {
1156 assert(SavedContextIds.empty());
1157 // It should be this Node
1158 assert(Node == getNodeForStackId(Ids[0]));
1159 if (Node->Recursive)
1160 return;
1161 Node->setCall(Call);
1162 NonAllocationCallToContextNodeMap[Call] = Node;
1163 NodeToCallingFunc[Node] = Func;
1164 return;
1165 }
1166 }
1167
1168 // Find the node for the last stack id, which should be the same
1169 // across all calls recorded for this id, and is this node's id.
1170 uint64_t LastId = Node->OrigStackOrAllocId;
1171 ContextNode *LastNode = getNodeForStackId(LastId);
1172 // We should only have kept stack ids that had nodes.
1173 assert(LastNode);
1174
1175 for (unsigned I = 0; I < Calls.size(); I++) {
1176 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1177 // Skip any for which we didn't assign any ids, these don't get a node in
1178 // the graph.
1179 if (SavedContextIds.empty())
1180 continue;
1181
1182 assert(LastId == Ids.back());
1183
1184 ContextNode *FirstNode = getNodeForStackId(Ids[0]);
1185 assert(FirstNode);
1186
1187 // Recompute the context ids for this stack id sequence (the
1188 // intersection of the context ids of the corresponding nodes).
1189 // Start with the ids we saved in the map for this call, which could be
1190 // duplicated context ids. We have to recompute as we might have overlap
1191 // between the saved context ids for different last nodes, and may have
1192 // removed them already during the post order traversal.
1193 set_intersect(SavedContextIds, FirstNode->ContextIds);
1194 ContextNode *PrevNode = nullptr;
1195 for (auto Id : Ids) {
1196 ContextNode *CurNode = getNodeForStackId(Id);
1197 // We should only have kept stack ids that had nodes and weren't
1198 // recursive.
1199 assert(CurNode);
1200 assert(!CurNode->Recursive);
1201 if (!PrevNode) {
1202 PrevNode = CurNode;
1203 continue;
1204 }
1205 auto *Edge = CurNode->findEdgeFromCallee(PrevNode);
1206 if (!Edge) {
1207 SavedContextIds.clear();
1208 break;
1209 }
1210 PrevNode = CurNode;
1211 set_intersect(SavedContextIds, Edge->getContextIds());
1212
1213 // If we now have no context ids for clone, skip this call.
1214 if (SavedContextIds.empty())
1215 break;
1216 }
1217 if (SavedContextIds.empty())
1218 continue;
1219
1220 // Create new context node.
1221 NodeOwner.push_back(
1222 std::make_unique<ContextNode>(/*IsAllocation=*/false, Call));
1223 ContextNode *NewNode = NodeOwner.back().get();
1224 NodeToCallingFunc[NewNode] = Func;
1225 NonAllocationCallToContextNodeMap[Call] = NewNode;
1226 NewNode->ContextIds = SavedContextIds;
1227 NewNode->AllocTypes = computeAllocType(NewNode->ContextIds);
1228
1229 // Connect to callees of innermost stack frame in inlined call chain.
1230 // This updates context ids for FirstNode's callees to reflect those
1231 // moved to NewNode.
1232 connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true);
1233
1234 // Connect to callers of outermost stack frame in inlined call chain.
1235 // This updates context ids for LastNode's callers to reflect those
1236 // moved to NewNode.
1237 connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false);
1238
1239 // Now we need to remove context ids from edges/nodes between First and
1240 // Last Node.
1241 PrevNode = nullptr;
1242 for (auto Id : Ids) {
1243 ContextNode *CurNode = getNodeForStackId(Id);
1244 // We should only have kept stack ids that had nodes.
1245 assert(CurNode);
1246
1247 // Remove the context ids moved to NewNode from CurNode, and the
1248 // edge from the prior node.
1249 set_subtract(CurNode->ContextIds, NewNode->ContextIds);
1250 if (PrevNode) {
1251 auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
1252 assert(PrevEdge);
1253 set_subtract(PrevEdge->getContextIds(), NewNode->ContextIds);
1254 if (PrevEdge->getContextIds().empty()) {
1255 PrevNode->eraseCallerEdge(PrevEdge);
1256 CurNode->eraseCalleeEdge(PrevEdge);
1257 }
1258 }
1259 PrevNode = CurNode;
1260 }
1261 }
1262}
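// Illustrative example of what the above achieves (hypothetical frames): if
// the profiled context was A -> B -> C -> alloc and B was inlined into A, the
// call to C now physically in A matches the stack id sequence [B, A]. A single
// new node is created for that call, given the context ids that flow through
// both B and A, connected towards the callees of B's node and the callers of
// A's node, and those ids are removed from the original B and A nodes and the
// edge between them.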
1263
1264template <typename DerivedCCG, typename FuncTy, typename CallTy>
1265void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
1266 // Map of stack id to all calls with that as the last (outermost caller)
1267 // callsite id that has a context node (some might not due to pruning
1268 // performed during matching of the allocation profile contexts).
1269 // The CallContextInfo contains the Call and a list of its stack ids with
1270 // ContextNodes, the function containing Call, and the set of context ids
1271 // the analysis will eventually identify for use in any new node created
1272 // for that callsite.
1273 DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
1274 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
1275 for (auto &Call : CallsWithMetadata) {
1276 // Ignore allocations, already handled.
1277 if (AllocationCallToContextNodeMap.count(Call))
1278 continue;
1279 auto StackIdsWithContextNodes =
1280 getStackIdsWithContextNodesForCall(Call.call());
1281 // If there were no nodes created for MIBs on allocs (maybe this was in
1282 // the unambiguous part of the MIB stack that was pruned), ignore.
1283 if (StackIdsWithContextNodes.empty())
1284 continue;
1285 // Otherwise, record this Call along with the list of ids for the last
1286 // (outermost caller) stack id with a node.
1287 StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
1288 {Call.call(), StackIdsWithContextNodes, Func, {}});
1289 }
1290 }
1291
1292 // First make a pass through all stack ids that correspond to a call,
1293 // as identified in the above loop. Compute the context ids corresponding to
1294 // each of these calls when they correspond to multiple stack ids due to
1295 // inlining. Perform any duplication of context ids required when
1296 // there is more than one call with the same stack ids. Their (possibly newly
1297 // duplicated) context ids are saved in the StackIdToMatchingCalls map.
1298 DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
1299 for (auto &It : StackIdToMatchingCalls) {
1300 auto &Calls = It.getSecond();
1301 // Skip single calls with a single stack id. These don't need a new node.
1302 if (Calls.size() == 1) {
1303 auto &Ids = std::get<1>(Calls[0]);
1304 if (Ids.size() == 1)
1305 continue;
1306 }
1307 // In order to do the best and maximal matching of inlined calls to context
1308 // node sequences we will sort the vectors of stack ids in descending order
1309 // of length, and within each length, lexicographically by stack id. The
1310 // latter is so that we can specially handle calls that have identical stack
1311 // id sequences (either due to cloning or artificially because of the MIB
1312 // context pruning).
1313 std::stable_sort(Calls.begin(), Calls.end(),
1314 [](const CallContextInfo &A, const CallContextInfo &B) {
1315 auto &IdsA = std::get<1>(A);
1316 auto &IdsB = std::get<1>(B);
1317 return IdsA.size() > IdsB.size() ||
1318 (IdsA.size() == IdsB.size() && IdsA < IdsB);
1319 });
1320
1321 // Find the node for the last stack id, which should be the same
1322 // across all calls recorded for this id, and is the id for this
1323 // entry in the StackIdToMatchingCalls map.
1324 uint64_t LastId = It.getFirst();
1325 ContextNode *LastNode = getNodeForStackId(LastId);
1326 // We should only have kept stack ids that had nodes.
1327 assert(LastNode);
1328
1329 if (LastNode->Recursive)
1330 continue;
1331
1332 // Initialize the context ids with the last node's. We will subsequently
1333 // refine the context ids by computing the intersection along all edges.
1334 DenseSet<uint32_t> LastNodeContextIds = LastNode->ContextIds;
1335 assert(!LastNodeContextIds.empty());
1336
1337 for (unsigned I = 0; I < Calls.size(); I++) {
1338 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1339 assert(SavedContextIds.empty());
1340 assert(LastId == Ids.back());
1341
1342 // First compute the context ids for this stack id sequence (the
1343 // intersection of the context ids of the corresponding nodes).
1344 // Start with the remaining saved ids for the last node.
1345 assert(!LastNodeContextIds.empty());
1346 DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
1347
1348 ContextNode *PrevNode = LastNode;
1349 ContextNode *CurNode = LastNode;
1350 bool Skip = false;
1351
1352 // Iterate backwards through the stack Ids, starting after the last Id
1353 // in the list, which was handled once outside for all Calls.
1354 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1355 auto Id = *IdIter;
1356 CurNode = getNodeForStackId(Id);
1357 // We should only have kept stack ids that had nodes.
1358 assert(CurNode);
1359
1360 if (CurNode->Recursive) {
1361 Skip = true;
1362 break;
1363 }
1364
1365 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1366 // If there is no edge then the nodes belong to different MIB contexts,
1367 // and we should skip this inlined context sequence. For example, this
1368 // particular inlined context may include stack ids A->B, and we may
1369 // indeed have nodes for both A and B, but it is possible that they were
1370 // never profiled in sequence in a single MIB for any allocation (i.e.
1371 // we might have profiled an allocation that involves the callsite A,
1372 // but through a different one of its callee callsites, and we might
1373 // have profiled an allocation that involves callsite B, but reached
1374 // from a different caller callsite).
1375 if (!Edge) {
1376 Skip = true;
1377 break;
1378 }
1379 PrevNode = CurNode;
1380
1381 // Update the context ids, which is the intersection of the ids along
1382 // all edges in the sequence.
1383 set_intersect(StackSequenceContextIds, Edge->getContextIds());
1384
1385 // If we now have no context ids for clone, skip this call.
1386 if (StackSequenceContextIds.empty()) {
1387 Skip = true;
1388 break;
1389 }
1390 }
1391 if (Skip)
1392 continue;
1393
1394 // If some of this call's stack ids did not have corresponding nodes (due
1395 // to pruning), don't include any context ids for contexts that extend
1396 // beyond these nodes. Otherwise we would be matching part of an unrelated
1397 // or only partially matching stack context. To do this, subtract any context ids
1398 // found in caller nodes of the last node found above.
1399 if (Ids.back() != getLastStackId(Call)) {
1400 for (const auto &PE : LastNode->CallerEdges) {
1401 set_subtract(StackSequenceContextIds, PE->getContextIds());
1402 if (StackSequenceContextIds.empty())
1403 break;
1404 }
1405 // If we now have no context ids for the clone, skip this call.
1406 if (StackSequenceContextIds.empty())
1407 continue;
1408 }
1409
1410 // Check if the next set of stack ids is the same (since the Calls vector
1411 // of tuples is sorted by the stack ids we can just look at the next one).
1412 bool DuplicateContextIds = false;
1413 if (I + 1 < Calls.size()) {
1414 auto NextIds = std::get<1>(Calls[I + 1]);
1415 DuplicateContextIds = Ids == NextIds;
1416 }
1417
1418 // If we don't have duplicate context ids, then we can assign all the
1419 // context ids computed for the original node sequence to this call.
1420 // If there are duplicate calls with the same stack ids then we synthesize
1421 // new context ids that are duplicates of the originals. These are
1422 // assigned to SavedContextIds, which is a reference into the map entry
1423 // for this call, allowing us to access these ids later on.
1424 OldToNewContextIds.reserve(OldToNewContextIds.size() +
1425 StackSequenceContextIds.size());
1426 SavedContextIds =
1427 DuplicateContextIds
1428 ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
1429 : StackSequenceContextIds;
1430 assert(!SavedContextIds.empty());
1431
1432 if (!DuplicateContextIds) {
1433 // Update saved last node's context ids to remove those that are
1434 // assigned to other calls, so that it is ready for the next call at
1435 // this stack id.
1436 set_subtract(LastNodeContextIds, StackSequenceContextIds);
1437 if (LastNodeContextIds.empty())
1438 break;
1439 }
1440 }
1441 }
1442
1443 // Propagate the duplicate context ids over the graph.
1444 propagateDuplicateContextIds(OldToNewContextIds);
1445
1446 if (VerifyCCG)
1447 check();
1448
1449 // Now perform a post-order traversal over the graph, starting with the
1450 // allocation nodes, essentially processing nodes from callers to callees.
1451 // For any that contains an id in the map, update the graph to contain new
1452 // nodes representing any inlining at interior callsites. Note we move the
1453 // associated context ids over to the new nodes.
1454 DenseSet<const ContextNode *> Visited;
1455 for (auto &Entry : AllocationCallToContextNodeMap)
1456 assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls);
1457}
1458
1459uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
1460 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
1461 Call->getMetadata(LLVMContext::MD_callsite));
1462 return CallsiteContext.back();
1463}
1464
1465uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
1466 assert(isa<CallsiteInfo *>(Call.getBase()));
1467 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
1468 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
1469 // Need to convert index into stack id.
1470 return Index.getStackIdAtIndex(CallsiteContext.back());
1471}
1472
1473static const std::string MemProfCloneSuffix = ".memprof.";
1474
1475static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
1476 // We use CloneNo == 0 to refer to the original version, which doesn't get
1477 // renamed with a suffix.
1478 if (!CloneNo)
1479 return Base.str();
1480 return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
1481}
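// For example (derived from the code above): getMemProfFuncName("foo", 0)
// returns "foo", while getMemProfFuncName("foo", 2) returns "foo.memprof.2",
// the name later given to the second clone of foo.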
1482
1483std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
1484 const Instruction *Call,
1485 unsigned CloneNo) const {
1486 return (Twine(Call->getFunction()->getName()) + " -> " +
1487 cast<CallBase>(Call)->getCalledFunction()->getName())
1488 .str();
1489}
1490
1491std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
1492 const IndexCall &Call,
1493 unsigned CloneNo) const {
1494 auto VI = FSToVIMap.find(Func);
1495 assert(VI != FSToVIMap.end());
1496 if (isa<AllocInfo *>(Call.getBase()))
1497 return (VI->second.name() + " -> alloc").str();
1498 else {
1499 auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call.getBase());
1500 return (VI->second.name() + " -> " +
1501 getMemProfFuncName(Callsite->Callee.name(),
1502 Callsite->Clones[CloneNo]))
1503 .str();
1504 }
1505}
1506
1507std::vector<uint64_t>
1508ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
1509 Instruction *Call) {
1510 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
1511 Call->getMetadata(LLVMContext::MD_callsite));
1512 return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
1513 CallsiteContext);
1514}
1515
1516std::vector<uint64_t>
1517IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
1518 assert(isa<CallsiteInfo *>(Call.getBase()));
1519 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
1520 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
1521 return getStackIdsWithContextNodes<CallsiteInfo,
1522 SmallVector<unsigned>::const_iterator>(
1523 CallsiteContext);
1524}
1525
1526template <typename DerivedCCG, typename FuncTy, typename CallTy>
1527template <class NodeT, class IteratorT>
1528std::vector<uint64_t>
1529CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
1530 CallStack<NodeT, IteratorT> &CallsiteContext) {
1531 std::vector<uint64_t> StackIds;
1532 for (auto IdOrIndex : CallsiteContext) {
1533 auto StackId = getStackId(IdOrIndex);
1534 ContextNode *Node = getNodeForStackId(StackId);
1535 if (!Node)
1536 break;
1537 StackIds.push_back(StackId);
1538 }
1539 return StackIds;
1540}
1541
1542ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
1543 Module &M,
1544 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
1545 : Mod(M), OREGetter(OREGetter) {
1546 for (auto &F : M) {
1547 std::vector<CallInfo> CallsWithMetadata;
1548 for (auto &BB : F) {
1549 for (auto &I : BB) {
1550 if (!isa<CallBase>(I))
1551 continue;
1552 if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
1553 CallsWithMetadata.push_back(&I);
1554 auto *AllocNode = addAllocNode(&I, &F);
1555 auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
1556 assert(CallsiteMD);
1557 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
1558 // Add all of the MIBs and their stack nodes.
1559 for (auto &MDOp : MemProfMD->operands()) {
1560 auto *MIBMD = cast<const MDNode>(MDOp);
1561 MDNode *StackNode = getMIBStackNode(MIBMD);
1562 assert(StackNode);
1563 CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
1564 addStackNodesForMIB<MDNode, MDNode::op_iterator>(
1565 AllocNode, StackContext, CallsiteContext,
1566 getMIBAllocType(MIBMD));
1567 }
1568 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
1569 // Memprof and callsite metadata on memory allocations no longer
1570 // needed.
1571 I.setMetadata(LLVMContext::MD_memprof, nullptr);
1572 I.setMetadata(LLVMContext::MD_callsite, nullptr);
1573 }
1574 // For callsite metadata, add to list for this function for later use.
1575 else if (I.getMetadata(LLVMContext::MD_callsite))
1576 CallsWithMetadata.push_back(&I);
1577 }
1578 }
1579 if (!CallsWithMetadata.empty())
1580 FuncToCallsWithMetadata[&F] = CallsWithMetadata;
1581 }
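// Editor's sketch (illustrative, not part of this file) of the annotated IR
// the loop above consumes; the ids shown are made-up values standing in for
// 64-bit stack frame hashes:
//   %call = call ptr @malloc(i64 %n), !memprof !0, !callsite !5
//   !0 = !{!1, !3}            ; list of MIB nodes
//   !1 = !{!2, !"notcold"}    ; MIB: stack id node + allocation type
//   !2 = !{i64 123, i64 456}  ; leaf-first stack ids for this context
//   !3 = !{!4, !"cold"}
//   !4 = !{i64 123, i64 789}
//   !5 = !{i64 123}           ; callsite metadata on the allocation call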
1582
1583 if (DumpCCG) {
1584 dbgs() << "CCG before updating call stack chains:\n";
1585 dbgs() << *this;
1586 }
1587
1588 if (ExportToDot)
1589 exportToDot("prestackupdate");
1590
1591 updateStackNodes();
1592
1593 handleCallsitesWithMultipleTargets();
1594
1595 // Strip off remaining callsite metadata, no longer needed.
1596 for (auto &FuncEntry : FuncToCallsWithMetadata)
1597 for (auto &Call : FuncEntry.second)
1598 Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
1599}
1600
1601IndexCallsiteContextGraph::IndexCallsiteContextGraph(
1602 ModuleSummaryIndex &Index,
1603 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
1604 isPrevailing)
1605 : Index(Index), isPrevailing(isPrevailing) {
1606 for (auto &I : Index) {
1607 auto VI = Index.getValueInfo(I);
1608 for (auto &S : VI.getSummaryList()) {
1609 // We should only add the prevailing nodes. Otherwise we may try to clone
1610 // in a weak copy that won't be linked (and may be different than the
1611 // prevailing version).
1612 // We only keep the memprof summary on the prevailing copy now when
1613 // building the combined index, as a space optimization, however don't
1614 // rely on this optimization. The linker doesn't resolve local linkage
1615 // values so don't check whether those are prevailing.
1616 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
1617 !isPrevailing(VI.getGUID(), S.get()))
1618 continue;
1619 auto *FS = dyn_cast<FunctionSummary>(S.get());
1620 if (!FS)
1621 continue;
1622 std::vector<CallInfo> CallsWithMetadata;
1623 if (!FS->allocs().empty()) {
1624 for (auto &AN : FS->mutableAllocs()) {
1625 // This can happen because of recursion elimination handling that
1626 // currently exists in ModuleSummaryAnalysis. Skip these for now.
1627 // We still added them to the summary because we need to be able to
1628 // correlate properly in applyImport in the backends.
1629 if (AN.MIBs.empty())
1630 continue;
1631 CallsWithMetadata.push_back({&AN});
1632 auto *AllocNode = addAllocNode({&AN}, FS);
1633 // Pass an empty CallStack to the CallsiteContext (second)
1634 // parameter, since for ThinLTO we already collapsed out the inlined
1635 // stack ids on the allocation call during ModuleSummaryAnalysis.
1636 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
1637 EmptyContext;
1638 // Now add all of the MIBs and their stack nodes.
1639 for (auto &MIB : AN.MIBs) {
1640 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
1641 StackContext(&MIB);
1642 addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
1643 AllocNode, StackContext, EmptyContext, MIB.AllocType);
1644 }
1645 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
1646 // Initialize version 0 on the summary alloc node to the current alloc
1647 // type, unless it has both types, in which case make it the default, so
1648 // that if we aren't able to clone, the original version always ends up
1649 // with the default allocation behavior.
1650 AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
1651 }
1652 }
1653 // For callsite metadata, add to list for this function for later use.
1654 if (!FS->callsites().empty())
1655 for (auto &SN : FS->mutableCallsites())
1656 CallsWithMetadata.push_back({&SN});
1657
1658 if (!CallsWithMetadata.empty())
1659 FuncToCallsWithMetadata[FS] = CallsWithMetadata;
1660
1661 if (!FS->allocs().empty() || !FS->callsites().empty())
1662 FSToVIMap[FS] = VI;
1663 }
1664 }
1665
1666 if (DumpCCG) {
1667 dbgs() << "CCG before updating call stack chains:\n";
1668 dbgs() << *this;
1669 }
1670
1671 if (ExportToDot)
1672 exportToDot("prestackupdate");
1673
1674 updateStackNodes();
1675
1676 handleCallsitesWithMultipleTargets();
1677}
1678
1679template <typename DerivedCCG, typename FuncTy, typename CallTy>
1680void CallsiteContextGraph<DerivedCCG, FuncTy,
1681 CallTy>::handleCallsitesWithMultipleTargets() {
1682 // Look for and work around callsites that call multiple functions.
1683 // This can happen for indirect calls, which need better handling, and in
1684 // more rare cases (e.g. macro expansion).
1685 // TODO: To fix this for indirect calls we will want to perform speculative
1686 // devirtualization using either the normal PGO info with ICP, or using the
1687 // information in the profiled MemProf contexts. We can do this prior to
1688 // this transformation for regular LTO, and for ThinLTO we can simulate that
1689 // effect in the summary and perform the actual speculative devirtualization
1690 // while cloning in the ThinLTO backend.
1691
1692 // Keep track of the new nodes synthesized for discovered tail calls missing
1693 // from the profiled contexts.
1694 MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;
1695
1696 for (auto Entry = NonAllocationCallToContextNodeMap.begin();
1697 Entry != NonAllocationCallToContextNodeMap.end();) {
1698 auto *Node = Entry->second;
1699 assert(Node->Clones.empty());
1700 // Check all node callees and see if in the same function.
1701 bool Removed = false;
1702 auto Call = Node->Call.call();
1703 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
1704 auto Edge = *EI;
1705 if (!Edge->Callee->hasCall()) {
1706 ++EI;
1707 continue;
1708 }
1709 assert(NodeToCallingFunc.count(Edge->Callee));
1710 // Check if the called function matches that of the callee node.
1711 if (calleesMatch(Call, EI, TailCallToContextNodeMap))
1712 continue;
1713 RemovedEdgesWithMismatchedCallees++;
1714 // Work around by setting Node to have a null call, so it gets
1715 // skipped during cloning. Otherwise assignFunctions will assert
1716 // because its data structures are not designed to handle this case.
1717 Entry = NonAllocationCallToContextNodeMap.erase(Entry);
1718 Node->setCall(CallInfo());
1719 Removed = true;
1720 break;
1721 }
1722 if (!Removed)
1723 Entry++;
1724 }
1725
1726 // Add the new nodes after the above loop so that the iteration is not
1727 // invalidated.
1728 for (auto &[Call, Node] : TailCallToContextNodeMap)
1729 NonAllocationCallToContextNodeMap[Call] = Node;
1730}
1731
1732uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
1733 // In the Module (IR) case this is already the Id.
1734 return IdOrIndex;
1735}
1736
1737uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
1738 // In the Index case this is an index into the stack id list in the summary
1739 // index, convert it to an Id.
1740 return Index.getStackIdAtIndex(IdOrIndex);
1741}
1742
1743template <typename DerivedCCG, typename FuncTy, typename CallTy>
1744bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
1745 CallTy Call, EdgeIter &EI,
1746 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {
1747 auto Edge = *EI;
1748 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
1749 const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];
1750 // Will be populated in order of callee to caller if we find a chain of tail
1751 // calls between the profiled caller and callee.
1752 std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;
1753 if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc,
1754 FoundCalleeChain)) {
1755 ++EI;
1756 return false;
1757 }
1758
1759 // The usual case where the profiled callee matches that of the IR/summary.
1760 if (FoundCalleeChain.empty()) {
1761 ++EI;
1762 return true;
1763 }
1764
1765 auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {
1766 auto *CurEdge = Callee->findEdgeFromCaller(Caller);
1767 // If there is already an edge between these nodes, simply update it and
1768 // return.
1769 if (CurEdge) {
1770 CurEdge->ContextIds.insert(Edge->ContextIds.begin(),
1771 Edge->ContextIds.end());
1772 CurEdge->AllocTypes |= Edge->AllocTypes;
1773 return;
1774 }
1775 // Otherwise, create a new edge and insert it into the caller and callee
1776 // lists.
1777 auto NewEdge = std::make_shared<ContextEdge>(
1778 Callee, Caller, Edge->AllocTypes, Edge->ContextIds);
1779 Callee->CallerEdges.push_back(NewEdge);
1780 if (Caller == Edge->Caller) {
1781 // If we are inserting the new edge into the current edge's caller, insert
1782 // the new edge before the current iterator position, and then increment
1783 // back to the current edge.
1784 EI = Caller->CalleeEdges.insert(EI, NewEdge);
1785 ++EI;
1786 assert(*EI == Edge &&
1787 "Iterator position not restored after insert and increment");
1788 } else
1789 Caller->CalleeEdges.push_back(NewEdge);
1790 };
1791
1792 // Create new nodes for each found callee and connect in between the profiled
1793 // caller and callee.
1794 auto *CurCalleeNode = Edge->Callee;
1795 for (auto &[NewCall, Func] : FoundCalleeChain) {
1796 ContextNode *NewNode = nullptr;
1797 // First check if we have already synthesized a node for this tail call.
1798 if (TailCallToContextNodeMap.count(NewCall)) {
1799 NewNode = TailCallToContextNodeMap[NewCall];
1800 NewNode->ContextIds.insert(Edge->ContextIds.begin(),
1801 Edge->ContextIds.end());
1802 NewNode->AllocTypes |= Edge->AllocTypes;
1803 } else {
1804 FuncToCallsWithMetadata[Func].push_back({NewCall});
1805 // Create Node and record node info.
1806 NodeOwner.push_back(
1807 std::make_unique<ContextNode>(/*IsAllocation=*/false, NewCall));
1808 NewNode = NodeOwner.back().get();
1809 NodeToCallingFunc[NewNode] = Func;
1810 TailCallToContextNodeMap[NewCall] = NewNode;
1811 NewNode->ContextIds = Edge->ContextIds;
1812 NewNode->AllocTypes = Edge->AllocTypes;
1813 }
1814
1815 // Hook up node to its callee node
1816 AddEdge(NewNode, CurCalleeNode);
1817
1818 CurCalleeNode = NewNode;
1819 }
1820
1821 // Hook up edge's original caller to new callee node.
1822 AddEdge(Edge->Caller, CurCalleeNode);
1823
1824 // Remove old edge
1825 Edge->Callee->eraseCallerEdge(Edge.get());
1826 EI = Edge->Caller->CalleeEdges.erase(EI);
1827
1828 return true;
1829}
1830
1831bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
1832 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
1833 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
1834 bool &FoundMultipleCalleeChains) {
1835 // Stop recursive search if we have already explored the maximum specified
1836 // depth.
1837 if (Depth > TailCallSearchDepth)
1838 return false;
1839
1840 auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {
1841 FoundCalleeChain.push_back({Callsite, F});
1842 };
1843
1844 auto *CalleeFunc = dyn_cast<Function>(CurCallee);
1845 if (!CalleeFunc) {
1846 auto *Alias = dyn_cast<GlobalAlias>(CurCallee);
1847 assert(Alias);
1848 CalleeFunc = dyn_cast<Function>(Alias->getAliasee());
1849 assert(CalleeFunc);
1850 }
1851
1852 // Look for tail calls in this function, and check if they either call the
1853 // profiled callee directly, or indirectly (via a recursive search).
1854 // Only succeed if there is a single unique tail call chain found between the
1855 // profiled caller and callee, otherwise we could perform incorrect cloning.
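// Illustrative example (editor's note): suppose the profile recorded a frame
// where A calls the profiled callee C, but in the IR A actually calls B and B
// tail-calls C. Starting from B (CurCallee), the loop below finds the tail
// call to C (ProfiledCallee) and records B's callsite so that a node for it
// can be synthesized between A and C in the graph. If C were reachable from B
// through two distinct tail-call chains, FoundMultipleCalleeChains would be
// set and the match rejected.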
1856 bool FoundSingleCalleeChain = false;
1857 for (auto &BB : *CalleeFunc) {
1858 for (auto &I : BB) {
1859 auto *CB = dyn_cast<CallBase>(&I);
1860 if (!CB || !CB->isTailCall())
1861 continue;
1862 auto *CalledValue = CB->getCalledOperand();
1863 auto *CalledFunction = CB->getCalledFunction();
1864 if (CalledValue && !CalledFunction) {
1865 CalledValue = CalledValue->stripPointerCasts();
1866 // Stripping pointer casts can reveal a called function.
1867 CalledFunction = dyn_cast<Function>(CalledValue);
1868 }
1869 // Check if this is an alias to a function. If so, get the
1870 // called aliasee for the checks below.
1871 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
1872 assert(!CalledFunction &&
1873 "Expected null called function in callsite for alias");
1874 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
1875 }
1876 if (!CalledFunction)
1877 continue;
1878 if (CalledFunction == ProfiledCallee) {
1879 if (FoundSingleCalleeChain) {
1880 FoundMultipleCalleeChains = true;
1881 return false;
1882 }
1883 FoundSingleCalleeChain = true;
1884 FoundProfiledCalleeCount++;
1885 FoundProfiledCalleeDepth += Depth;
1886 if (Depth > FoundProfiledCalleeMaxDepth)
1887 FoundProfiledCalleeMaxDepth = Depth;
1888 SaveCallsiteInfo(&I, CalleeFunc);
1889 } else if (findProfiledCalleeThroughTailCalls(
1890 ProfiledCallee, CalledFunction, Depth + 1,
1891 FoundCalleeChain, FoundMultipleCalleeChains)) {
1892 if (FoundMultipleCalleeChains)
1893 return false;
1894 if (FoundSingleCalleeChain) {
1895 FoundMultipleCalleeChains = true;
1896 return false;
1897 }
1898 FoundSingleCalleeChain = true;
1899 SaveCallsiteInfo(&I, CalleeFunc);
1900 }
1901 }
1902 }
1903
1904 return FoundSingleCalleeChain;
1905}
1906
1907bool ModuleCallsiteContextGraph::calleeMatchesFunc(
1908 Instruction *Call, const Function *Func, const Function *CallerFunc,
1909 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
1910 auto *CB = dyn_cast<CallBase>(Call);
1911 if (!CB->getCalledOperand())
1912 return false;
1913 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
1914 auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
1915 if (CalleeFunc == Func)
1916 return true;
1917 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
1918 if (Alias && Alias->getAliasee() == Func)
1919 return true;
1920
1921 // Recursively search for the profiled callee through tail calls starting with
1922 // the actual Callee. The discovered tail call chain is saved in
1923 // FoundCalleeChain, and we will fix up the graph to include these callsites
1924 // after returning.
1925 // FIXME: We will currently redo the same recursive walk if we find the same
1926 // mismatched callee from another callsite. We can improve this with more
1927 // bookkeeping of the created chain of new nodes for each mismatch.
1928 unsigned Depth = 1;
1929 bool FoundMultipleCalleeChains = false;
1930 if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,
1931 FoundCalleeChain,
1932 FoundMultipleCalleeChains)) {
1933 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "
1934 << Func->getName() << " from " << CallerFunc->getName()
1935 << " that actually called " << CalleeVal->getName()
1936 << (FoundMultipleCalleeChains
1937 ? " (found multiple possible chains)"
1938 : "")
1939 << "\n");
1940 if (FoundMultipleCalleeChains)
1941 FoundProfiledCalleeNonUniquelyCount++;
1942 return false;
1943 }
1944
1945 return true;
1946}
1947
1948bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
1949 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
1950 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
1951 bool &FoundMultipleCalleeChains) {
1952 // Stop recursive search if we have already explored the maximum specified
1953 // depth.
1954 if (Depth > TailCallSearchDepth)
1955 return false;
1956
1957 auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {
1958 // Make a CallsiteInfo for each discovered callee, if one hasn't already
1959 // been synthesized.
1960 if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||
1961 !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))
1962 // StackIds is empty (we don't have debug info available in the index for
1963 // these callsites)
1964 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
1965 std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());
1966 CallsiteInfo *NewCallsiteInfo =
1967 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
1968 FoundCalleeChain.push_back({NewCallsiteInfo, FS});
1969 };
1970
1971 // Look for tail calls in this function, and check if they either call the
1972 // profiled callee directly, or indirectly (via a recursive search).
1973 // Only succeed if there is a single unique tail call chain found between the
1974 // profiled caller and callee, otherwise we could perform incorrect cloning.
1975 bool FoundSingleCalleeChain = false;
1976 for (auto &S : CurCallee.getSummaryList()) {
1977 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
1978 !isPrevailing(CurCallee.getGUID(), S.get()))
1979 continue;
1980 auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
1981 if (!FS)
1982 continue;
1983 auto FSVI = CurCallee;
1984 auto *AS = dyn_cast<AliasSummary>(S.get());
1985 if (AS)
1986 FSVI = AS->getAliaseeVI();
1987 for (auto &CallEdge : FS->calls()) {
1988 if (!CallEdge.second.hasTailCall())
1989 continue;
1990 if (CallEdge.first == ProfiledCallee) {
1991 if (FoundSingleCalleeChain) {
1992 FoundMultipleCalleeChains = true;
1993 return false;
1994 }
1995 FoundSingleCalleeChain = true;
1996 FoundProfiledCalleeCount++;
1997 FoundProfiledCalleeDepth += Depth;
1998 if (Depth > FoundProfiledCalleeMaxDepth)
1999 FoundProfiledCalleeMaxDepth = Depth;
2000 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2001 // Add FS to FSToVIMap in case it isn't already there.
2002 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2003 FSToVIMap[FS] = FSVI;
2004 } else if (findProfiledCalleeThroughTailCalls(
2005 ProfiledCallee, CallEdge.first, Depth + 1,
2006 FoundCalleeChain, FoundMultipleCalleeChains)) {
2007 if (FoundMultipleCalleeChains)
2008 return false;
2009 if (FoundSingleCalleeChain) {
2010 FoundMultipleCalleeChains = true;
2011 return false;
2012 }
2013 FoundSingleCalleeChain = true;
2014 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2015 // Add FS to FSToVIMap in case it isn't already there.
2016 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2017 FSToVIMap[FS] = FSVI;
2018 }
2019 }
2020 }
2021
2022 return FoundSingleCalleeChain;
2023}
2024
2025bool IndexCallsiteContextGraph::calleeMatchesFunc(
2026 IndexCall &Call, const FunctionSummary *Func,
2027 const FunctionSummary *CallerFunc,
2028 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {
2029 ValueInfo Callee =
2030 dyn_cast_if_present<CallsiteInfo *>(Call.getBase())->Callee;
2031 // If there is no summary list then this is a call to an externally defined
2032 // symbol.
2033 AliasSummary *Alias =
2034 Callee.getSummaryList().empty()
2035 ? nullptr
2036 : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
2037 assert(FSToVIMap.count(Func));
2038 auto FuncVI = FSToVIMap[Func];
2039 if (Callee == FuncVI ||
2040 // If callee is an alias, check the aliasee, since only function
2041 // summary base objects will contain the stack node summaries and thus
2042 // get a context node.
2043 (Alias && Alias->getAliaseeVI() == FuncVI))
2044 return true;
2045
2046 // Recursively search for the profiled callee through tail calls starting with
2047 // the actual Callee. The discovered tail call chain is saved in
2048 // FoundCalleeChain, and we will fix up the graph to include these callsites
2049 // after returning.
2050 // FIXME: We will currently redo the same recursive walk if we find the same
2051 // mismatched callee from another callsite. We can improve this with more
2052 // bookkeeping of the created chain of new nodes for each mismatch.
2053 unsigned Depth = 1;
2054 bool FoundMultipleCalleeChains = false;
2055 if (!findProfiledCalleeThroughTailCalls(
2056 FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
2057 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI
2058 << " from " << FSToVIMap[CallerFunc]
2059 << " that actually called " << Callee
2060 << (FoundMultipleCalleeChains
2061 ? " (found multiple possible chains)"
2062 : "")
2063 << "\n");
2064 if (FoundMultipleCalleeChains)
2065 FoundProfiledCalleeNonUniquelyCount++;
2066 return false;
2067 }
2068
2069 return true;
2070}
2071
2072static std::string getAllocTypeString(uint8_t AllocTypes) {
2073 if (!AllocTypes)
2074 return "None";
2075 std::string Str;
2076 if (AllocTypes & (uint8_t)AllocationType::NotCold)
2077 Str += "NotCold";
2078 if (AllocTypes & (uint8_t)AllocationType::Cold)
2079 Str += "Cold";
2080 return Str;
2081}
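// For example, AllocTypes of (NotCold | Cold) prints as "NotColdCold", a pure
// cold node prints as "Cold", and a value of 0 prints as "None".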
2082
2083template <typename DerivedCCG, typename FuncTy, typename CallTy>
2084void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
2085 const {
2086 print(dbgs());
2087 dbgs() << "\n";
2088}
2089
2090template <typename DerivedCCG, typename FuncTy, typename CallTy>
2091void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
2092 raw_ostream &OS) const {
2093 OS << "Node " << this << "\n";
2094 OS << "\t";
2095 printCall(OS);
2096 if (Recursive)
2097 OS << " (recursive)";
2098 OS << "\n";
2099 OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
2100 OS << "\tContextIds:";
2101 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2102 std::sort(SortedIds.begin(), SortedIds.end());
2103 for (auto Id : SortedIds)
2104 OS << " " << Id;
2105 OS << "\n";
2106 OS << "\tCalleeEdges:\n";
2107 for (auto &Edge : CalleeEdges)
2108 OS << "\t\t" << *Edge << "\n";
2109 OS << "\tCallerEdges:\n";
2110 for (auto &Edge : CallerEdges)
2111 OS << "\t\t" << *Edge << "\n";
2112 if (!Clones.empty()) {
2113 OS << "\tClones: ";
2114 FieldSeparator FS;
2115 for (auto *Clone : Clones)
2116 OS << FS << Clone;
2117 OS << "\n";
2118 } else if (CloneOf) {
2119 OS << "\tClone of " << CloneOf << "\n";
2120 }
2121}
2122
2123template <typename DerivedCCG, typename FuncTy, typename CallTy>
2124void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
2125 const {
2126 print(dbgs());
2127 dbgs() << "\n";
2128}
2129
2130template <typename DerivedCCG, typename FuncTy, typename CallTy>
2131void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
2132 raw_ostream &OS) const {
2133 OS << "Edge from Callee " << Callee << " to Caller: " << Caller
2134 << " AllocTypes: " << getAllocTypeString(AllocTypes);
2135 OS << " ContextIds:";
2136 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2137 std::sort(SortedIds.begin(), SortedIds.end());
2138 for (auto Id : SortedIds)
2139 OS << " " << Id;
2140}
2141
2142template <typename DerivedCCG, typename FuncTy, typename CallTy>
2143void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
2144 print(dbgs());
2145}
2146
2147template <typename DerivedCCG, typename FuncTy, typename CallTy>
2148void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
2149 raw_ostream &OS) const {
2150 OS << "Callsite Context Graph:\n";
2151 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2152 for (const auto Node : nodes<GraphType>(this)) {
2153 if (Node->isRemoved())
2154 continue;
2155 Node->print(OS);
2156 OS << "\n";
2157 }
2158}
2159
2160template <typename DerivedCCG, typename FuncTy, typename CallTy>
2161static void checkEdge(
2162 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
2163 // Confirm that alloc type is not None and that we have at least one context
2164 // id.
2165 assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
2166 assert(!Edge->ContextIds.empty());
2167}
2168
2169template <typename DerivedCCG, typename FuncTy, typename CallTy>
2170static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
2171 bool CheckEdges = true) {
2172 if (Node->isRemoved())
2173 return;
2174 // Node's context ids should be the union of both its callee and caller edge
2175 // context ids.
2176 if (Node->CallerEdges.size()) {
2177 auto EI = Node->CallerEdges.begin();
2178 auto &FirstEdge = *EI;
2179 EI++;
2180 DenseSet<uint32_t> CallerEdgeContextIds(FirstEdge->ContextIds);
2181 for (; EI != Node->CallerEdges.end(); EI++) {
2182 const auto &Edge = *EI;
2183 if (CheckEdges)
2184 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
2185 set_union(CallerEdgeContextIds, Edge->ContextIds);
2186 }
2187 // Node can have more context ids than callers if some contexts terminate at
2188 // node and some are longer.
2189 assert(Node->ContextIds == CallerEdgeContextIds ||
2190 set_is_subset(CallerEdgeContextIds, Node->ContextIds));
2191 }
2192 if (Node->CalleeEdges.size()) {
2193 auto EI = Node->CalleeEdges.begin();
2194 auto &FirstEdge = *EI;
2195 EI++;
2196 DenseSet<uint32_t> CalleeEdgeContextIds(FirstEdge->ContextIds);
2197 for (; EI != Node->CalleeEdges.end(); EI++) {
2198 const auto &Edge = *EI;
2199 if (CheckEdges)
2200 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
2201 set_union(CalleeEdgeContextIds, Edge->ContextIds);
2202 }
2203 assert(Node->ContextIds == CalleeEdgeContextIds);
2204 }
2205}
2206
2207template <typename DerivedCCG, typename FuncTy, typename CallTy>
2208void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
2209 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2210 for (const auto Node : nodes<GraphType>(this)) {
2211 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
2212 for (auto &Edge : Node->CallerEdges)
2213 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
2214 }
2215}
2216
2217template <typename DerivedCCG, typename FuncTy, typename CallTy>
2218struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
2219 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2220 using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;
2221
2222 using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
2223 static NodeRef getNode(const NodePtrTy &P) { return P.get(); }
2224
2227 decltype(&getNode)>;
2228
2229 static nodes_iterator nodes_begin(GraphType G) {
2230 return nodes_iterator(G->NodeOwner.begin(), &getNode);
2231 }
2232
2233 static nodes_iterator nodes_end(GraphType G) {
2234 return nodes_iterator(G->NodeOwner.end(), &getNode);
2235 }
2236
2237 static NodeRef getEntryNode(GraphType G) {
2238 return G->NodeOwner.begin()->get();
2239 }
2240
2241 using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
2242 static const ContextNode<DerivedCCG, FuncTy, CallTy> *
2243 GetCallee(const EdgePtrTy &P) {
2244 return P->Callee;
2245 }
2246
2247 using ChildIteratorType =
2248 mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
2249 DerivedCCG, FuncTy, CallTy>>>::const_iterator,
2250 decltype(&GetCallee)>;
2251
2252 static ChildIteratorType child_begin(NodeRef N) {
2253 return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
2254 }
2255
2256 static ChildIteratorType child_end(NodeRef N) {
2257 return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
2258 }
2259};
2260
2261template <typename DerivedCCG, typename FuncTy, typename CallTy>
2262struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
2263 : public DefaultDOTGraphTraits {
2264 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
2265
2266 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2267 using GTraits = GraphTraits<GraphType>;
2268 using NodeRef = typename GTraits::NodeRef;
2269 using ChildIteratorType = typename GTraits::ChildIteratorType;
2270
2271 static std::string getNodeLabel(NodeRef Node, GraphType G) {
2272 std::string LabelString =
2273 (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
2274 Twine(Node->OrigStackOrAllocId))
2275 .str();
2276 LabelString += "\n";
2277 if (Node->hasCall()) {
2278 auto Func = G->NodeToCallingFunc.find(Node);
2279 assert(Func != G->NodeToCallingFunc.end());
2280 LabelString +=
2281 G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
2282 } else {
2283 LabelString += "null call";
2284 if (Node->Recursive)
2285 LabelString += " (recursive)";
2286 else
2287 LabelString += " (external)";
2288 }
2289 return LabelString;
2290 }
2291
2292 static std::string getNodeAttributes(NodeRef Node, GraphType) {
2293 std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
2294 getContextIds(Node->ContextIds) + "\"")
2295 .str();
2296 AttributeString +=
2297 (Twine(",fillcolor=\"") + getColor(Node->AllocTypes) + "\"").str();
2298 AttributeString += ",style=\"filled\"";
2299 if (Node->CloneOf) {
2300 AttributeString += ",color=\"blue\"";
2301 AttributeString += ",style=\"filled,bold,dashed\"";
2302 } else
2303 AttributeString += ",style=\"filled\"";
2304 return AttributeString;
2305 }
2306
2307 static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
2308 GraphType) {
2309 auto &Edge = *(ChildIter.getCurrent());
2310 return (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" +
2311 Twine(",fillcolor=\"") + getColor(Edge->AllocTypes) + "\"")
2312 .str();
2313 }
2314
2315 // Since the NodeOwners list includes nodes that are no longer connected to
2316 // the graph, skip them here.
2317 static bool isNodeHidden(NodeRef Node, GraphType) {
2318 return Node->isRemoved();
2319 }
2320
2321private:
2322 static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
2323 std::string IdString = "ContextIds:";
2324 if (ContextIds.size() < 100) {
2325 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2326 std::sort(SortedIds.begin(), SortedIds.end());
2327 for (auto Id : SortedIds)
2328 IdString += (" " + Twine(Id)).str();
2329 } else {
2330 IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
2331 }
2332 return IdString;
2333 }
2334
2335 static std::string getColor(uint8_t AllocTypes) {
2336 if (AllocTypes == (uint8_t)AllocationType::NotCold)
2337 // Color "brown1" actually looks like a lighter red.
2338 return "brown1";
2339 if (AllocTypes == (uint8_t)AllocationType::Cold)
2340 return "cyan";
2341 if (AllocTypes ==
2342 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
2343 // Lighter purple.
2344 return "mediumorchid1";
2345 return "gray";
2346 }
2347
2348 static std::string getNodeId(NodeRef Node) {
2349 std::stringstream SStream;
2350 SStream << std::hex << "N0x" << (unsigned long long)Node;
2351 std::string Result = SStream.str();
2352 return Result;
2353 }
2354};
2355
2356template <typename DerivedCCG, typename FuncTy, typename CallTy>
2357void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
2358 std::string Label) const {
2359 WriteGraph(this, "", false, Label,
2360 DotFilePathPrefix + "ccg." + Label + ".dot");
2361}
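// For example, if DotFilePathPrefix is "/tmp/", the call
// exportToDot("prestackupdate") writes the graph to /tmp/ccg.prestackupdate.dot.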
2362
2363template <typename DerivedCCG, typename FuncTy, typename CallTy>
2364typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
2365CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
2366 const std::shared_ptr<ContextEdge> &Edge, EdgeIter *CallerEdgeI,
2367 DenseSet<uint32_t> ContextIdsToMove) {
2368 ContextNode *Node = Edge->Callee;
2369 NodeOwner.push_back(
2370 std::make_unique<ContextNode>(Node->IsAllocation, Node->Call));
2371 ContextNode *Clone = NodeOwner.back().get();
2372 Node->addClone(Clone);
2373 assert(NodeToCallingFunc.count(Node));
2374 NodeToCallingFunc[Clone] = NodeToCallingFunc[Node];
2375 moveEdgeToExistingCalleeClone(Edge, Clone, CallerEdgeI, /*NewClone=*/true,
2376 ContextIdsToMove);
2377 return Clone;
2378}
2379
2380template <typename DerivedCCG, typename FuncTy, typename CallTy>
2381void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
2382 moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
2383 ContextNode *NewCallee, EdgeIter *CallerEdgeI,
2384 bool NewClone,
2385 DenseSet<uint32_t> ContextIdsToMove) {
2386 // NewCallee and Edge's current callee must be clones of the same original
2387 // node (Edge's current callee may be the original node too).
2388 assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
2389
2390 ContextNode *OldCallee = Edge->Callee;
2391
2392 // We might already have an edge to the new callee from earlier cloning for a
2393 // different allocation. If one exists we will reuse it.
2394 auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller);
2395
2396 // Callers will pass an empty ContextIdsToMove set when they want to move the
2397 // edge. Copy in Edge's ids for simplicity.
2398 if (ContextIdsToMove.empty())
2399 ContextIdsToMove = Edge->getContextIds();
2400
2401 // If we are moving all of Edge's ids, then just move the whole Edge.
2402 // Otherwise only move the specified subset, to a new edge if needed.
2403 if (Edge->getContextIds().size() == ContextIdsToMove.size()) {
2404 // Moving the whole Edge.
2405 if (CallerEdgeI)
2406 *CallerEdgeI = OldCallee->CallerEdges.erase(*CallerEdgeI);
2407 else
2408 OldCallee->eraseCallerEdge(Edge.get());
2409 if (ExistingEdgeToNewCallee) {
2410 // Since we already have an edge to NewCallee, simply move the ids
2411 // onto it, and remove the existing Edge.
2412 ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(),
2413 ContextIdsToMove.end());
2414 ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes;
2415 assert(Edge->ContextIds == ContextIdsToMove);
2416 Edge->ContextIds.clear();
2417 Edge->AllocTypes = (uint8_t)AllocationType::None;
2418 Edge->Caller->eraseCalleeEdge(Edge.get());
2419 } else {
2420 // Otherwise just reconnect Edge to NewCallee.
2421 Edge->Callee = NewCallee;
2422 NewCallee->CallerEdges.push_back(Edge);
2423 // Don't need to update Edge's context ids since we are simply
2424 // reconnecting it.
2425 }
2426 // In either case, need to update the alloc types on NewCallee.
2427 NewCallee->AllocTypes |= Edge->AllocTypes;
2428 } else {
2429 // Only moving a subset of Edge's ids.
2430 if (CallerEdgeI)
2431 ++CallerEdgeI;
2432 // Compute the alloc type of the subset of ids being moved.
2433 auto CallerEdgeAllocType = computeAllocType(ContextIdsToMove);
2434 if (ExistingEdgeToNewCallee) {
2435 // Since we already have an edge to NewCallee, simply move the ids
2436 // onto it.
2437 ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(),
2438 ContextIdsToMove.end());
2439 ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType;
2440 } else {
2441 // Otherwise, create a new edge to NewCallee for the ids being moved.
2442 auto NewEdge = std::make_shared<ContextEdge>(
2443 NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove);
2444 Edge->Caller->CalleeEdges.push_back(NewEdge);
2445 NewCallee->CallerEdges.push_back(NewEdge);
2446 }
2447 // In either case, need to update the alloc types on NewCallee, and remove
2448 // those ids and update the alloc type on the original Edge.
2449 NewCallee->AllocTypes |= CallerEdgeAllocType;
2450 set_subtract(Edge->ContextIds, ContextIdsToMove);
2451 Edge->AllocTypes = computeAllocType(Edge->ContextIds);
2452 }
2453 // Now perform some updates that are common to all cases: the NewCallee gets
2454 // the moved ids added, and we need to remove those ids from OldCallee and
2455 // update its alloc type (NewCallee alloc type updates handled above).
2456 NewCallee->ContextIds.insert(ContextIdsToMove.begin(),
2457 ContextIdsToMove.end());
2458 set_subtract(OldCallee->ContextIds, ContextIdsToMove);
2459 OldCallee->AllocTypes = computeAllocType(OldCallee->ContextIds);
2460 // OldCallee alloc type should be None iff its context id set is now empty.
2461 assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
2462 OldCallee->ContextIds.empty());
2463 // Now walk the old callee node's callee edges and move Edge's context ids
2464 // over to the corresponding edge into the clone (which is created here if
2465 // this is a newly created clone).
2466 for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
2467 // The context ids moving to the new callee are the subset of this edge's
2468 // context ids and the context ids on the caller edge being moved.
2469 DenseSet<uint32_t> EdgeContextIdsToMove =
2470 set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove);
2471 set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
2472 OldCalleeEdge->AllocTypes =
2473 computeAllocType(OldCalleeEdge->getContextIds());
2474 if (!NewClone) {
2475 // Update context ids / alloc type on corresponding edge to NewCallee.
2476 // There is a chance this may not exist if we are reusing an existing
2477 // clone, specifically during function assignment, where we would have
2478 // removed none type edges after creating the clone. If we can't find
2479 // a corresponding edge there, fall through to the cloning below.
2480 if (auto *NewCalleeEdge =
2481 NewCallee->findEdgeFromCallee(OldCalleeEdge->Callee)) {
2482 NewCalleeEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),
2483 EdgeContextIdsToMove.end());
2484 NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
2485 continue;
2486 }
2487 }
2488 auto NewEdge = std::make_shared<ContextEdge>(
2489 OldCalleeEdge->Callee, NewCallee,
2490 computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove);
2491 NewCallee->CalleeEdges.push_back(NewEdge);
2492 NewEdge->Callee->CallerEdges.push_back(NewEdge);
2493 }
2494 if (VerifyCCG) {
2495 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
2496 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
2497 for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
2498 checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
2499 /*CheckEdges=*/false);
2500 for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
2501 checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
2502 /*CheckEdges=*/false);
2503 }
2504}
2505
2506template <typename DerivedCCG, typename FuncTy, typename CallTy>
2507void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
2508 recursivelyRemoveNoneTypeCalleeEdges(
2509 ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
2510 auto Inserted = Visited.insert(Node);
2511 if (!Inserted.second)
2512 return;
2513
2514 removeNoneTypeCalleeEdges(Node);
2515
2516 for (auto *Clone : Node->Clones)
2517 recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited);
2518
2519 // The recursive call may remove some of this Node's caller edges.
2520 // Iterate over a copy and skip any that were removed.
2521 auto CallerEdges = Node->CallerEdges;
2522 for (auto &Edge : CallerEdges) {
2523 // Skip any that have been removed by an earlier recursive call.
2524 if (Edge->Callee == nullptr && Edge->Caller == nullptr) {
2525 assert(!std::count(Node->CallerEdges.begin(), Node->CallerEdges.end(),
2526 Edge));
2527 continue;
2528 }
2529 recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited);
2530 }
2531}
2532
2533template <typename DerivedCCG, typename FuncTy, typename CallTy>
2534void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
2535 DenseSet<const ContextNode *> Visited;
2536 for (auto &Entry : AllocationCallToContextNodeMap) {
2537 Visited.clear();
2538 identifyClones(Entry.second, Visited, Entry.second->ContextIds);
2539 }
2540 Visited.clear();
2541 for (auto &Entry : AllocationCallToContextNodeMap)
2542 recursivelyRemoveNoneTypeCalleeEdges(Entry.second, Visited);
2543 if (VerifyCCG)
2544 check();
2545}
2546
2547 // Helper function to check whether an AllocType is cold or notcold or both.
2548bool checkColdOrNotCold(uint8_t AllocType) {
2549 return (AllocType == (uint8_t)AllocationType::Cold) ||
2550 (AllocType == (uint8_t)AllocationType::NotCold) ||
2551 (AllocType ==
2552 ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
2553}
2554
2555template <typename DerivedCCG, typename FuncTy, typename CallTy>
2556void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
2557 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
2558 const DenseSet<uint32_t> &AllocContextIds) {
2559 if (VerifyNodes)
2560 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
2561 assert(!Node->CloneOf);
2562
2563 // If Node has a null call, then either it wasn't found in the module (regular
2564 // LTO) or summary index (ThinLTO), or there were other conditions blocking
2565 // cloning (e.g. recursion, calls multiple targets, etc).
2566 // Do this here so that we don't try to recursively clone callers below, which
2567 // isn't useful at least for this node.
2568 if (!Node->hasCall())
2569 return;
2570
2571#ifndef NDEBUG
2572 auto Insert =
2573#endif
2574 Visited.insert(Node);
2575 // We should not have visited this node yet.
2576 assert(Insert.second);
2577 // The recursive call to identifyClones may delete the current edge from the
2578 // CallerEdges vector. Make a copy and iterate on that, simpler than passing
2579 // in an iterator and having recursive call erase from it. Other edges may
2580 // also get removed during the recursion, which will have null Callee and
2581 // Caller pointers (and are deleted later), so we skip those below.
2582 {
2583 auto CallerEdges = Node->CallerEdges;
2584 for (auto &Edge : CallerEdges) {
2585 // Skip any that have been removed by an earlier recursive call.
2586 if (Edge->Callee == nullptr && Edge->Caller == nullptr) {
2587 assert(!llvm::count(Node->CallerEdges, Edge));
2588 continue;
2589 }
2590 // Ignore any caller we previously visited via another edge.
2591 if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
2592 identifyClones(Edge->Caller, Visited, AllocContextIds);
2593 }
2594 }
2595 }
2596
2597 // Check if we reached an unambiguous call or have only a single caller.
2598 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
2599 return;
2600
2601 // We need to clone.
2602
2603 // Try to keep the original version as alloc type NotCold. This will make
2604 // cases with indirect calls or any other situation with an unknown call to
2605 // the original function get the default behavior. We do this by sorting the
2606 // CallerEdges of the Node we will clone by alloc type.
2607 //
2608 // Give NotCold edge the lowest sort priority so those edges are at the end of
2609 // the caller edges vector, and stay on the original version (since the below
2610 // code clones greedily until it finds all remaining edges have the same type
2611 // and leaves the remaining ones on the original Node).
2612 //
2613 // We shouldn't actually have any None type edges, so the sorting priority for
2614 // that is arbitrary, and we assert in that case below.
2615 const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
2616 /*Cold*/ 1,
2617 /*NotColdCold*/ 2};
2618 std::stable_sort(Node->CallerEdges.begin(), Node->CallerEdges.end(),
2619 [&](const std::shared_ptr<ContextEdge> &A,
2620 const std::shared_ptr<ContextEdge> &B) {
2621 // Nodes with non-empty context ids should be sorted before
2622 // those with empty context ids.
2623 if (A->ContextIds.empty())
2624 // Either B ContextIds are non-empty (in which case we
2625 // should return false because B < A), or B ContextIds
2626 // are empty, in which case they are equal, and we should
2627 // maintain the original relative ordering.
2628 return false;
2629 if (B->ContextIds.empty())
2630 return true;
2631
2632 if (A->AllocTypes == B->AllocTypes)
2633 // Use the first context id for each edge as a
2634 // tie-breaker.
2635 return *A->ContextIds.begin() < *B->ContextIds.begin();
2636 return AllocTypeCloningPriority[A->AllocTypes] <
2637 AllocTypeCloningPriority[B->AllocTypes];
2638 });
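// Illustrative example (editor's note): with caller edges of alloc types
// NotCold, Cold and NotCold+Cold, the priorities above (4, 1 and 2
// respectively) order them Cold, NotCold+Cold, NotCold, so the loop below
// peels the cold-leaning callers off into clones first and the NotCold
// callers remain attached to the original node.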
2639
2640 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2641
2642 // Iterate until we find no more opportunities for disambiguating the alloc
2643 // types via cloning. In most cases this loop will terminate once the Node
2644 // has a single allocation type, in which case no more cloning is needed.
2645 // We need to be able to remove Edge from CallerEdges, so need to adjust
2646 // iterator inside the loop.
2647 for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
2648 auto CallerEdge = *EI;
2649
2650 // See if cloning the prior caller edge left this node with a single alloc
2651 // type or a single caller. In that case no more cloning of Node is needed.
2652 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
2653 break;
2654
2655 // Only need to process the ids along this edge pertaining to the given
2656 // allocation.
2657 auto CallerEdgeContextsForAlloc =
2658 set_intersection(CallerEdge->getContextIds(), AllocContextIds);
2659 if (CallerEdgeContextsForAlloc.empty()) {
2660 ++EI;
2661 continue;
2662 }
2663 auto CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
2664
2665 // Compute the node callee edge alloc types corresponding to the context ids
2666 // for this caller edge.
2667 std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
2668 CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
2669 for (auto &CalleeEdge : Node->CalleeEdges)
2670 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
2671 CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
2672
2673 // Don't clone if doing so will not disambiguate any alloc types amongst
2674 // caller edges (including the callee edges that would be cloned).
2675 // Otherwise we will simply move all edges to the clone.
2676 //
2677 // First check if by cloning we will disambiguate the caller allocation
2678 // type from node's allocation type. Query allocTypeToUse so that we don't
2679 // bother cloning to distinguish NotCold+Cold from NotCold. Note that
2680 // neither of these should be None type.
2681 //
2682 // Then check if by cloning node at least one of the callee edges will be
2683 // disambiguated by splitting out different context ids.
2684 assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
2685 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2686 if (allocTypeToUse(CallerAllocTypeForAlloc) ==
2687 allocTypeToUse(Node->AllocTypes) &&
2688 allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
2689 CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
2690 ++EI;
2691 continue;
2692 }
2693
2694 // First see if we can use an existing clone. Check each clone and its
2695 // callee edges for matching alloc types.
2696 ContextNode *Clone = nullptr;
2697 for (auto *CurClone : Node->Clones) {
2698 if (allocTypeToUse(CurClone->AllocTypes) !=
2699 allocTypeToUse(CallerAllocTypeForAlloc))
2700 continue;
2701
2702 if (!allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
2703 CalleeEdgeAllocTypesForCallerEdge, CurClone->CalleeEdges))
2704 continue;
2705 Clone = CurClone;
2706 break;
2707 }
2708
2709 // The edge iterator is adjusted when we move the CallerEdge to the clone.
2710 if (Clone)
2711 moveEdgeToExistingCalleeClone(CallerEdge, Clone, &EI, /*NewClone=*/false,
2712 CallerEdgeContextsForAlloc);
2713 else
2714 Clone =
2715 moveEdgeToNewCalleeClone(CallerEdge, &EI, CallerEdgeContextsForAlloc);
2716
2717 assert(EI == Node->CallerEdges.end() ||
2718 Node->AllocTypes != (uint8_t)AllocationType::None);
2719 // Sanity check that no alloc types on clone or its edges are None.
2720 assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
2721 }
2722
2723 // We should still have some context ids on the original Node.
2724 assert(!Node->ContextIds.empty());
2725
2726 // Sanity check that no alloc types on node or edges are None.
2727 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2728
2729 if (VerifyNodes)
2730 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
2731}
2732
2733void ModuleCallsiteContextGraph::updateAllocationCall(
2734 CallInfo &Call, AllocationType AllocType) {
2735 std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
2736 auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
2737 "memprof", AllocTypeString);
2738 cast<CallBase>(Call.call())->addFnAttr(A);
2739 OREGetter(Call.call()->getFunction())
2740 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
2741 << ore::NV("AllocationCall", Call.call()) << " in clone "
2742 << ore::NV("Caller", Call.call()->getFunction())
2743 << " marked with memprof allocation attribute "
2744 << ore::NV("Attribute", AllocTypeString));
2745}
2746
2747void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
2748 AllocationType AllocType) {
2749 auto *AI = Call.call().dyn_cast<AllocInfo *>();
2750 assert(AI);
2751 assert(AI->Versions.size() > Call.cloneNo());
2752 AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
2753}
2754
2755void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
2756 FuncInfo CalleeFunc) {
2757 if (CalleeFunc.cloneNo() > 0)
2758 cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
2759 OREGetter(CallerCall.call()->getFunction())
2760 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
2761 << ore::NV("Call", CallerCall.call()) << " in clone "
2762 << ore::NV("Caller", CallerCall.call()->getFunction())
2763 << " assigned to call function clone "
2764 << ore::NV("Callee", CalleeFunc.func()));
2765}
2766
2767void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
2768 FuncInfo CalleeFunc) {
2769 auto *CI = CallerCall.call().dyn_cast<CallsiteInfo *>();
2770 assert(CI &&
2771 "Caller cannot be an allocation which should not have profiled calls");
2772 assert(CI->Clones.size() > CallerCall.cloneNo());
2773 CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();
2774}
2775
2776CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
2777 Instruction *>::FuncInfo
2778ModuleCallsiteContextGraph::cloneFunctionForCallsite(
2779 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
2780 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
2781 // Use existing LLVM facilities for cloning and obtaining Call in clone
2782 ValueToValueMapTy VMap;
2783 auto *NewFunc = CloneFunction(Func.func(), VMap);
2784 std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
2785 assert(!Func.func()->getParent()->getFunction(Name));
2786 NewFunc->setName(Name);
2787 for (auto &Inst : CallsWithMetadataInFunc) {
2788 // This map always has the initial version in it.
2789 assert(Inst.cloneNo() == 0);
2790 CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
2791 }
2792 OREGetter(Func.func())
2793 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
2794 << "created clone " << ore::NV("NewFunction", NewFunc));
2795 return {NewFunc, CloneNo};
2796}
2797
2798CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
2799 IndexCall>::FuncInfo
2800IndexCallsiteContextGraph::cloneFunctionForCallsite(
2801 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
2802 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
2803 // Check how many clones we have of Call (and therefore function).
2804 // The next clone number is the current size of versions array.
2805 // Confirm this matches the CloneNo provided by the caller, which is based on
2806 // the number of function clones we have.
2807 assert(CloneNo ==
2808 (Call.call().is<AllocInfo *>()
2809 ? Call.call().dyn_cast<AllocInfo *>()->Versions.size()
2810 : Call.call().dyn_cast<CallsiteInfo *>()->Clones.size()));
2811 // Walk all the instructions in this function. Create a new version for
2812 // each (by adding an entry to the Versions/Clones summary array), and copy
2813 // over the version being called for the function clone being cloned here.
2814 // Additionally, add an entry to the CallMap for the new function clone,
2815 // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
2816 // to the new call clone.
2817 for (auto &Inst : CallsWithMetadataInFunc) {
2818 // This map always has the initial version in it.
2819 assert(Inst.cloneNo() == 0);
2820 if (auto *AI = Inst.call().dyn_cast<AllocInfo *>()) {
2821 assert(AI->Versions.size() == CloneNo);
2822 // We assign the allocation type later (in updateAllocationCall), just add
2823 // an entry for it here.
2824 AI->Versions.push_back(0);
2825 } else {
2826 auto *CI = Inst.call().dyn_cast<CallsiteInfo *>();
2827 assert(CI && CI->Clones.size() == CloneNo);
2828 // We assign the clone number later (in updateCall), just add an entry for
2829 // it here.
2830 CI->Clones.push_back(0);
2831 }
2832 CallMap[Inst] = {Inst.call(), CloneNo};
2833 }
2834 return {Func.func(), CloneNo};
2835}
2836
2837// This method assigns cloned callsites to functions, cloning the functions as
2838// needed. The assignment is greedy and proceeds roughly as follows:
2839//
2840// For each function Func:
2841// For each call with graph Node having clones:
2842// Initialize ClonesWorklist to Node and its clones
2843// Initialize NodeCloneCount to 0
2844// While ClonesWorklist is not empty:
2845// Clone = pop front ClonesWorklist
2846// NodeCloneCount++
2847// If Func has been cloned less than NodeCloneCount times:
2848// If NodeCloneCount is 1:
2849// Assign Clone to original Func
2850// Continue
2851// Create a new function clone
2852// If other callers not assigned to call a function clone yet:
2853// Assign them to call new function clone
2854// Continue
2855// Assign any other caller calling the cloned version to new clone
2856//
2857// For each caller of Clone:
2858// If caller is assigned to call a specific function clone:
2859// If we cannot assign Clone to that function clone:
2860// Create new callsite Clone NewClone
2861// Add NewClone to ClonesWorklist
2862// Continue
2863// Assign Clone to existing caller's called function clone
2864// Else:
2865// If Clone not already assigned to a function clone:
2866// Assign to first function clone without assignment
2867// Assign caller to selected function clone
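//
// Hedged walk-through of the pseudocode above (hypothetical scenario, not
// taken from a real profile): suppose Func contains a single callsite whose
// graph Node has one clone, Node'. Node is processed first and is assigned to
// the original Func. When Node' is processed only one copy of Func exists, so
// a new function clone Func.1 is created, Node' is assigned to it, and the
// callers of Node' are recorded as calling Func.1. If one of those callers
// had already been assigned to a different clone of Func, Node' would instead
// be cloned again and the new callsite clone revisited on a later iteration
// of the worklist.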
2868template <typename DerivedCCG, typename FuncTy, typename CallTy>
2869bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
2870 bool Changed = false;
2871
2872 // Keep track of the assignment of nodes (callsites) to function clones they
2873 // call.
2874 DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
2875
2876 // Update caller node to call function version CalleeFunc, by recording the
2877 // assignment in CallsiteToCalleeFuncCloneMap.
2878 auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
2879 const FuncInfo &CalleeFunc) {
2880 assert(Caller->hasCall());
2881 CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
2882 };
2883
2884 // Walk all functions for which we saw calls with memprof metadata, and handle
2885 // cloning for each of their calls.
2886 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
2887 FuncInfo OrigFunc(Func);
2888 // Map from each clone of OrigFunc to a map of remappings of each call of
2889 // interest (from original uncloned call to the corresponding cloned call in
2890 // that function clone).
2891 std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap;
2892 for (auto &Call : CallsWithMetadata) {
2893 ContextNode *Node = getNodeForInst(Call);
2894 // Skip call if we do not have a node for it (all uses of its stack ids
2895 // were either on inlined chains or pruned from the MIBs), or if we did
2896 // not create any clones for it.
2897 if (!Node || Node->Clones.empty())
2898 continue;
2899 assert(Node->hasCall() &&
2900 "Not having a call should have prevented cloning");
2901
2902 // Track the assignment of function clones to clones of the current
2903 // callsite Node being handled.
2904 std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
2905
2906 // Assign callsite version CallsiteClone to function version FuncClone,
2907 // and also assign (possibly cloned) Call to CallsiteClone.
2908 auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
2909 CallInfo &Call,
2910 ContextNode *CallsiteClone,
2911 bool IsAlloc) {
2912 // Record the clone of callsite node assigned to this function clone.
2913 FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
2914
2915 assert(FuncClonesToCallMap.count(FuncClone));
2916 std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone];
2917 CallInfo CallClone(Call);
2918 if (CallMap.count(Call))
2919 CallClone = CallMap[Call];
2920 CallsiteClone->setCall(CallClone);
2921 };
2922
2923 // Keep track of the clones of callsite Node that need to be assigned to
2924 // function clones. This list may be expanded in the loop body below if we
2925 // find additional cloning is required.
2926 std::deque<ContextNode *> ClonesWorklist;
2927 // Ignore original Node if we moved all of its contexts to clones.
2928 if (!Node->ContextIds.empty())
2929 ClonesWorklist.push_back(Node);
2930 ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(),
2931 Node->Clones.end());
2932
2933 // Now walk through all of the clones of this callsite Node that we need,
2934 // and determine the assignment to a corresponding clone of the current
2935 // function (creating new function clones as needed).
2936 unsigned NodeCloneCount = 0;
2937 while (!ClonesWorklist.empty()) {
2938 ContextNode *Clone = ClonesWorklist.front();
2939 ClonesWorklist.pop_front();
2940 NodeCloneCount++;
2941 if (VerifyNodes)
2942 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
2943
2944 // Need to create a new function clone if we have more callsite clones
2945 // than existing function clones, which would have been assigned to an
2946 // earlier clone in the list (we assign callsite clones to function
2947 // clones greedily).
2948 if (FuncClonesToCallMap.size() < NodeCloneCount) {
2949 // If this is the first callsite copy, assign to original function.
2950 if (NodeCloneCount == 1) {
2951 // Since FuncClonesToCallMap is empty in this case, no clones have
2952 // been created for this function yet, and no callers should have
2953 // been assigned a function clone for this callee node yet.
2954 assert(llvm::none_of(
2955 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
2956 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
2957 }));
2958 // Initialize with empty call map, assign Clone to original function
2959 // and its callers, and skip to the next clone.
2960 FuncClonesToCallMap[OrigFunc] = {};
2961 AssignCallsiteCloneToFuncClone(
2962 OrigFunc, Call, Clone,
2963 AllocationCallToContextNodeMap.count(Call));
2964 for (auto &CE : Clone->CallerEdges) {
2965 // Ignore any caller that does not have a recorded callsite Call.
2966 if (!CE->Caller->hasCall())
2967 continue;
2968 RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
2969 }
2970 continue;
2971 }
2972
2973 // First locate which copy of OrigFunc to clone again. If a caller
2974 // of this callsite clone was already assigned to call a particular
2975 // function clone, we need to redirect all of those callers to the
2976 // new function clone, and update their other callees within this
2977 // function.
2978 FuncInfo PreviousAssignedFuncClone;
2979 auto EI = llvm::find_if(
2980 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
2981 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
2982 });
2983 bool CallerAssignedToCloneOfFunc = false;
2984 if (EI != Clone->CallerEdges.end()) {
2985 const std::shared_ptr<ContextEdge> &Edge = *EI;
2986 PreviousAssignedFuncClone =
2987 CallsiteToCalleeFuncCloneMap[Edge->Caller];
2988 CallerAssignedToCloneOfFunc = true;
2989 }
2990
2991 // Clone function and save it along with the CallInfo map created
2992 // during cloning in the FuncClonesToCallMap.
2993 std::map<CallInfo, CallInfo> NewCallMap;
2994 unsigned CloneNo = FuncClonesToCallMap.size();
2995 assert(CloneNo > 0 && "Clone 0 is the original function, which "
2996 "should already exist in the map");
2997 FuncInfo NewFuncClone = cloneFunctionForCallsite(
2998 OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
2999 FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap));
3000 FunctionClonesAnalysis++;
3001 Changed = true;
3002
3003 // If no caller callsites were already assigned to a clone of this
3004 // function, we can simply assign this clone to the new func clone
3005 // and update all callers to it, then skip to the next clone.
3006 if (!CallerAssignedToCloneOfFunc) {
3007 AssignCallsiteCloneToFuncClone(
3008 NewFuncClone, Call, Clone,
3009 AllocationCallToContextNodeMap.count(Call));
3010 for (auto &CE : Clone->CallerEdges) {
3011 // Ignore any caller that does not have a recorded callsite Call.
3012 if (!CE->Caller->hasCall())
3013 continue;
3014 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
3015 }
3016 continue;
3017 }
3018
3019 // We may need to do additional node cloning in this case.
3020 // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
3021 // that were previously assigned to call PreviousAssignedFuncClone,
3022 // to record that they now call NewFuncClone.
3023 for (auto CE : Clone->CallerEdges) {
3024 // Skip any that have been removed on an earlier iteration.
3025 if (!CE)
3026 continue;
3027 // Ignore any caller that does not have a recorded callsite Call.
3028 if (!CE->Caller->hasCall())
3029 continue;
3030
3031 if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
3032 // We subsequently fall through to later handling that
3033 // will perform any additional cloning required for
3034 // callers that were calling other function clones.
3035 CallsiteToCalleeFuncCloneMap[CE->Caller] !=
3036 PreviousAssignedFuncClone)
3037 continue;
3038
3039 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
3040
3041 // If we are cloning a function that was already assigned to some
3042 // callers, then essentially we are creating new callsite clones
3043 // of the other callsites in that function that are reached by those
3044 // callers. Clone the other callees of the current callsite's caller
3045 // that were already assigned to PreviousAssignedFuncClone
3046 // accordingly. This is important since we subsequently update the
3047 // calls from the nodes in the graph and their assignments to callee
3048 // functions recorded in CallsiteToCalleeFuncCloneMap.
3049 for (auto CalleeEdge : CE->Caller->CalleeEdges) {
3050 // Skip any that have been removed on an earlier iteration when
3051 // cleaning up newly None type callee edges.
3052 if (!CalleeEdge)
3053 continue;
3054 ContextNode *Callee = CalleeEdge->Callee;
3055 // Skip the current callsite, we are looking for other
3056 // callsites Caller calls, as well as any that does not have a
3057 // recorded callsite Call.
3058 if (Callee == Clone || !Callee->hasCall())
3059 continue;
3060 ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge);
3061 removeNoneTypeCalleeEdges(NewClone);
3062 // Moving the edge may have resulted in some none type
3063 // callee edges on the original Callee.
3064 removeNoneTypeCalleeEdges(Callee);
3065 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
3066 // If the Callee node was already assigned to call a specific
3067 // function version, make sure its new clone is assigned to call
3068 // that same function clone.
3069 if (CallsiteToCalleeFuncCloneMap.count(Callee))
3070 RecordCalleeFuncOfCallsite(
3071 NewClone, CallsiteToCalleeFuncCloneMap[Callee]);
3072 // Update NewClone with the new Call clone of this callsite's Call
3073 // created for the new function clone created earlier.
3074 // Recall that we have already ensured when building the graph
3075 // that each caller can only call callsites within the same
3076 // function, so we are guaranteed that Callee Call is in the
3077 // current OrigFunc.
3078 // CallMap is set up as indexed by original Call at clone 0.
3079 CallInfo OrigCall(Callee->getOrigNode()->Call);
3080 OrigCall.setCloneNo(0);
3081 std::map<CallInfo, CallInfo> &CallMap =
3082 FuncClonesToCallMap[NewFuncClone];
3083 assert(CallMap.count(OrigCall));
3084 CallInfo NewCall(CallMap[OrigCall]);
3085 assert(NewCall);
3086 NewClone->setCall(NewCall);
3087 }
3088 }
3089 // Fall through to handling below to perform the recording of the
3090 // function for this callsite clone. This enables handling of cases
3091 // where the callers were assigned to different clones of a function.
3092 }
3093
3094 // See if we can use existing function clone. Walk through
3095 // all caller edges to see if any have already been assigned to
3096 // a clone of this callsite's function. If we can use it, do so. If not,
3097 // because that function clone is already assigned to a different clone
3098 // of this callsite, then we need to clone again.
3099 // Basically, this checking is needed to handle the case where different
3100 // caller functions/callsites may need versions of this function
3101 // containing different mixes of callsite clones across the different
3102 // callsites within the function. If that happens, we need to create
3103 // additional function clones to handle the various combinations.
3104 //
3105 // Keep track of any new clones of this callsite created by the
3106 // following loop, as well as any existing clone that we decided to
3107 // assign this clone to.
3108 std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
3109 FuncInfo FuncCloneAssignedToCurCallsiteClone;
3110 // We need to be able to remove Edge from CallerEdges, so need to adjust
3111 // iterator in the loop.
3112 for (auto EI = Clone->CallerEdges.begin();
3113 EI != Clone->CallerEdges.end();) {
3114 auto Edge = *EI;
3115 // Ignore any caller that does not have a recorded callsite Call.
3116 if (!Edge->Caller->hasCall()) {
3117 EI++;
3118 continue;
3119 }
3120 // If this caller was already assigned to call a version of OrigFunc, we
3121 // need to ensure we can assign this callsite clone to that function clone.
3122 if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
3123 FuncInfo FuncCloneCalledByCaller =
3124 CallsiteToCalleeFuncCloneMap[Edge->Caller];
3125 // First we need to confirm that this function clone is available
3126 // for use by this callsite node clone.
3127 //
3128 // While FuncCloneToCurNodeCloneMap is built only for this Node and
3129 // its callsite clones, one of those callsite clones X could have
3130 // been assigned to the same function clone called by Edge's caller
3131 // - if Edge's caller calls another callsite within Node's original
3132 // function, and that callsite has another caller reaching clone X.
3133 // We need to clone Node again in this case.
3134 if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
3135 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
3136 Clone) ||
3137 // Detect when we have multiple callers of this callsite that
3138 // have already been assigned to specific, and different, clones
3139 // of OrigFunc (due to other unrelated callsites in Func they
3140 // reach via call contexts). Is this Clone of callsite Node
3141 // assigned to a different clone of OrigFunc? If so, clone Node
3142 // again.
3143 (FuncCloneAssignedToCurCallsiteClone &&
3144 FuncCloneAssignedToCurCallsiteClone !=
3145 FuncCloneCalledByCaller)) {
3146 // We need to use a different newly created callsite clone, in
3147 // order to assign it to another new function clone on a
3148 // subsequent iteration over the Clones array (adjusted below).
3149 // Note we specifically do not reset the
3150 // CallsiteToCalleeFuncCloneMap entry for this caller, so that
3151 // when this new clone is processed later we know which version of
3152 // the function to copy (so that other callsite clones we have
3153 // assigned to that function clone are properly cloned over). See
3154 // comments in the function cloning handling earlier.
3155
3156 // Check if we already have cloned this callsite again while
3157 // walking through caller edges, for a caller calling the same
3158 // function clone. If so, we can move this edge to that new clone
3159 // rather than creating yet another new clone.
3160 if (FuncCloneToNewCallsiteCloneMap.count(
3161 FuncCloneCalledByCaller)) {
3162 ContextNode *NewClone =
3163 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
3164 moveEdgeToExistingCalleeClone(Edge, NewClone, &EI);
3165 // Cleanup any none type edges cloned over.
3166 removeNoneTypeCalleeEdges(NewClone);
3167 } else {
3168 // Create a new callsite clone.
3169 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge, &EI);
3170 removeNoneTypeCalleeEdges(NewClone);
3171 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
3172 NewClone;
3173 // Add to list of clones and process later.
3174 ClonesWorklist.push_back(NewClone);
3175 assert(EI == Clone->CallerEdges.end() ||
3176 Clone->AllocTypes != (uint8_t)AllocationType::None);
3177 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
3178 }
3179 // Moving the caller edge may have resulted in some none type
3180 // callee edges.
3181 removeNoneTypeCalleeEdges(Clone);
3182 // We will handle the newly created callsite clone in a subsequent
3183 // iteration over this Node's Clones. Continue here since we
3184 // already adjusted iterator EI while moving the edge.
3185 continue;
3186 }
3187
3188 // Otherwise, we can use the function clone already assigned to this
3189 // caller.
3190 if (!FuncCloneAssignedToCurCallsiteClone) {
3191 FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
3192 // Assign Clone to FuncCloneCalledByCaller
3193 AssignCallsiteCloneToFuncClone(
3194 FuncCloneCalledByCaller, Call, Clone,
3195 AllocationCallToContextNodeMap.count(Call));
3196 } else
3197 // Don't need to do anything - callsite is already calling this
3198 // function clone.
3199 assert(FuncCloneAssignedToCurCallsiteClone ==
3200 FuncCloneCalledByCaller);
3201
3202 } else {
3203 // We have not already assigned this caller to a version of
3204 // OrigFunc. Do the assignment now.
3205
3206 // First check if we have already assigned this callsite clone to a
3207 // clone of OrigFunc for another caller during this iteration over
3208 // its caller edges.
3209 if (!FuncCloneAssignedToCurCallsiteClone) {
3210 // Find first function in FuncClonesToCallMap without an assigned
3211 // clone of this callsite Node. We should always have one
3212 // available at this point due to the earlier cloning when the
3213 // FuncClonesToCallMap size was smaller than the clone number.
3214 for (auto &CF : FuncClonesToCallMap) {
3215 if (!FuncCloneToCurNodeCloneMap.count(CF.first)) {
3216 FuncCloneAssignedToCurCallsiteClone = CF.first;
3217 break;
3218 }
3219 }
3220 assert(FuncCloneAssignedToCurCallsiteClone);
3221 // Assign Clone to FuncCloneAssignedToCurCallsiteClone
3222 AssignCallsiteCloneToFuncClone(
3223 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
3224 AllocationCallToContextNodeMap.count(Call));
3225 } else
3226 assert(FuncCloneToCurNodeCloneMap
3227 [FuncCloneAssignedToCurCallsiteClone] == Clone);
3228 // Update callers to record function version called.
3229 RecordCalleeFuncOfCallsite(Edge->Caller,
3230 FuncCloneAssignedToCurCallsiteClone);
3231 }
3232
3233 EI++;
3234 }
3235 }
3236 if (VerifyCCG) {
3237 checkNode<DerivedCCG, FuncTy, CallTy>(Node);
3238 for (const auto &PE : Node->CalleeEdges)
3239 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
3240 for (const auto &CE : Node->CallerEdges)
3241 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
3242 for (auto *Clone : Node->Clones) {
3243 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
3244 for (const auto &PE : Clone->CalleeEdges)
3245 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
3246 for (const auto &CE : Clone->CallerEdges)
3247 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
3248 }
3249 }
3250 }
3251 }
3252
3253 auto UpdateCalls = [&](ContextNode *Node,
3254 DenseSet<const ContextNode *> &Visited,
3255 auto &&UpdateCalls) {
3256 auto Inserted = Visited.insert(Node);
3257 if (!Inserted.second)
3258 return;
3259
3260 for (auto *Clone : Node->Clones)
3261 UpdateCalls(Clone, Visited, UpdateCalls);
3262
3263 for (auto &Edge : Node->CallerEdges)
3264 UpdateCalls(Edge->Caller, Visited, UpdateCalls);
3265
3266 // Skip if either no call to update, or if we ended up with no context ids
3267 // (we moved all edges onto other clones).
3268 if (!Node->hasCall() || Node->ContextIds.empty())
3269 return;
3270
3271 if (Node->IsAllocation) {
3272 updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes));
3273 return;
3274 }
3275
3276 if (!CallsiteToCalleeFuncCloneMap.count(Node))
3277 return;
3278
3279 auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
3280 updateCall(Node->Call, CalleeFunc);
3281 };
3282
3283 // Performs DFS traversal starting from allocation nodes to update calls to
3284 // reflect cloning decisions recorded earlier. For regular LTO this will
3285 // update the actual calls in the IR to call the appropriate function clone
3286 // (and add attributes to allocation calls), whereas for ThinLTO the decisions
3287 // are recorded in the summary entries.
3288 DenseSet<const ContextNode *> Visited;
3289 for (auto &Entry : AllocationCallToContextNodeMap)
3290 UpdateCalls(Entry.second, Visited, UpdateCalls);
3291
3292 return Changed;
3293}
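// Illustrative end state for regular LTO (function and symbol names here are
// hypothetical): after assignFunctions() and the call updates above, the IR
// for a context that is cold only through one caller might look roughly like
//   ; in clone foo.memprof.1, reached only by the cold context
//   %p = call ptr @_Znwm(i64 8) #1   ; attribute group #1 contains "memprof"="cold"
//   ; in the caller selected for that context
//   call void @foo.memprof.1()
// while the original foo keeps a "memprof"="notcold" attribute for the
// remaining contexts. The attribute is only a marker; switching to the actual
// hot/cold operator new interface happens in a later transformation.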
3294
3295static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
3296 Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
3297 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
3298 &FuncToAliasMap) {
3299 // The first "clone" is the original copy, we should only call this if we
3300 // needed to create new clones.
3301 assert(NumClones > 1);
3302 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
3303 VMaps.reserve(NumClones - 1);
3304 FunctionsClonedThinBackend++;
3305 for (unsigned I = 1; I < NumClones; I++) {
3306 VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
3307 auto *NewF = CloneFunction(&F, *VMaps.back());
3308 FunctionClonesThinBackend++;
3309 // Strip memprof and callsite metadata from clone as they are no longer
3310 // needed.
3311 for (auto &BB : *NewF) {
3312 for (auto &Inst : BB) {
3313 Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
3314 Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
3315 }
3316 }
3317 std::string Name = getMemProfFuncName(F.getName(), I);
3318 auto *PrevF = M.getFunction(Name);
3319 if (PrevF) {
3320 // We might have created this when adjusting callsite in another
3321 // function. It should be a declaration.
3322 assert(PrevF->isDeclaration());
3323 NewF->takeName(PrevF);
3324 PrevF->replaceAllUsesWith(NewF);
3325 PrevF->eraseFromParent();
3326 } else
3327 NewF->setName(Name);
3328 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
3329 << "created clone " << ore::NV("NewFunction", NewF));
3330
3331 // Now handle aliases to this function, and clone those as well.
3332 if (!FuncToAliasMap.count(&F))
3333 continue;
3334 for (auto *A : FuncToAliasMap[&F]) {
3335 std::string Name = getMemProfFuncName(A->getName(), I);
3336 auto *PrevA = M.getNamedAlias(Name);
3337 auto *NewA = GlobalAlias::create(A->getValueType(),
3338 A->getType()->getPointerAddressSpace(),
3339 A->getLinkage(), Name, NewF);
3340 NewA->copyAttributesFrom(A);
3341 if (PrevA) {
3342 // We might have created this when adjusting callsite in another
3343 // function. It should be a declaration.
3344 assert(PrevA->isDeclaration());
3345 NewA->takeName(PrevA);
3346 PrevA->replaceAllUsesWith(NewA);
3347 PrevA->eraseFromParent();
3348 }
3349 }
3350 }
3351 return VMaps;
3352}
3353
3354// Locate the summary for F. This is complicated by the fact that it might
3355// have been internalized or promoted.
3356static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
3357 const ModuleSummaryIndex *ImportSummary) {
3358 // FIXME: Ideally we would retain the original GUID in some fashion on the
3359 // function (e.g. as metadata), but for now do our best to locate the
3360 // summary without that information.
3361 ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
3362 if (!TheFnVI)
3363 // See if theFn was internalized, by checking index directly with
3364 // original name (this avoids the name adjustment done by getGUID() for
3365 // internal symbols).
3366 TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(F.getName()));
3367 if (TheFnVI)
3368 return TheFnVI;
3369 // Now query with the original name before any promotion was performed.
3370 StringRef OrigName =
3371 ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
3372 std::string OrigId = GlobalValue::getGlobalIdentifier(
3373 OrigName, GlobalValue::InternalLinkage, M.getSourceFileName());
3374 TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(OrigId));
3375 if (TheFnVI)
3376 return TheFnVI;
3377 // Could be a promoted local imported from another module. We need to pass
3378 // down more info here to find the original module id. For now, try with
3379 // the OrigName which might have been stored in the OidGuidMap in the
3380 // index. This would not work if there were same-named locals in multiple
3381 // modules, however.
3382 auto OrigGUID =
3383 ImportSummary->getGUIDFromOriginalID(GlobalValue::getGUID(OrigName));
3384 if (OrigGUID)
3385 TheFnVI = ImportSummary->getValueInfo(OrigGUID);
3386 return TheFnVI;
3387}
3388
3389bool MemProfContextDisambiguation::applyImport(Module &M) {
3390 assert(ImportSummary);
3391 bool Changed = false;
3392
3393 auto IsMemProfClone = [](const Function &F) {
3394 return F.getName().contains(MemProfCloneSuffix);
3395 };
3396
3397 // We also need to clone any aliases that reference cloned functions, because
3398 // the modified callsites may invoke via the alias. Keep track of the aliases
3399 // for each function.
3400 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
3401 FuncToAliasMap;
3402 for (auto &A : M.aliases()) {
3403 auto *Aliasee = A.getAliaseeObject();
3404 if (auto *F = dyn_cast<Function>(Aliasee))
3405 FuncToAliasMap[F].insert(&A);
3406 }
3407
3408 for (auto &F : M) {
3409 if (F.isDeclaration() || IsMemProfClone(F))
3410 continue;
3411
3412 OptimizationRemarkEmitter ORE(&F);
3413
3414 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
3415 bool ClonesCreated = false;
3416 unsigned NumClonesCreated = 0;
3417 auto CloneFuncIfNeeded = [&](unsigned NumClones) {
3418 // We should at least have version 0 which is the original copy.
3419 assert(NumClones > 0);
3420 // If only one copy needed use original.
3421 if (NumClones == 1)
3422 return;
3423 // If we already performed cloning of this function, confirm that the
3424 // requested number of clones matches (the thin link should ensure the
3425 // number of clones for each constituent callsite is consistent within
3426 // each function), before returning.
3427 if (ClonesCreated) {
3428 assert(NumClonesCreated == NumClones);
3429 return;
3430 }
3431 VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap);
3432 // The first "clone" is the original copy, which doesn't have a VMap.
3433 assert(VMaps.size() == NumClones - 1);
3434 Changed = true;
3435 ClonesCreated = true;
3436 NumClonesCreated = NumClones;
3437 };
3438
3439 auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
3440 Function *CalledFunction) {
3441 // Perform cloning if not yet done.
3442 CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());
3443
3444 // Should have skipped indirect calls via mayHaveMemprofSummary.
3445 assert(CalledFunction);
3446 assert(!IsMemProfClone(*CalledFunction));
3447
3448 // Update the calls per the summary info.
3449 // Save orig name since it gets updated in the first iteration
3450 // below.
3451 auto CalleeOrigName = CalledFunction->getName();
3452 for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
3453 // Do nothing if this version calls the original version of its
3454 // callee.
3455 if (!StackNode.Clones[J])
3456 continue;
3457 auto NewF = M.getOrInsertFunction(
3458 getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
3459 CalledFunction->getFunctionType());
3460 CallBase *CBClone;
3461 // Copy 0 is the original function.
3462 if (!J)
3463 CBClone = CB;
3464 else
3465 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
3466 CBClone->setCalledFunction(NewF);
3467 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
3468 << ore::NV("Call", CBClone) << " in clone "
3469 << ore::NV("Caller", CBClone->getFunction())
3470 << " assigned to call function clone "
3471 << ore::NV("Callee", NewF.getCallee()));
3472 }
3473 };
3474
3475 // Locate the summary for F.
3476 ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
3477 // If not found, this could be an imported local (see comment in
3478 // findValueInfoForFunc). Skip for now as it will be cloned in its original
3479 // module (where it would have been promoted to global scope so should
3480 // satisfy any reference in this module).
3481 if (!TheFnVI)
3482 continue;
3483
3484 auto *GVSummary =
3485 ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
3486 if (!GVSummary) {
3487 // Must have been imported, use the summary which matches the definition
3488 // (might be multiple if this was a linkonce_odr).
3489 auto SrcModuleMD = F.getMetadata("thinlto_src_module");
3490 assert(SrcModuleMD &&
3491 "enable-import-metadata is needed to emit thinlto_src_module");
3492 StringRef SrcModule =
3493 dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString();
3494 for (auto &GVS : TheFnVI.getSummaryList()) {
3495 if (GVS->modulePath() == SrcModule) {
3496 GVSummary = GVS.get();
3497 break;
3498 }
3499 }
3500 assert(GVSummary && GVSummary->modulePath() == SrcModule);
3501 }
3502
3503 // If this was an imported alias skip it as we won't have the function
3504 // summary, and it should be cloned in the original module.
3505 if (isa<AliasSummary>(GVSummary))
3506 continue;
3507
3508 auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
3509
3510 if (FS->allocs().empty() && FS->callsites().empty())
3511 continue;
3512
3513 auto SI = FS->callsites().begin();
3514 auto AI = FS->allocs().begin();
3515
3516 // To handle callsite infos synthesized for tail calls which have missing
3517 // frames in the profiled context, map callee VI to the synthesized callsite
3518 // info.
3519 DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite;
3520 // Iterate the callsites for this function in reverse, since we place all
3521 // those synthesized for tail calls at the end.
3522 for (auto CallsiteIt = FS->callsites().rbegin();
3523 CallsiteIt != FS->callsites().rend(); CallsiteIt++) {
3524 auto &Callsite = *CallsiteIt;
3525 // Stop as soon as we see a non-synthesized callsite info (see comment
3526 // above loop). All the entries added for discovered tail calls have empty
3527 // stack ids.
3528 if (!Callsite.StackIdIndices.empty())
3529 break;
3530 MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
3531 }
3532
3533 // Assume for now that the instructions are in the exact same order
3534 // as when the summary was created, but confirm this is correct by
3535 // matching the stack ids.
3536 for (auto &BB : F) {
3537 for (auto &I : BB) {
3538 auto *CB = dyn_cast<CallBase>(&I);
3539 // Same handling as when creating module summary.
3540 if (!mayHaveMemprofSummary(CB))
3541 continue;
3542
3543 auto *CalledValue = CB->getCalledOperand();
3544 auto *CalledFunction = CB->getCalledFunction();
3545 if (CalledValue && !CalledFunction) {
3546 CalledValue = CalledValue->stripPointerCasts();
3547 // Stripping pointer casts can reveal a called function.
3548 CalledFunction = dyn_cast<Function>(CalledValue);
3549 }
3550 // Check if this is an alias to a function. If so, get the
3551 // called aliasee for the checks below.
3552 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
3553 assert(!CalledFunction &&
3554 "Expected null called function in callsite for alias");
3555 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
3556 }
3557
3558 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
3559 I.getMetadata(LLVMContext::MD_callsite));
3560 auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
3561
3562 // Include allocs that were already assigned a memprof function
3563 // attribute in the statistics.
3564 if (CB->getAttributes().hasFnAttr("memprof")) {
3565 assert(!MemProfMD);
3566 CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
3567 ? AllocTypeColdThinBackend++
3568 : AllocTypeNotColdThinBackend++;
3569 OrigAllocsThinBackend++;
3570 AllocVersionsThinBackend++;
3571 if (!MaxAllocVersionsThinBackend)
3572 MaxAllocVersionsThinBackend = 1;
3573 // Remove any remaining callsite metadata; we can then skip the rest of
3574 // the handling for this instruction, since no cloning is needed.
3575 I.setMetadata(LLVMContext::MD_callsite, nullptr);
3576 continue;
3577 }
3578
3579 if (MemProfMD) {
3580 // Consult the next alloc node.
3581 assert(AI != FS->allocs().end());
3582 auto &AllocNode = *(AI++);
3583
3584 // Sanity check that the MIB stack ids match between the summary and
3585 // instruction metadata.
3586 auto MIBIter = AllocNode.MIBs.begin();
3587 for (auto &MDOp : MemProfMD->operands()) {
3588 assert(MIBIter != AllocNode.MIBs.end());
3589 LLVM_ATTRIBUTE_UNUSED auto StackIdIndexIter =
3590 MIBIter->StackIdIndices.begin();
3591 auto *MIBMD = cast<const MDNode>(MDOp);
3592 MDNode *StackMDNode = getMIBStackNode(MIBMD);
3593 assert(StackMDNode);
3594 CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
3595 auto ContextIterBegin =
3596 StackContext.beginAfterSharedPrefix(CallsiteContext);
3597 // Skip the checking on the first iteration.
3598 uint64_t LastStackContextId =
3599 (ContextIterBegin != StackContext.end() &&
3600 *ContextIterBegin == 0)
3601 ? 1
3602 : 0;
3603 for (auto ContextIter = ContextIterBegin;
3604 ContextIter != StackContext.end(); ++ContextIter) {
3605 // If this is a direct recursion, simply skip the duplicate
3606 // entries, to be consistent with how the summary ids were
3607 // generated during ModuleSummaryAnalysis.
3608 if (LastStackContextId == *ContextIter)
3609 continue;
3610 LastStackContextId = *ContextIter;
3611 assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
3612 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
3613 *ContextIter);
3614 StackIdIndexIter++;
3615 }
3616 MIBIter++;
3617 }
3618
3619 // Perform cloning if not yet done.
3620 CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size());
3621
3622 OrigAllocsThinBackend++;
3623 AllocVersionsThinBackend += AllocNode.Versions.size();
3624 if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
3625 MaxAllocVersionsThinBackend = AllocNode.Versions.size();
3626
3627 // If there is only one version that means we didn't end up
3628 // considering this function for cloning, and in that case the alloc
3629 // will still be none type or should have gotten the default NotCold.
3630 // Skip that after calling clone helper since that does some sanity
3631 // checks that confirm we haven't decided yet that we need cloning.
3632 if (AllocNode.Versions.size() == 1) {
3633 assert((AllocationType)AllocNode.Versions[0] ==
3634 AllocationType::NotCold ||
3635 (AllocationType)AllocNode.Versions[0] ==
3636 AllocationType::None);
3637 UnclonableAllocsThinBackend++;
3638 continue;
3639 }
3640
3641 // All versions should have a singular allocation type.
3642 assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
3643 return Type == ((uint8_t)AllocationType::NotCold |
3644 (uint8_t)AllocationType::Cold);
3645 }));
3646
3647 // Update the allocation types per the summary info.
3648 for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
3649 // Ignore any that didn't get an assigned allocation type.
3650 if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
3651 continue;
3652 AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
3653 AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
3654 : AllocTypeNotColdThinBackend++;
3655 std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
3656 auto A = llvm::Attribute::get(F.getContext(), "memprof",
3657 AllocTypeString);
3658 CallBase *CBClone;
3659 // Copy 0 is the original function.
3660 if (!J)
3661 CBClone = CB;
3662 else
3663 // Since VMaps are only created for new clones, we index with
3664 // clone J-1 (J==0 is the original clone and does not have a VMaps
3665 // entry).
3666 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
3667 CBClone->addFnAttr(A);
3668 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
3669 << ore::NV("AllocationCall", CBClone) << " in clone "
3670 << ore::NV("Caller", CBClone->getFunction())
3671 << " marked with memprof allocation attribute "
3672 << ore::NV("Attribute", AllocTypeString));
3673 }
3674 } else if (!CallsiteContext.empty()) {
3675 // Consult the next callsite node.
3676 assert(SI != FS->callsites().end());
3677 auto &StackNode = *(SI++);
3678
3679#ifndef NDEBUG
3680 // Sanity check that the stack ids match between the summary and
3681 // instruction metadata.
3682 auto StackIdIndexIter = StackNode.StackIdIndices.begin();
3683 for (auto StackId : CallsiteContext) {
3684 assert(StackIdIndexIter != StackNode.StackIdIndices.end());
3685 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
3686 StackId);
3687 StackIdIndexIter++;
3688 }
3689#endif
3690
3691 CloneCallsite(StackNode, CB, CalledFunction);
3692 } else if (CB->isTailCall()) {
3693 // Locate the synthesized callsite info for the callee VI, if any was
3694 // created, and use that for cloning.
3695 ValueInfo CalleeVI =
3696 findValueInfoForFunc(*CalledFunction, M, ImportSummary);
3697 if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
3698 auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
3699 assert(Callsite != MapTailCallCalleeVIToCallsite.end());
3700 CloneCallsite(Callsite->second, CB, CalledFunction);
3701 }
3702 }
3703 // Memprof and callsite metadata on memory allocations no longer needed.
3704 I.setMetadata(LLVMContext::MD_memprof, nullptr);
3705 I.setMetadata(LLVMContext::MD_callsite, nullptr);
3706 }
3707 }
3708 }
3709
3710 return Changed;
3711}
3712
3713template <typename DerivedCCG, typename FuncTy, typename CallTy>
3714bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
3715 if (DumpCCG) {
3716 dbgs() << "CCG before cloning:\n";
3717 dbgs() << *this;
3718 }
3719 if (ExportToDot)
3720 exportToDot("postbuild");
3721
3722 if (VerifyCCG) {
3723 check();
3724 }
3725
3726 identifyClones();
3727
3728 if (VerifyCCG) {
3729 check();
3730 }
3731
3732 if (DumpCCG) {
3733 dbgs() << "CCG after cloning:\n";
3734 dbgs() << *this;
3735 }
3736 if (ExportToDot)
3737 exportToDot("cloned");
3738
3739 bool Changed = assignFunctions();
3740
3741 if (DumpCCG) {
3742 dbgs() << "CCG after assigning function clones:\n";
3743 dbgs() << *this;
3744 }
3745 if (ExportToDot)
3746 exportToDot("clonefuncassign");
3747
3748 return Changed;
3749}
3750
3751bool MemProfContextDisambiguation::processModule(
3752 Module &M,
3753 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
3754
3755 // If we have an import summary, then the cloning decisions were made during
3756 // the thin link on the index. Apply them and return.
3757 if (ImportSummary)
3758 return applyImport(M);
3759
3760 // TODO: If/when other types of memprof cloning are enabled beyond just for
3761 // hot and cold, we will need to change this to individually control the
3762 // AllocationType passed to addStackNodesForMIB during CCG construction.
3763 // Note that we specifically check this after applying imports above, so that
3764 // the option isn't needed to be passed to distributed ThinLTO backend
3765 // clang processes, which won't necessarily have visibility into the linker
3766 // dependences. Instead the information is communicated from the LTO link to
3767 // the backends via the combined summary index.
3768 if (!SupportsHotColdNew)
3769 return false;
3770
3771 ModuleCallsiteContextGraph CCG(M, OREGetter);
3772 return CCG.process();
3773}
3774
3775MemProfContextDisambiguation::MemProfContextDisambiguation(
3776 const ModuleSummaryIndex *Summary)
3777 : ImportSummary(Summary) {
3778 if (ImportSummary) {
3779 // The MemProfImportSummary should only be used for testing ThinLTO
3780 // distributed backend handling via opt, in which case we don't have a
3781 // summary from the pass pipeline.
3782 assert(MemProfImportSummary.empty());
3783 return;
3784 }
3785 if (MemProfImportSummary.empty())
3786 return;
3787
3788 auto ReadSummaryFile =
3789 errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
3790 if (!ReadSummaryFile) {
3791 logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
3792 "Error loading file '" + MemProfImportSummary +
3793 "': ");
3794 return;
3795 }
3796 auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
3797 if (!ImportSummaryForTestingOrErr) {
3798 logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
3799 "Error parsing file '" + MemProfImportSummary +
3800 "': ");
3801 return;
3802 }
3803 ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
3804 ImportSummary = ImportSummaryForTesting.get();
3805}
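// Usage sketch (approximate, for testing only): the distributed ThinLTO
// backend handling can be exercised through opt roughly as
//   opt -passes=memprof-context-disambiguation \
//       -memprof-import-summary=combined-summary.bc module.bc -S
// where -memprof-import-summary (see MemProfImportSummary) supplies the
// combined index that the pass pipeline would normally provide via the
// constructor above; the summary file name here is hypothetical.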
3806
3807PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
3808 ModuleAnalysisManager &AM) {
3809 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
3810 auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
3811 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
3812 };
3813 if (!processModule(M, OREGetter))
3814 return PreservedAnalyses::all();
3815 return PreservedAnalyses::none();
3816}
3817
3818void MemProfContextDisambiguation::run(
3819 ModuleSummaryIndex &Index,
3820 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
3821 isPrevailing) {
3822 // TODO: If/when other types of memprof cloning are enabled beyond just for
3823 // hot and cold, we will need to change this to individually control the
3824 // AllocationType passed to addStackNodesForMIB during CCG construction.
3825 // The index was set from the option, so these should be in sync.
3826 assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
3827 if (!SupportsHotColdNew)
3828 return;
3829
3830 IndexCallsiteContextGraph CCG(Index, isPrevailing);
3831 CCG.process();
3832}