MemProfContextDisambiguation.cpp
1//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements support for context disambiguation of allocation
10// calls for profile guided heap optimization. Specifically, it uses Memprof
11// profiles which indicate context specific allocation behavior (currently
12// distinguishing cold vs hot memory allocations). Cloning is performed to
13// expose the cold allocation call contexts, and the allocation calls are
14// subsequently annotated with an attribute for later transformation.
15//
16// The transformations can be performed either directly on IR (regular LTO), or
17// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
18// Both types of LTO operate on the same base graph representation, which
19// uses CRTP to support either IR or Index formats.
20//
21//===----------------------------------------------------------------------===//
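//
// Illustrative sketch (not part of the implementation below): suppose a
// helper's allocation is cold when reached from one caller and not cold when
// reached from another. Conceptually the profile and the resulting
// transformation look like:
//
//   char *getBuf() { return new char[N]; }  // !memprof MIBs: {via A: notcold, via B: cold}
//   void A() { use(getBuf()); }             // !callsite metadata
//   void B() { save(getBuf()); }            // !callsite metadata
//
// Cloning creates a second copy of getBuf reached only from B, whose
// allocation call is annotated for cold hinting, while the copy reached from
// A keeps the notcold annotation. The exact clone naming and attribute
// encoding are defined by the code in this file; the snippet above is
// illustrative only.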
22
24#include "llvm/ADT/DenseMap.h"
25#include "llvm/ADT/DenseSet.h"
26#include "llvm/ADT/MapVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
37#include "llvm/IR/Module.h"
39#include "llvm/Pass.h"
43#include "llvm/Transforms/IPO.h"
47#include <deque>
48#include <sstream>
49#include <unordered_map>
50#include <vector>
51using namespace llvm;
52using namespace llvm::memprof;
53
54#define DEBUG_TYPE "memprof-context-disambiguation"
55
56STATISTIC(FunctionClonesAnalysis,
57 "Number of function clones created during whole program analysis");
58STATISTIC(FunctionClonesThinBackend,
59 "Number of function clones created during ThinLTO backend");
60STATISTIC(FunctionsClonedThinBackend,
61 "Number of functions that had clones created during ThinLTO backend");
62STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
63 "cloned) during whole program analysis");
64STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
65 "during whole program analysis");
66STATISTIC(AllocTypeNotColdThinBackend,
67 "Number of not cold static allocations (possibly cloned) during "
68 "ThinLTO backend");
69STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
70 "(possibly cloned) during ThinLTO backend");
71STATISTIC(OrigAllocsThinBackend,
72 "Number of original (not cloned) allocations with memprof profiles "
73 "during ThinLTO backend");
74STATISTIC(
75 AllocVersionsThinBackend,
76 "Number of allocation versions (including clones) during ThinLTO backend");
77STATISTIC(MaxAllocVersionsThinBackend,
78 "Maximum number of allocation versions created for an original "
79 "allocation during ThinLTO backend");
80STATISTIC(UnclonableAllocsThinBackend,
81 "Number of unclonable ambiguous allocations during ThinLTO backend");
82STATISTIC(RemovedEdgesWithMismatchedCallees,
83 "Number of edges removed due to mismatched callees (profiled vs IR)");
84STATISTIC(FoundProfiledCalleeCount,
85 "Number of profiled callees found via tail calls");
86STATISTIC(FoundProfiledCalleeDepth,
87 "Aggregate depth of profiled callees found via tail calls");
88STATISTIC(FoundProfiledCalleeMaxDepth,
89 "Maximum depth of profiled callees found via tail calls");
90STATISTIC(FoundProfiledCalleeNonUniquelyCount,
91 "Number of profiled callees found via multiple tail call chains");
92
93static cl::opt<std::string> DotFilePathPrefix(
94 "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
95 cl::value_desc("filename"),
96 cl::desc("Specify the path prefix of the MemProf dot files."));
97
98static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
99 cl::Hidden,
100 cl::desc("Export graph to dot files."));
101
102static cl::opt<bool>
103 DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
104 cl::desc("Dump CallingContextGraph to stdout after each stage."));
105
106static cl::opt<bool>
107 VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
108 cl::desc("Perform verification checks on CallingContextGraph."));
109
110static cl::opt<bool>
111 VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
112 cl::desc("Perform frequent verification checks on nodes."));
113
114static cl::opt<std::string> MemProfImportSummary(
115 "memprof-import-summary",
116 cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
117 cl::Hidden);
118
119static cl::opt<unsigned>
120 TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
121 cl::Hidden,
122 cl::desc("Max depth to recursively search for missing "
123 "frames through tail calls."));
124
125// Optionally enable cloning of callsites involved in recursive cycles.
126static cl::opt<bool> AllowRecursiveCallsites(
127 "memprof-allow-recursive-callsites", cl::init(false), cl::Hidden,
128 cl::desc("Allow cloning of callsites involved in recursive cycles"));
129
130// When disabled, try to detect and prevent cloning of recursive contexts.
131// This is only necessary until we support cloning through recursive cycles.
132// Leave it on by default for now, since disabling it adds a bit of compile
133// time overhead but does not affect correctness; leaving it on just inflates
134// the cold hinted bytes reported a bit when -memprof-report-hinted-sizes is enabled.
135static cl::opt<bool> AllowRecursiveContexts(
136 "memprof-allow-recursive-contexts", cl::init(true), cl::Hidden,
137 cl::desc("Allow cloning of contexts through recursive cycles"));
138
139namespace llvm {
140cl::opt<bool> EnableMemProfContextDisambiguation(
141 "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
142 cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));
143
144// Indicate we are linking with an allocator that supports hot/cold operator
145// new interfaces.
146cl::opt<bool> SupportsHotColdNew(
147 "supports-hot-cold-new", cl::init(false), cl::Hidden,
148 cl::desc("Linking with hot/cold operator new interfaces"));
149
150cl::opt<bool> MemProfRequireDefinitionForPromotion(
151 "memprof-require-definition-for-promotion", cl::init(false), cl::Hidden,
152 cl::desc(
153 "Require target function definition when promoting indirect calls"));
154} // namespace llvm
155
156extern cl::opt<bool> MemProfReportHintedSizes;
157extern cl::opt<unsigned> MinClonedColdBytePercent;
158
159namespace {
160/// CRTP base for graphs built from either IR or ThinLTO summary index.
161///
162/// The graph represents the call contexts in all memprof metadata on allocation
163/// calls, with nodes for the allocations themselves, as well as for the calls
164/// in each context. The graph is initially built from the allocation memprof
165/// metadata (or summary) MIBs. It is then updated to map calls with callsite
166/// metadata onto the corresponding nodes, reflecting any inlining performed on
167/// those calls.
168///
169/// Each MIB (representing an allocation's call context with allocation
170/// behavior) is assigned a unique context id during the graph build. The edges
171/// and nodes in the graph are decorated with the context ids they carry. This
172/// is used to correctly update the graph when cloning is performed so that we
173/// can uniquify the context for a single (possibly cloned) allocation.
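///
/// For example (illustrative only): given profiled contexts
/// main -> A -> alloc (cold, context id 1) and main -> B -> alloc (notcold,
/// context id 2), the graph contains an allocation node for alloc with caller
/// edges from the stack nodes for A and B decorated with context id sets {1}
/// and {2} respectively, and each of those nodes in turn has a caller edge
/// from the node for main carrying the same single id.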
174template <typename DerivedCCG, typename FuncTy, typename CallTy>
175class CallsiteContextGraph {
176public:
177 CallsiteContextGraph() = default;
178 CallsiteContextGraph(const CallsiteContextGraph &) = default;
179 CallsiteContextGraph(CallsiteContextGraph &&) = default;
180
181 /// Main entry point to perform analysis and transformations on graph.
182 bool process();
183
184 /// Perform cloning on the graph necessary to uniquely identify the allocation
185 /// behavior of an allocation based on its context.
186 void identifyClones();
187
188 /// Assign callsite clones to functions, cloning functions as needed to
189 /// accommodate the combinations of their callsite clones reached by callers.
190 /// For regular LTO this clones functions and callsites in the IR, but for
191 /// ThinLTO the cloning decisions are noted in the summaries and later applied
192 /// in applyImport.
193 bool assignFunctions();
194
195 void dump() const;
196 void print(raw_ostream &OS) const;
197 void printTotalSizes(raw_ostream &OS) const;
198
199 friend raw_ostream &operator<<(raw_ostream &OS,
200 const CallsiteContextGraph &CCG) {
201 CCG.print(OS);
202 return OS;
203 }
204
205 friend struct GraphTraits<
206 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
207 friend struct DOTGraphTraits<
208 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
209
210 void exportToDot(std::string Label) const;
211
212 /// Represents a function clone via FuncTy pointer and clone number pair.
213 struct FuncInfo final
214 : public std::pair<FuncTy *, unsigned /*Clone number*/> {
215 using Base = std::pair<FuncTy *, unsigned>;
216 FuncInfo(const Base &B) : Base(B) {}
217 FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
218 explicit operator bool() const { return this->first != nullptr; }
219 FuncTy *func() const { return this->first; }
220 unsigned cloneNo() const { return this->second; }
221 };
222
223 /// Represents a callsite clone via CallTy and clone number pair.
224 struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
225 using Base = std::pair<CallTy, unsigned>;
226 CallInfo(const Base &B) : Base(B) {}
227 CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
228 : Base(Call, CloneNo) {}
229 explicit operator bool() const { return (bool)this->first; }
230 CallTy call() const { return this->first; }
231 unsigned cloneNo() const { return this->second; }
232 void setCloneNo(unsigned N) { this->second = N; }
233 void print(raw_ostream &OS) const {
234 if (!operator bool()) {
235 assert(!cloneNo());
236 OS << "null Call";
237 return;
238 }
239 call()->print(OS);
240 OS << "\t(clone " << cloneNo() << ")";
241 }
242 void dump() const {
243 print(dbgs());
244 dbgs() << "\n";
245 }
246 friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
247 Call.print(OS);
248 return OS;
249 }
250 };
251
252 struct ContextEdge;
253
254 /// Node in the Callsite Context Graph
255 struct ContextNode {
256 // Keep this for now since in the IR case where we have an Instruction* it
257 // is not as immediately discoverable. Used for printing richer information
258 // when dumping graph.
259 bool IsAllocation;
260
261 // Keeps track of when the Call was reset to null because there was
262 // recursion.
263 bool Recursive = false;
264
265 // This will be formed by ORing together the AllocationType enum values
266 // for contexts including this node.
267 uint8_t AllocTypes = 0;
268
269 // The corresponding allocation or interior call. This is the primary call
270 // for which we have created this node.
271 CallInfo Call;
272
273 // List of other calls that can be treated the same as the primary call
274 // through cloning. I.e. located in the same function and have the same
275 // (possibly pruned) stack ids. They will be updated the same way as the
276 // primary call when assigning to function clones.
277 SmallVector<CallInfo, 0> MatchingCalls;
278
279 // For alloc nodes this is a unique id assigned when constructed, and for
280 // callsite stack nodes it is the original stack id when the node is
281 // constructed from the memprof MIB metadata on the alloc nodes. Note that
282 // this is only used when matching callsite metadata onto the stack nodes
283 // created when processing the allocation memprof MIBs, and for labeling
284 // nodes in the dot graph. Therefore we don't bother to assign a value for
285 // clones.
286 uint64_t OrigStackOrAllocId = 0;
287
288 // Edges to all callees in the profiled call stacks.
289 // TODO: Should this be a map (from Callee node) for more efficient lookup?
290 std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;
291
292 // Edges to all callers in the profiled call stacks.
293 // TODO: Should this be a map (from Caller node) for more efficient lookup?
294 std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
295
296 // Get the list of edges from which we can compute allocation information
297 // such as the context ids and allocation type of this node.
298 const std::vector<std::shared_ptr<ContextEdge>> *
299 getEdgesWithAllocInfo() const {
300 // If node has any callees, compute from those, otherwise compute from
301 // callers (i.e. if this is the leaf allocation node).
302 if (!CalleeEdges.empty())
303 return &CalleeEdges;
304 if (!CallerEdges.empty()) {
305 // A node with caller edges but no callee edges must be the allocation
306 // node.
307 assert(IsAllocation);
308 return &CallerEdges;
309 }
310 return nullptr;
311 }
312
313 // Compute the context ids for this node from the union of its edge context
314 // ids.
315 DenseSet<uint32_t> getContextIds() const {
316 DenseSet<uint32_t> ContextIds;
317 auto *Edges = getEdgesWithAllocInfo();
318 if (!Edges)
319 return {};
320 unsigned Count = 0;
321 for (auto &Edge : *Edges)
322 Count += Edge->getContextIds().size();
323 ContextIds.reserve(Count);
324 for (auto &Edge : *Edges)
325 ContextIds.insert(Edge->getContextIds().begin(),
326 Edge->getContextIds().end());
327 return ContextIds;
328 }
329
330 // Compute the allocation type for this node from the OR of its edge
331 // allocation types.
332 uint8_t computeAllocType() const {
333 auto *Edges = getEdgesWithAllocInfo();
334 if (!Edges)
335 return (uint8_t)AllocationType::None;
336 uint8_t BothTypes =
337 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
338 uint8_t AllocType = (uint8_t)AllocationType::None;
339 for (auto &Edge : *Edges) {
340 AllocType |= Edge->AllocTypes;
341 // Bail early if alloc type reached both, no further refinement.
342 if (AllocType == BothTypes)
343 return AllocType;
344 }
345 return AllocType;
346 }
347
348 // The context ids set for this node is empty if its edge context ids are
349 // also all empty.
350 bool emptyContextIds() const {
351 auto *Edges = getEdgesWithAllocInfo();
352 if (!Edges)
353 return true;
354 for (auto &Edge : *Edges) {
355 if (!Edge->getContextIds().empty())
356 return false;
357 }
358 return true;
359 }
360
361 // List of clones of this ContextNode, initially empty.
362 std::vector<ContextNode *> Clones;
363
364 // If a clone, points to the original uncloned node.
365 ContextNode *CloneOf = nullptr;
366
367 ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}
368
369 ContextNode(bool IsAllocation, CallInfo C)
370 : IsAllocation(IsAllocation), Call(C) {}
371
372 void addClone(ContextNode *Clone) {
373 if (CloneOf) {
374 CloneOf->Clones.push_back(Clone);
375 Clone->CloneOf = CloneOf;
376 } else {
377 Clones.push_back(Clone);
378 assert(!Clone->CloneOf);
379 Clone->CloneOf = this;
380 }
381 }
382
383 ContextNode *getOrigNode() {
384 if (!CloneOf)
385 return this;
386 return CloneOf;
387 }
388
389 void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
390 unsigned int ContextId);
391
392 ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
393 ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
394 void eraseCalleeEdge(const ContextEdge *Edge);
395 void eraseCallerEdge(const ContextEdge *Edge);
396
397 void setCall(CallInfo C) { Call = C; }
398
399 bool hasCall() const { return (bool)Call.call(); }
400
401 void printCall(raw_ostream &OS) const { Call.print(OS); }
402
403 // True if this node was effectively removed from the graph, in which case
404 // it should have an allocation type of None and empty context ids.
405 bool isRemoved() const {
406 assert((AllocTypes == (uint8_t)AllocationType::None) ==
407 emptyContextIds());
408 return AllocTypes == (uint8_t)AllocationType::None;
409 }
410
411 void dump() const;
412 void print(raw_ostream &OS) const;
413
414 friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
415 Node.print(OS);
416 return OS;
417 }
418 };
419
420 /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
421 /// callee.
422 struct ContextEdge {
423 ContextNode *Callee;
424 ContextNode *Caller;
425
426 // This will be formed by ORing together the AllocationType enum values
427 // for contexts including this edge.
428 uint8_t AllocTypes = 0;
429
430 // The set of IDs for contexts including this edge.
431 DenseSet<uint32_t> ContextIds;
432
433 ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
434 DenseSet<uint32_t> ContextIds)
435 : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
436 ContextIds(std::move(ContextIds)) {}
437
438 DenseSet<uint32_t> &getContextIds() { return ContextIds; }
439
440 // Helper to clear the fields of this edge when we are removing it from the
441 // graph.
442 inline void clear() {
443 ContextIds.clear();
444 AllocTypes = (uint8_t)AllocationType::None;
445 Caller = nullptr;
446 Callee = nullptr;
447 }
448
449 // Check if edge was removed from the graph. This is useful while iterating
450 // over a copy of edge lists when performing operations that mutate the
451 // graph in ways that might remove one of the edges.
452 inline bool isRemoved() const {
453 if (Callee || Caller)
454 return false;
455 // Any edges that have been removed from the graph but are still in a
456 // shared_ptr somewhere should have all fields null'ed out by clear()
457 // above.
458 assert(AllocTypes == (uint8_t)AllocationType::None);
459 assert(ContextIds.empty());
460 return true;
461 }
462
463 void dump() const;
464 void print(raw_ostream &OS) const;
465
466 friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
467 Edge.print(OS);
468 return OS;
469 }
470 };
471
472 /// Helpers to remove edges that have allocation type None (due to not
473 /// carrying any context ids) after transformations.
474 void removeNoneTypeCalleeEdges(ContextNode *Node);
475 void removeNoneTypeCallerEdges(ContextNode *Node);
476 void
477 recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
478 DenseSet<const ContextNode *> &Visited);
479
480protected:
481 /// Get a list of nodes corresponding to the stack ids in the given callsite
482 /// context.
483 template <class NodeT, class IteratorT>
484 std::vector<uint64_t>
485 getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);
486
487 /// Adds nodes for the given allocation and any stack ids on its memprof MIB
488 /// metadata (or summary).
489 ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);
490
491 /// Adds nodes for the given MIB stack ids.
492 template <class NodeT, class IteratorT>
493 void addStackNodesForMIB(ContextNode *AllocNode,
494 CallStack<NodeT, IteratorT> &StackContext,
495 CallStack<NodeT, IteratorT> &CallsiteContext,
496 AllocationType AllocType,
497 ArrayRef<ContextTotalSize> ContextSizeInfo);
498
499 /// Matches all callsite metadata (or summary) to the nodes created for
500 /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
501 /// inlining performed on those callsite instructions.
502 void updateStackNodes();
503
504 /// Update graph to conservatively handle any callsite stack nodes that target
505 /// multiple different callee target functions.
506 void handleCallsitesWithMultipleTargets();
507
508 // Try to partition calls on the given node (already placed into the AllCalls
509 // array) by callee function, creating new copies of Node as needed to hold
510 // calls with different callees, and moving the callee edges appropriately.
511 // Returns true if partitioning was successful.
512 bool partitionCallsByCallee(
513 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
514 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode);
515
516 /// Save lists of calls with MemProf metadata in each function, for faster
517 /// iteration.
518 MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
519
520 /// Map from callsite node to the enclosing caller function.
521 std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
522
523private:
524 using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;
525
526 // Structure to keep track of information for each call as we are matching
527 // non-allocation callsites onto context nodes created from the allocation
528 // call metadata / summary contexts.
529 struct CallContextInfo {
530 // The callsite we're trying to match.
531 CallTy Call;
532 // The callsites stack ids that have a context node in the graph.
533 std::vector<uint64_t> StackIds;
534 // The function containing this callsite.
535 const FuncTy *Func;
536 // Initially empty, if needed this will be updated to contain the context
537 // ids for use in a new context node created for this callsite.
538 DenseSet<uint32_t> ContextIds;
539 };
540
541 /// Helper to remove edge from graph, updating edge iterator if it is provided
542 /// (in which case CalleeIter indicates which edge list is being iterated).
543 /// This will also perform the necessary clearing of the ContextEdge members
544 /// to enable later checking if the edge has been removed (since we may have
545 /// other copies of the shared_ptr in existence, and in fact rely on this to
546 /// enable removal while iterating over a copy of a node's edge list).
547 void removeEdgeFromGraph(ContextEdge *Edge, EdgeIter *EI = nullptr,
548 bool CalleeIter = true);
549
550 /// Assigns the given Node to calls at or inlined into the location with
551 /// the Node's stack id, after post order traversing and processing its
552 /// caller nodes. Uses the call information recorded in the given
553 /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
554 /// as needed. Called by updateStackNodes which sets up the given
555 /// StackIdToMatchingCalls map.
556 void assignStackNodesPostOrder(
557 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
558 DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls,
559 DenseMap<CallInfo, CallInfo> &CallToMatchingCall);
560
561 /// Duplicates the given set of context ids, updating the provided
562 /// map from each original id with the newly generated context ids,
563 /// and returning the new duplicated id set.
564 DenseSet<uint32_t> duplicateContextIds(
565 const DenseSet<uint32_t> &StackSequenceContextIds,
566 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
567
568 /// Propagates all duplicated context ids across the graph.
569 void propagateDuplicateContextIds(
570 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
571
572 /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
573 /// else to its callers. Also updates OrigNode's edges to remove any context
574 /// ids moved to the newly created edge.
575 void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
576 bool TowardsCallee,
577 DenseSet<uint32_t> RemainingContextIds);
578
579 /// Get the stack id corresponding to the given Id or Index (for IR this will
580 /// return itself, for a summary index this will return the id recorded in the
581 /// index for that stack id index value).
582 uint64_t getStackId(uint64_t IdOrIndex) const {
583 return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
584 }
585
586 /// Returns true if the given call targets the callee of the given edge, or if
587 /// we were able to identify the call chain through intermediate tail calls.
588 /// In the latter case new context nodes are added to the graph for the
589 /// identified tail calls, and their synthesized nodes are added to
590 /// TailCallToContextNodeMap. The EdgeIter is updated in the latter case for
591 /// the updated edges and to prepare it for an increment in the caller.
592 bool
593 calleesMatch(CallTy Call, EdgeIter &EI,
594 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);
595
596 // Return the callee function of the given call, or nullptr if it can't be
597 // determined
598 const FuncTy *getCalleeFunc(CallTy Call) {
599 return static_cast<DerivedCCG *>(this)->getCalleeFunc(Call);
600 }
601
602 /// Returns true if the given call targets the given function, or if we were
603 /// able to identify the call chain through intermediate tail calls (in which
604 /// case FoundCalleeChain will be populated).
605 bool calleeMatchesFunc(
606 CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
607 std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
608 return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
609 Call, Func, CallerFunc, FoundCalleeChain);
610 }
611
612 /// Returns true if both call instructions have the same callee.
613 bool sameCallee(CallTy Call1, CallTy Call2) {
614 return static_cast<DerivedCCG *>(this)->sameCallee(Call1, Call2);
615 }
616
617 /// Get a list of nodes corresponding to the stack ids in the given
618 /// callsite's context.
619 std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
620 return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
621 Call);
622 }
623
624 /// Get the last stack id in the context for callsite.
625 uint64_t getLastStackId(CallTy Call) {
626 return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
627 }
628
629 /// Update the allocation call to record type of allocated memory.
630 void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
631 AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
632 static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
633 }
634
635 /// Get the AllocationType assigned to the given allocation instruction clone.
636 AllocationType getAllocationCallType(const CallInfo &Call) const {
637 return static_cast<const DerivedCCG *>(this)->getAllocationCallType(Call);
638 }
639
640 /// Update non-allocation call to invoke (possibly cloned) function
641 /// CalleeFunc.
642 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
643 static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
644 }
645
646 /// Clone the given function for the given callsite, recording the mapping of
647 /// all of the function's tracked calls to their new versions in the CallMap.
648 /// Assigns new clones to clone number CloneNo.
649 FuncInfo cloneFunctionForCallsite(
650 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
651 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
652 return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
653 Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
654 }
655
656 /// Gets a label to use in the dot graph for the given call clone in the given
657 /// function.
658 std::string getLabel(const FuncTy *Func, const CallTy Call,
659 unsigned CloneNo) const {
660 return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
661 }
662
663 // Create and return a new ContextNode.
664 ContextNode *createNewNode(bool IsAllocation, const FuncTy *F = nullptr,
665 CallInfo C = CallInfo()) {
666 NodeOwner.push_back(std::make_unique<ContextNode>(IsAllocation, C));
667 auto *NewNode = NodeOwner.back().get();
668 if (F)
669 NodeToCallingFunc[NewNode] = F;
670 return NewNode;
671 }
672
673 /// Helpers to find the node corresponding to the given call or stackid.
674 ContextNode *getNodeForInst(const CallInfo &C);
675 ContextNode *getNodeForAlloc(const CallInfo &C);
676 ContextNode *getNodeForStackId(uint64_t StackId);
677
678 /// Computes the alloc type corresponding to the given context ids, by
679 /// unioning their recorded alloc types.
680 uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds);
681
682 /// Returns the allocation type of the intersection of the contexts of two
683 /// nodes (based on their provided context id sets), optimized for the case
684 /// when Node1Ids is smaller than Node2Ids.
685 uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
686 const DenseSet<uint32_t> &Node2Ids);
687
688 /// Returns the allocation type of the intersection of the contexts of two
689 /// nodes (based on their provided context id sets).
690 uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
691 const DenseSet<uint32_t> &Node2Ids);
692
693 /// Create a clone of Edge's callee and move Edge to that new callee node,
694 /// performing the necessary context id and allocation type updates.
695 /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
696 /// moved to an edge to the new callee.
697 ContextNode *
698 moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
699 DenseSet<uint32_t> ContextIdsToMove = {});
700
701 /// Change the callee of Edge to existing callee clone NewCallee, performing
702 /// the necessary context id and allocation type updates.
703 /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
704 /// moved to an edge to the new callee.
705 void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
706 ContextNode *NewCallee,
707 bool NewClone = false,
708 DenseSet<uint32_t> ContextIdsToMove = {});
709
710 /// Change the caller of the edge at the given callee edge iterator to be
711 /// NewCaller, performing the necessary context id and allocation type
712 /// updates. This is similar to the above moveEdgeToExistingCalleeClone, but
713 /// a simplified version of it as we always move the given edge and all of its
714 /// context ids.
715 void moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
716 ContextNode *NewCaller);
717
718 /// Recursively perform cloning on the graph for the given Node and its
719 /// callers, in order to uniquely identify the allocation behavior of an
720 /// allocation given its context. The context ids of the allocation being
721 /// processed are given in AllocContextIds.
722 void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
723 const DenseSet<uint32_t> &AllocContextIds);
724
725 /// Map from each context ID to the AllocationType assigned to that context.
726 DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;
727
728 /// Map from each contextID to the profiled full contexts and their total
729 /// sizes (there may be more than one due to context trimming),
730 /// optionally populated when requested (via MemProfReportHintedSizes or
731 /// MinClonedColdBytePercent).
732 DenseMap<uint32_t, std::vector<ContextTotalSize>> ContextIdToContextSizeInfos;
733
734 /// Identifies the context node created for a stack id when adding the MIB
735 /// contexts to the graph. This is used to locate the context nodes when
736 /// trying to assign the corresponding callsites with those stack ids to these
737 /// nodes.
738 DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;
739
740 /// Maps to track the calls to their corresponding nodes in the graph.
741 MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
742 MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;
743
744 /// Owner of all ContextNode unique_ptrs.
745 std::vector<std::unique_ptr<ContextNode>> NodeOwner;
746
747 /// Perform sanity checks on graph when requested.
748 void check() const;
749
750 /// Keeps track of the last unique context id assigned.
751 unsigned int LastContextId = 0;
752};
753
754template <typename DerivedCCG, typename FuncTy, typename CallTy>
755using ContextNode =
756 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
757template <typename DerivedCCG, typename FuncTy, typename CallTy>
758using ContextEdge =
759 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
760template <typename DerivedCCG, typename FuncTy, typename CallTy>
761using FuncInfo =
762 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
763template <typename DerivedCCG, typename FuncTy, typename CallTy>
764using CallInfo =
765 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;
766
767/// CRTP derived class for graphs built from IR (regular LTO).
768class ModuleCallsiteContextGraph
769 : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
770 Instruction *> {
771public:
772 ModuleCallsiteContextGraph(
773 Module &M,
774 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
775
776private:
777 friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
778 Instruction *>;
779
780 uint64_t getStackId(uint64_t IdOrIndex) const;
781 const Function *getCalleeFunc(Instruction *Call);
782 bool calleeMatchesFunc(
783 Instruction *Call, const Function *Func, const Function *CallerFunc,
784 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
785 bool sameCallee(Instruction *Call1, Instruction *Call2);
786 bool findProfiledCalleeThroughTailCalls(
787 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
788 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
789 bool &FoundMultipleCalleeChains);
790 uint64_t getLastStackId(Instruction *Call);
791 std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
792 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
793 AllocationType getAllocationCallType(const CallInfo &Call) const;
794 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
795 CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
796 Instruction *>::FuncInfo
797 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
798 std::map<CallInfo, CallInfo> &CallMap,
799 std::vector<CallInfo> &CallsWithMetadataInFunc,
800 unsigned CloneNo);
801 std::string getLabel(const Function *Func, const Instruction *Call,
802 unsigned CloneNo) const;
803
804 const Module &Mod;
805 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
806};
807
808/// Represents a call in the summary index graph, which can either be an
809/// allocation or an interior callsite node in an allocation's context.
810/// Holds a pointer to the corresponding data structure in the index.
811struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
812 IndexCall() : PointerUnion() {}
813 IndexCall(std::nullptr_t) : IndexCall() {}
814 IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
815 IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
816 IndexCall(PointerUnion PT) : PointerUnion(PT) {}
817
818 IndexCall *operator->() { return this; }
819
820 void print(raw_ostream &OS) const {
821 PointerUnion<CallsiteInfo *, AllocInfo *> Base = *this;
822 if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(Base)) {
823 OS << *AI;
824 } else {
825 auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(Base);
826 assert(CI);
827 OS << *CI;
828 }
829 }
830};
831} // namespace
832
833namespace llvm {
834template <> struct simplify_type<IndexCall> {
835 using SimpleType = PointerUnion<CallsiteInfo *, AllocInfo *>;
836 static SimpleType getSimplifiedValue(IndexCall &Val) { return Val; }
837};
838template <> struct simplify_type<const IndexCall> {
839 using SimpleType = const PointerUnion<CallsiteInfo *, AllocInfo *>;
840 static SimpleType getSimplifiedValue(const IndexCall &Val) { return Val; }
841};
842} // namespace llvm
843
844namespace {
845/// CRTP derived class for graphs built from summary index (ThinLTO).
846class IndexCallsiteContextGraph
847 : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
848 IndexCall> {
849public:
850 IndexCallsiteContextGraph(
851 ModuleSummaryIndex &Index,
852 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
853 isPrevailing);
854
855 ~IndexCallsiteContextGraph() {
856 // Now that we are done with the graph it is safe to add the new
857 // CallsiteInfo structs to the function summary vectors. The graph nodes
858 // point into locations within these vectors, so we don't want to add them
859 // any earlier.
860 for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
861 auto *FS = I.first;
862 for (auto &Callsite : I.second)
863 FS->addCallsite(*Callsite.second);
864 }
865 }
866
867private:
868 friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
869 IndexCall>;
870
871 uint64_t getStackId(uint64_t IdOrIndex) const;
872 const FunctionSummary *getCalleeFunc(IndexCall &Call);
873 bool calleeMatchesFunc(
874 IndexCall &Call, const FunctionSummary *Func,
875 const FunctionSummary *CallerFunc,
876 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
877 bool sameCallee(IndexCall &Call1, IndexCall &Call2);
878 bool findProfiledCalleeThroughTailCalls(
879 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
880 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
881 bool &FoundMultipleCalleeChains);
882 uint64_t getLastStackId(IndexCall &Call);
883 std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
884 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
885 AllocationType getAllocationCallType(const CallInfo &Call) const;
886 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
887 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
888 IndexCall>::FuncInfo
889 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
890 std::map<CallInfo, CallInfo> &CallMap,
891 std::vector<CallInfo> &CallsWithMetadataInFunc,
892 unsigned CloneNo);
893 std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
894 unsigned CloneNo) const;
895
896 // Saves mapping from function summaries containing memprof records back to
897 // their VI, for use in checking and debugging.
898 std::map<const FunctionSummary *, ValueInfo> FSToVIMap;
899
900 const ModuleSummaryIndex &Index;
901 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
902 isPrevailing;
903
904 // Saves/owns the callsite info structures synthesized for missing tail call
905 // frames that we discover while building the graph.
906 // It maps from the summary of the function making the tail call, to a map
907 // of callee ValueInfo to corresponding synthesized callsite info.
908 std::unordered_map<FunctionSummary *,
909 std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
910 FunctionCalleesToSynthesizedCallsiteInfos;
911};
912} // namespace
913
914namespace llvm {
915template <>
916struct DenseMapInfo<typename CallsiteContextGraph<
917 ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
918 : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
919template <>
920struct DenseMapInfo<typename CallsiteContextGraph<
921 IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
922 : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
923template <>
924struct DenseMapInfo<IndexCall>
925 : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
926} // end namespace llvm
927
928namespace {
929
930// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
931// type we should actually use on the corresponding allocation.
932// If we can't clone a node that has NotCold+Cold alloc type, we will fall
933// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
934// from NotCold.
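// E.g. NotCold|Cold and NotCold both map to NotCold, while Cold alone maps to
// Cold.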
935AllocationType allocTypeToUse(uint8_t AllocTypes) {
936 assert(AllocTypes != (uint8_t)AllocationType::None);
937 if (AllocTypes ==
938 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
939 return AllocationType::NotCold;
940 else
941 return (AllocationType)AllocTypes;
942}
943
944// Helper to check if the alloc types for all edges recorded in the
945// InAllocTypes vector match the alloc types for all edges in the Edges
946// vector.
947template <typename DerivedCCG, typename FuncTy, typename CallTy>
948bool allocTypesMatch(
949 const std::vector<uint8_t> &InAllocTypes,
950 const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
951 &Edges) {
952 // This should be called only when the InAllocTypes vector was computed for
953 // this set of Edges. Make sure the sizes are the same.
954 assert(InAllocTypes.size() == Edges.size());
955 return std::equal(
956 InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(), Edges.end(),
957 [](const uint8_t &l,
958 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
959 // Can share if one of the edges is None type - don't
960 // care about the type along that edge as it doesn't
961 // exist for those context ids.
962 if (l == (uint8_t)AllocationType::None ||
963 r->AllocTypes == (uint8_t)AllocationType::None)
964 return true;
965 return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
966 });
967}
968
969// Helper to check if the alloc types for all edges recorded in the
970// InAllocTypes vector match the alloc types for callee edges in the given
971// clone. Because the InAllocTypes were computed from the original node's callee
972// edges, and other cloning could have happened after this clone was created, we
973// need to find the matching clone callee edge, which may or may not exist.
974template <typename DerivedCCG, typename FuncTy, typename CallTy>
975bool allocTypesMatchClone(
976 const std::vector<uint8_t> &InAllocTypes,
977 const ContextNode<DerivedCCG, FuncTy, CallTy> *Clone) {
978 const ContextNode<DerivedCCG, FuncTy, CallTy> *Node = Clone->CloneOf;
979 assert(Node);
980 // InAllocTypes should have been computed for the original node's callee
981 // edges.
982 assert(InAllocTypes.size() == Node->CalleeEdges.size());
983 // First create a map of the clone callee edge callees to the edge alloc type.
984 DenseMap<const ContextNode<DerivedCCG, FuncTy, CallTy> *, uint8_t>
985 EdgeCalleeMap;
986 for (const auto &E : Clone->CalleeEdges) {
987 assert(!EdgeCalleeMap.contains(E->Callee));
988 EdgeCalleeMap[E->Callee] = E->AllocTypes;
989 }
990 // Next, walk the original node's callees, and look for the corresponding
991 // clone edge to that callee.
992 for (unsigned I = 0; I < Node->CalleeEdges.size(); I++) {
993 auto Iter = EdgeCalleeMap.find(Node->CalleeEdges[I]->Callee);
994 // Not found is ok, we will simply add an edge if we use this clone.
995 if (Iter == EdgeCalleeMap.end())
996 continue;
997 // Can share if one of the edges is None type - don't
998 // care about the type along that edge as it doesn't
999 // exist for those context ids.
1000 if (InAllocTypes[I] == (uint8_t)AllocationType::None ||
1001 Iter->second == (uint8_t)AllocationType::None)
1002 continue;
1003 if (allocTypeToUse(Iter->second) != allocTypeToUse(InAllocTypes[I]))
1004 return false;
1005 }
1006 return true;
1007}
1008
1009} // end anonymous namespace
1010
1011template <typename DerivedCCG, typename FuncTy, typename CallTy>
1012typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1013CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
1014 const CallInfo &C) {
1015 ContextNode *Node = getNodeForAlloc(C);
1016 if (Node)
1017 return Node;
1018
1019 return NonAllocationCallToContextNodeMap.lookup(C);
1020}
1021
1022template <typename DerivedCCG, typename FuncTy, typename CallTy>
1023typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1024CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
1025 const CallInfo &C) {
1026 return AllocationCallToContextNodeMap.lookup(C);
1027}
1028
1029template <typename DerivedCCG, typename FuncTy, typename CallTy>
1030typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1031CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
1032 uint64_t StackId) {
1033 auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
1034 if (StackEntryNode != StackEntryIdToContextNodeMap.end())
1035 return StackEntryNode->second;
1036 return nullptr;
1037}
1038
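// Add or grow the edge from this node to the given caller: if an edge to
// Caller already exists, fold the new context id and alloc type into it;
// otherwise create a new edge shared by both nodes' edge lists.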
1039template <typename DerivedCCG, typename FuncTy, typename CallTy>
1040void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1041 addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
1042 unsigned int ContextId) {
1043 for (auto &Edge : CallerEdges) {
1044 if (Edge->Caller == Caller) {
1045 Edge->AllocTypes |= (uint8_t)AllocType;
1046 Edge->getContextIds().insert(ContextId);
1047 return;
1048 }
1049 }
1050 std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
1051 this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
1052 CallerEdges.push_back(Edge);
1053 Caller->CalleeEdges.push_back(Edge);
1054}
1055
1056template <typename DerivedCCG, typename FuncTy, typename CallTy>
1057void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::removeEdgeFromGraph(
1058 ContextEdge *Edge, EdgeIter *EI, bool CalleeIter) {
1059 assert(!EI || (*EI)->get() == Edge);
1060 // Save the Caller and Callee pointers so we can erase Edge from their edge
1061 // lists after clearing Edge below. We do the clearing first in case it is
1062 // destructed after removing from the edge lists (if those were the last
1063 // shared_ptr references to Edge).
1064 auto *Callee = Edge->Callee;
1065 auto *Caller = Edge->Caller;
1066
1067 // Make sure the edge fields are cleared out so we can properly detect
1068 // removed edges if Edge is not destructed because there is still a shared_ptr
1069 // reference.
1070 Edge->clear();
1071
1072 if (!EI) {
1073 Callee->eraseCallerEdge(Edge);
1074 Caller->eraseCalleeEdge(Edge);
1075 } else if (CalleeIter) {
1076 Callee->eraseCallerEdge(Edge);
1077 *EI = Caller->CalleeEdges.erase(*EI);
1078 } else {
1079 Caller->eraseCalleeEdge(Edge);
1080 *EI = Callee->CallerEdges.erase(*EI);
1081 }
1082}
1083
1084template <typename DerivedCCG, typename FuncTy, typename CallTy>
1085void CallsiteContextGraph<
1086 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
1087 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
1088 auto Edge = *EI;
1089 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1090 assert(Edge->ContextIds.empty());
1091 removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
1092 } else
1093 ++EI;
1094 }
1095}
1096
1097template <typename DerivedCCG, typename FuncTy, typename CallTy>
1098void CallsiteContextGraph<
1099 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCallerEdges(ContextNode *Node) {
1100 for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
1101 auto Edge = *EI;
1102 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1103 assert(Edge->ContextIds.empty());
1104 Edge->Caller->eraseCalleeEdge(Edge.get());
1105 EI = Node->CallerEdges.erase(EI);
1106 } else
1107 ++EI;
1108 }
1109}
1110
1111template <typename DerivedCCG, typename FuncTy, typename CallTy>
1112typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1113CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1114 findEdgeFromCallee(const ContextNode *Callee) {
1115 for (const auto &Edge : CalleeEdges)
1116 if (Edge->Callee == Callee)
1117 return Edge.get();
1118 return nullptr;
1119}
1120
1121template <typename DerivedCCG, typename FuncTy, typename CallTy>
1122typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1123CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1124 findEdgeFromCaller(const ContextNode *Caller) {
1125 for (const auto &Edge : CallerEdges)
1126 if (Edge->Caller == Caller)
1127 return Edge.get();
1128 return nullptr;
1129}
1130
1131template <typename DerivedCCG, typename FuncTy, typename CallTy>
1132void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1133 eraseCalleeEdge(const ContextEdge *Edge) {
1134 auto EI = llvm::find_if(
1135 CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
1136 return CalleeEdge.get() == Edge;
1137 });
1138 assert(EI != CalleeEdges.end());
1139 CalleeEdges.erase(EI);
1140}
1141
1142template <typename DerivedCCG, typename FuncTy, typename CallTy>
1143void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1144 eraseCallerEdge(const ContextEdge *Edge) {
1145 auto EI = llvm::find_if(
1146 CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
1147 return CallerEdge.get() == Edge;
1148 });
1149 assert(EI != CallerEdges.end());
1150 CallerEdges.erase(EI);
1151}
1152
1153template <typename DerivedCCG, typename FuncTy, typename CallTy>
1154uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
1155 DenseSet<uint32_t> &ContextIds) {
1156 uint8_t BothTypes =
1157 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1158 uint8_t AllocType = (uint8_t)AllocationType::None;
1159 for (auto Id : ContextIds) {
1160 AllocType |= (uint8_t)ContextIdToAllocationType[Id];
1161 // Bail early if alloc type reached both, no further refinement.
1162 if (AllocType == BothTypes)
1163 return AllocType;
1164 }
1165 return AllocType;
1166}
1167
1168template <typename DerivedCCG, typename FuncTy, typename CallTy>
1169uint8_t
1170CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
1171 const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
1172 uint8_t BothTypes =
1173 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1174 uint8_t AllocType = (uint8_t)AllocationType::None;
1175 for (auto Id : Node1Ids) {
1176 if (!Node2Ids.count(Id))
1177 continue;
1178 AllocType |= (uint8_t)ContextIdToAllocationType[Id];
1179 // Bail early if alloc type reached both, no further refinement.
1180 if (AllocType == BothTypes)
1181 return AllocType;
1182 }
1183 return AllocType;
1184}
1185
1186template <typename DerivedCCG, typename FuncTy, typename CallTy>
1187uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
1188 const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
1189 if (Node1Ids.size() < Node2Ids.size())
1190 return intersectAllocTypesImpl(Node1Ids, Node2Ids);
1191 else
1192 return intersectAllocTypesImpl(Node2Ids, Node1Ids);
1193}
1194
1195template <typename DerivedCCG, typename FuncTy, typename CallTy>
1196typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1197CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
1198 CallInfo Call, const FuncTy *F) {
1199 assert(!getNodeForAlloc(Call));
1200 ContextNode *AllocNode = createNewNode(/*IsAllocation=*/true, F, Call);
1201 AllocationCallToContextNodeMap[Call] = AllocNode;
1202 // Use LastContextId as a unique id for MIB allocation nodes.
1203 AllocNode->OrigStackOrAllocId = LastContextId;
1204 // Alloc type should be updated as we add in the MIBs. We should assert
1205 // afterwards that it is not still None.
1206 AllocNode->AllocTypes = (uint8_t)AllocationType::None;
1207
1208 return AllocNode;
1209}
1210
1211static std::string getAllocTypeString(uint8_t AllocTypes) {
1212 if (!AllocTypes)
1213 return "None";
1214 std::string Str;
1215 if (AllocTypes & (uint8_t)AllocationType::NotCold)
1216 Str += "NotCold";
1217 if (AllocTypes & (uint8_t)AllocationType::Cold)
1218 Str += "Cold";
1219 return Str;
1220}
1221
1222template <typename DerivedCCG, typename FuncTy, typename CallTy>
1223template <class NodeT, class IteratorT>
1224void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
1225 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
1226 CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
1227 ArrayRef<ContextTotalSize> ContextSizeInfo) {
1228 // Treat the hot alloc type as NotCold before disambiguation for "hot" is
1229 // done.
1230 if (AllocType == AllocationType::Hot)
1231 AllocType = AllocationType::NotCold;
1232
1233 ContextIdToAllocationType[++LastContextId] = AllocType;
1234
1235 if (!ContextSizeInfo.empty()) {
1236 auto &Entry = ContextIdToContextSizeInfos[LastContextId];
1237 Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end());
1238 }
1239
1240 // Update alloc type and context ids for this MIB.
1241 AllocNode->AllocTypes |= (uint8_t)AllocType;
1242
1243 // Now add or update nodes for each stack id in alloc's context.
1244 // Later when processing the stack ids on non-alloc callsites we will adjust
1245 // for any inlining in the context.
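 // E.g. for a context with stack ids [S1 (closest to the allocation), S2, S3]
 // this creates or reuses nodes N(S1), N(S2), N(S3) and caller edges
 // AllocNode <- N(S1) <- N(S2) <- N(S3), each tagged with the new context id.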
1246 ContextNode *PrevNode = AllocNode;
1247 // Look for recursion (direct recursion should have been collapsed by
1248 // module summary analysis, here we should just be detecting mutual
1249 // recursion). Mark these nodes so we don't try to clone.
1250 SmallSet<uint64_t, 8> StackIdSet;
1251 // Skip any on the allocation call (inlining).
1252 for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
1253 ContextIter != StackContext.end(); ++ContextIter) {
1254 auto StackId = getStackId(*ContextIter);
1255 ContextNode *StackNode = getNodeForStackId(StackId);
1256 if (!StackNode) {
1257 StackNode = createNewNode(/*IsAllocation=*/false);
1258 StackEntryIdToContextNodeMap[StackId] = StackNode;
1259 StackNode->OrigStackOrAllocId = StackId;
1260 }
1261 // Marking a node recursive will prevent its cloning completely, even for
1262 // non-recursive contexts flowing through it.
1263 if (!AllowRecursiveCallsites) {
1264 auto Ins = StackIdSet.insert(StackId);
1265 if (!Ins.second)
1266 StackNode->Recursive = true;
1267 }
1268 StackNode->AllocTypes |= (uint8_t)AllocType;
1269 PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
1270 PrevNode = StackNode;
1271 }
1272}
1273
1274template <typename DerivedCCG, typename FuncTy, typename CallTy>
1275DenseSet<uint32_t>
1276CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
1277 const DenseSet<uint32_t> &StackSequenceContextIds,
1278 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1279 DenseSet<uint32_t> NewContextIds;
1280 for (auto OldId : StackSequenceContextIds) {
1281 NewContextIds.insert(++LastContextId);
1282 OldToNewContextIds[OldId].insert(LastContextId);
1283 assert(ContextIdToAllocationType.count(OldId));
1284 // The new context has the same allocation type as original.
1285 ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
1286 }
1287 return NewContextIds;
1288}
1289
1290template <typename DerivedCCG, typename FuncTy, typename CallTy>
1291void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1292 propagateDuplicateContextIds(
1293 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1294 // Build a set of duplicated context ids corresponding to the input id set.
1295 auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
1296 DenseSet<uint32_t> NewIds;
1297 for (auto Id : ContextIds)
1298 if (auto NewId = OldToNewContextIds.find(Id);
1299 NewId != OldToNewContextIds.end())
1300 NewIds.insert(NewId->second.begin(), NewId->second.end());
1301 return NewIds;
1302 };
1303
1304 // Recursively update context ids sets along caller edges.
1305 auto UpdateCallers = [&](ContextNode *Node,
1306 DenseSet<const ContextEdge *> &Visited,
1307 auto &&UpdateCallers) -> void {
1308 for (const auto &Edge : Node->CallerEdges) {
1309 auto Inserted = Visited.insert(Edge.get());
1310 if (!Inserted.second)
1311 continue;
1312 ContextNode *NextNode = Edge->Caller;
1313 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
1314 // Only need to recursively iterate to NextNode via this caller edge if
1315 // it resulted in any added ids to NextNode.
1316 if (!NewIdsToAdd.empty()) {
1317 Edge->getContextIds().insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
1318 UpdateCallers(NextNode, Visited, UpdateCallers);
1319 }
1320 }
1321 };
1322
1323 DenseSet<const ContextEdge *> Visited;
1324 for (auto &Entry : AllocationCallToContextNodeMap) {
1325 auto *Node = Entry.second;
1326 UpdateCallers(Node, Visited, UpdateCallers);
1327 }
1328}
1329
1330template <typename DerivedCCG, typename FuncTy, typename CallTy>
1331void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
1332 ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee,
1333 // This must be passed by value to make a copy since it will be adjusted
1334 // as ids are moved.
1335 DenseSet<uint32_t> RemainingContextIds) {
1336 auto &OrigEdges =
1337 TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
1338 // Increment iterator in loop so that we can remove edges as needed.
1339 for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
1340 auto Edge = *EI;
1341 // Remove any matching context ids from Edge and return the set that were
1342 // found and removed; these become the new edge's context ids. Also update
1343 // the remaining (not-found) ids.
1344 DenseSet<uint32_t> NewEdgeContextIds, NotFoundContextIds;
1345 set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
1346 NotFoundContextIds);
1347 RemainingContextIds.swap(NotFoundContextIds);
1348 // If no matching context ids for this edge, skip it.
1349 if (NewEdgeContextIds.empty()) {
1350 ++EI;
1351 continue;
1352 }
1353 if (TowardsCallee) {
1354 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1355 auto NewEdge = std::make_shared<ContextEdge>(
1356 Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds));
1357 NewNode->CalleeEdges.push_back(NewEdge);
1358 NewEdge->Callee->CallerEdges.push_back(NewEdge);
1359 } else {
1360 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1361 auto NewEdge = std::make_shared<ContextEdge>(
1362 NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds));
1363 NewNode->CallerEdges.push_back(NewEdge);
1364 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
1365 }
1366 // Remove old edge if context ids empty.
1367 if (Edge->getContextIds().empty()) {
1368 removeEdgeFromGraph(Edge.get(), &EI, TowardsCallee);
1369 continue;
1370 }
1371 ++EI;
1372 }
1373}
1374
1375template <typename DerivedCCG, typename FuncTy, typename CallTy>
1376static void checkEdge(
1377 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
1378 // Confirm that alloc type is not None and that we have at least one context
1379 // id.
1380 assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
1381 assert(!Edge->ContextIds.empty());
1382}
1383
1384template <typename DerivedCCG, typename FuncTy, typename CallTy>
1385static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
1386 bool CheckEdges = true) {
1387 if (Node->isRemoved())
1388 return;
1389#ifndef NDEBUG
1390 // Compute node's context ids once for use in asserts.
1391 auto NodeContextIds = Node->getContextIds();
1392#endif
1393 // Node's context ids should be the union of both its callee and caller edge
1394 // context ids.
1395 if (Node->CallerEdges.size()) {
1396 DenseSet<uint32_t> CallerEdgeContextIds(
1397 Node->CallerEdges.front()->ContextIds);
1398 for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) {
1399 if (CheckEdges)
1400 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1401 set_union(CallerEdgeContextIds, Edge->ContextIds);
1402 }
1403 // Node can have more context ids than callers if some contexts terminate at
1404 // node and some are longer. If we are allowing recursive callsites but
1405 // haven't disabled recursive contexts, this will be violated for
1406 // incompletely cloned recursive cycles, so skip the checking in that case.
1407 assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
1408 NodeContextIds == CallerEdgeContextIds ||
1409 set_is_subset(CallerEdgeContextIds, NodeContextIds));
1410 }
1411 if (Node->CalleeEdges.size()) {
1412 DenseSet<uint32_t> CalleeEdgeContextIds(
1413 Node->CalleeEdges.front()->ContextIds);
1414 for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) {
1415 if (CheckEdges)
1416 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1417 set_union(CalleeEdgeContextIds, Edge->getContextIds());
1418 }
1419 assert(NodeContextIds == CalleeEdgeContextIds);
1420 }
1421 // FIXME: Since this checking is only invoked under an option, we should
1422 // change the error checking from using assert to something that will trigger
1423 // an error on a release build.
1424#ifndef NDEBUG
1425 // Make sure we don't end up with duplicate edges between the same caller and
1426 // callee.
1427 DenseSet<const ContextNode<DerivedCCG, FuncTy, CallTy> *> NodeSet;
1428 for (const auto &E : Node->CalleeEdges)
1429 NodeSet.insert(E->Callee);
1430 assert(NodeSet.size() == Node->CalleeEdges.size());
1431#endif
1432}
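// For intuition (hypothetical ids): a node whose caller edges carry {1,2} and
// {3} must itself have context ids that are a superset of {1,2,3} (contexts
// may also terminate at the node), while its callee edges' ids must union to
// exactly the node's ids.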
1433
1434template <typename DerivedCCG, typename FuncTy, typename CallTy>
1435void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1436 assignStackNodesPostOrder(
1437 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
1438 DenseMap<uint64_t, std::vector<CallContextInfo>>
1439 &StackIdToMatchingCalls,
1440 DenseMap<CallInfo, CallInfo> &CallToMatchingCall) {
1441 auto Inserted = Visited.insert(Node);
1442 if (!Inserted.second)
1443 return;
1444 // Post order traversal. Iterate over a copy since we may add nodes and
1445 // therefore new callers during the recursive call, invalidating any
1446 // iterator over the original edge vector. We don't need to process these
1447 // new nodes as they were already processed on creation.
1448 auto CallerEdges = Node->CallerEdges;
1449 for (auto &Edge : CallerEdges) {
1450 // Skip any that have been removed during the recursion.
1451 if (Edge->isRemoved()) {
1452 assert(!is_contained(Node->CallerEdges, Edge));
1453 continue;
1454 }
1455 assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls,
1456 CallToMatchingCall);
1457 }
1458
1459 // If this node's stack id is in the map, update the graph to contain new
1460 // nodes representing any inlining at interior callsites. Note we move the
1461 // associated context ids over to the new nodes.
1462
1463 // Ignore this node if it is for an allocation or we didn't record any
1464 // stack id lists ending at it.
1465 if (Node->IsAllocation ||
1466 !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
1467 return;
1468
1469 auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
1470 // Handle the simple case first. A single call with a single stack id.
1471 // In this case there is no need to create any new context nodes, simply
1472 // assign the context node for stack id to this Call.
1473 if (Calls.size() == 1) {
1474 auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
1475 if (Ids.size() == 1) {
1476 assert(SavedContextIds.empty());
1477 // It should be this Node
1478 assert(Node == getNodeForStackId(Ids[0]));
1479 if (Node->Recursive)
1480 return;
1481 Node->setCall(Call);
1482 NonAllocationCallToContextNodeMap[Call] = Node;
1483 NodeToCallingFunc[Node] = Func;
1484 return;
1485 }
1486 }
1487
1488#ifndef NDEBUG
1489 // Find the node for the last stack id, which should be the same
1490 // across all calls recorded for this id, and is this node's id.
1491 uint64_t LastId = Node->OrigStackOrAllocId;
1492 ContextNode *LastNode = getNodeForStackId(LastId);
1493 // We should only have kept stack ids that had nodes.
1494 assert(LastNode);
1495 assert(LastNode == Node);
1496#else
1497 ContextNode *LastNode = Node;
1498#endif
1499
1500 // Compute the last node's context ids once, as it is shared by all calls in
1501 // this entry.
1502 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1503
1504 bool PrevIterCreatedNode = false;
1505 bool CreatedNode = false;
1506 for (unsigned I = 0; I < Calls.size();
1507 I++, PrevIterCreatedNode = CreatedNode) {
1508 CreatedNode = false;
1509 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1510 // Skip any for which we didn't assign any ids, these don't get a node in
1511 // the graph.
1512 if (SavedContextIds.empty()) {
1513 // If this call has a matching call (located in the same function and
1514 // having the same stack ids), simply add it to the context node created
1515 // for its matching call earlier. These can be treated the same through
1516 // cloning and get updated at the same time.
1517 if (!CallToMatchingCall.contains(Call))
1518 continue;
1519 auto MatchingCall = CallToMatchingCall[Call];
1520 if (!NonAllocationCallToContextNodeMap.contains(MatchingCall)) {
1521 // This should only happen if we had a prior iteration, and it didn't
1522 // create a node because of the below recomputation of context ids
1523 // finding none remaining and continuing early.
1524 assert(I > 0 && !PrevIterCreatedNode);
1525 continue;
1526 }
1527 NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back(
1528 Call);
1529 continue;
1530 }
1531
1532 assert(LastId == Ids.back());
1533
1534 // Recompute the context ids for this stack id sequence (the
1535 // intersection of the context ids of the corresponding nodes).
1536 // Start with the ids we saved in the map for this call, which could be
1537 // duplicated context ids. We have to recompute as we might have overlap
1538 // between the saved context ids for different last nodes, and
1539 // removed them already during the post order traversal.
1540 set_intersect(SavedContextIds, LastNodeContextIds);
1541 ContextNode *PrevNode = LastNode;
1542 bool Skip = false;
1543 // Iterate backwards through the stack Ids, starting after the last Id
1544 // in the list, which was handled once outside for all Calls.
1545 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1546 auto Id = *IdIter;
1547 ContextNode *CurNode = getNodeForStackId(Id);
1548 // We should only have kept stack ids that had nodes and weren't
1549 // recursive.
1550 assert(CurNode);
1551 assert(!CurNode->Recursive);
1552
1553 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1554 if (!Edge) {
1555 Skip = true;
1556 break;
1557 }
1558 PrevNode = CurNode;
1559
1560 // Update the context ids, which is the intersection of the ids along
1561 // all edges in the sequence.
1562 set_intersect(SavedContextIds, Edge->getContextIds());
1563
1564 // If we now have no context ids for the clone, skip this call.
1565 if (SavedContextIds.empty()) {
1566 Skip = true;
1567 break;
1568 }
1569 }
1570 if (Skip)
1571 continue;
1572
1573 // Create new context node.
1574 ContextNode *NewNode = createNewNode(/*IsAllocation=*/false, Func, Call);
1575 NonAllocationCallToContextNodeMap[Call] = NewNode;
1576 CreatedNode = true;
1577 NewNode->AllocTypes = computeAllocType(SavedContextIds);
1578
1579 ContextNode *FirstNode = getNodeForStackId(Ids[0]);
1580 assert(FirstNode);
1581
1582 // Connect to callees of innermost stack frame in inlined call chain.
1583 // This updates context ids for FirstNode's callees to reflect those
1584 // moved to NewNode.
1585 connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true, SavedContextIds);
1586
1587 // Connect to callers of outermost stack frame in inlined call chain.
1588 // This updates context ids for LastNode's callers to reflect those
1589 // moved to NewNode.
1590 connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false, SavedContextIds);
1591
1592 // Now we need to remove context ids from edges/nodes between First and
1593 // Last Node.
1594 PrevNode = nullptr;
1595 for (auto Id : Ids) {
1596 ContextNode *CurNode = getNodeForStackId(Id);
1597 // We should only have kept stack ids that had nodes.
1598 assert(CurNode);
1599
1600 // Remove the context ids moved to NewNode from CurNode, and the
1601 // edge from the prior node.
1602 if (PrevNode) {
1603 auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
1604 assert(PrevEdge);
1605 set_subtract(PrevEdge->getContextIds(), SavedContextIds);
1606 if (PrevEdge->getContextIds().empty())
1607 removeEdgeFromGraph(PrevEdge);
1608 }
1609 // Since we update the edges from leaf to tail, only look at the callee
1610 // edges. This isn't an alloc node, so if there are no callee edges, the
1611 // alloc type is None.
1612 CurNode->AllocTypes = CurNode->CalleeEdges.empty()
1613 ? (uint8_t)AllocationType::None
1614 : CurNode->computeAllocType();
1615 PrevNode = CurNode;
1616 }
1617 if (VerifyNodes) {
1618 checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true);
1619 for (auto Id : Ids) {
1620 ContextNode *CurNode = getNodeForStackId(Id);
1621 // We should only have kept stack ids that had nodes.
1622 assert(CurNode);
1623 checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true);
1624 }
1625 }
1626 }
1627}
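// Illustrative sketch of the non-trivial case above (hypothetical stack ids):
// a call whose callsite metadata holds ids [S1, S2] because of inlining gets a
// new node; the context ids computed for that sequence are moved onto new
// edges connecting the new node to S1's callees and to S2's callers, and are
// subtracted from the interior edge between the S1 and S2 nodes (which is
// removed if it becomes empty).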
1628
1629template <typename DerivedCCG, typename FuncTy, typename CallTy>
1630void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
1631 // Map of stack id to all calls with that as the last (outermost caller)
1632 // callsite id that has a context node (some might not due to pruning
1633 // performed during matching of the allocation profile contexts).
1634 // The CallContextInfo contains the Call and a list of its stack ids with
1635 // ContextNodes, the function containing Call, and the set of context ids
1636 // the analysis will eventually identify for use in any new node created
1637 // for that callsite.
1638 DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
1639 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
1640 for (auto &Call : CallsWithMetadata) {
1641 // Ignore allocations, already handled.
1642 if (AllocationCallToContextNodeMap.count(Call))
1643 continue;
1644 auto StackIdsWithContextNodes =
1645 getStackIdsWithContextNodesForCall(Call.call());
1646 // If there were no nodes created for MIBs on allocs (maybe this was in
1647 // the unambiguous part of the MIB stack that was pruned), ignore.
1648 if (StackIdsWithContextNodes.empty())
1649 continue;
1650 // Otherwise, record this Call along with the list of ids for the last
1651 // (outermost caller) stack id with a node.
1652 StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
1653 {Call.call(), StackIdsWithContextNodes, Func, {}});
1654 }
1655 }
1656
1657 // First make a pass through all stack ids that correspond to a call,
1658 // as identified in the above loop. Compute the context ids corresponding to
1659 // each of these calls when they correspond to multiple stack ids due to
1660 // inlining. Perform any duplication of context ids required when
1661 // there is more than one call with the same stack ids. Their (possibly newly
1662 // duplicated) context ids are saved in the StackIdToMatchingCalls map.
1663 DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
1664 // Save a map from each call to any that are found to match it. I.e. located
1665 // in the same function and have the same (possibly pruned) stack ids. We use
1666 // this to avoid creating extra graph nodes as they can be treated the same.
1667 DenseMap<CallInfo, CallInfo> CallToMatchingCall;
1668 for (auto &It : StackIdToMatchingCalls) {
1669 auto &Calls = It.getSecond();
1670 // Skip single calls with a single stack id. These don't need a new node.
1671 if (Calls.size() == 1) {
1672 auto &Ids = Calls[0].StackIds;
1673 if (Ids.size() == 1)
1674 continue;
1675 }
1676 // In order to do the best and maximal matching of inlined calls to context
1677 // node sequences we will sort the vectors of stack ids in descending order
1678 // of length, and within each length, lexicographically by stack id. The
1679 // latter is so that we can specially handle calls that have identical stack
1680 // id sequences (either due to cloning or artificially because of the MIB
1681 // context pruning). Those with the same Ids are then sorted by function to
1682 // facilitate efficiently mapping them to the same context node.
1683 // Because the functions are pointers, to ensure a stable sort first assign
1684 // each function pointer to its first index in the Calls array, and then use
1685 // that to sort by.
1686 DenseMap<const FuncTy *, unsigned> FuncToIndex;
1687 for (const auto &[Idx, CallCtxInfo] : enumerate(Calls))
1688 FuncToIndex.insert({CallCtxInfo.Func, Idx});
1689 std::stable_sort(
1690 Calls.begin(), Calls.end(),
1691 [&FuncToIndex](const CallContextInfo &A, const CallContextInfo &B) {
1692 return A.StackIds.size() > B.StackIds.size() ||
1693 (A.StackIds.size() == B.StackIds.size() &&
1694 (A.StackIds < B.StackIds ||
1695 (A.StackIds == B.StackIds &&
1696 FuncToIndex[A.Func] < FuncToIndex[B.Func])));
1697 });
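// Illustrative ordering (hypothetical stack ids): sequences {10,20,30},
// {10,25} and {10,20} sort as {10,20,30} (longest first), then {10,20} before
// {10,25} (same length, lexicographically smaller); calls with identical
// sequences end up adjacent, ordered by the first index at which their
// function appeared in Calls.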
1698
1699 // Find the node for the last stack id, which should be the same
1700 // across all calls recorded for this id, and is the id for this
1701 // entry in the StackIdToMatchingCalls map.
1702 uint64_t LastId = It.getFirst();
1703 ContextNode *LastNode = getNodeForStackId(LastId);
1704 // We should only have kept stack ids that had nodes.
1705 assert(LastNode);
1706
1707 if (LastNode->Recursive)
1708 continue;
1709
1710 // Initialize the context ids with the last node's. We will subsequently
1711 // refine the context ids by computing the intersection along all edges.
1712 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1713 assert(!LastNodeContextIds.empty());
1714
1715#ifndef NDEBUG
1716 // Save the set of functions seen for a particular set of the same stack
1717 // ids. This is used to ensure that they have been correctly sorted to be
1718 // adjacent in the Calls list, since we rely on that to efficiently place
1719 // all such matching calls onto the same context node.
1720 DenseSet<const FuncTy *> MatchingIdsFuncSet;
1721#endif
1722
1723 for (unsigned I = 0; I < Calls.size(); I++) {
1724 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1725 assert(SavedContextIds.empty());
1726 assert(LastId == Ids.back());
1727
1728#ifndef NDEBUG
1729 // If this call has a different set of ids than the last one, clear the
1730 // set used to ensure they are sorted properly.
1731 if (I > 0 && Ids != Calls[I - 1].StackIds)
1732 MatchingIdsFuncSet.clear();
1733#endif
1734
1735 // First compute the context ids for this stack id sequence (the
1736 // intersection of the context ids of the corresponding nodes).
1737 // Start with the remaining saved ids for the last node.
1738 assert(!LastNodeContextIds.empty());
1739 DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
1740
1741 ContextNode *PrevNode = LastNode;
1742 ContextNode *CurNode = LastNode;
1743 bool Skip = false;
1744
1745 // Iterate backwards through the stack Ids, starting after the last Id
1746 // in the list, which was handled once outside for all Calls.
1747 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1748 auto Id = *IdIter;
1749 CurNode = getNodeForStackId(Id);
1750 // We should only have kept stack ids that had nodes.
1751 assert(CurNode);
1752
1753 if (CurNode->Recursive) {
1754 Skip = true;
1755 break;
1756 }
1757
1758 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1759 // If there is no edge then the nodes belong to different MIB contexts,
1760 // and we should skip this inlined context sequence. For example, this
1761 // particular inlined context may include stack ids A->B, and we may
1762 // indeed have nodes for both A and B, but it is possible that they were
1763 // never profiled in sequence in a single MIB for any allocation (i.e.
1764 // we might have profiled an allocation that involves the callsite A,
1765 // but through a different one of its callee callsites, and we might
1766 // have profiled an allocation that involves callsite B, but reached
1767 // from a different caller callsite).
1768 if (!Edge) {
1769 Skip = true;
1770 break;
1771 }
1772 PrevNode = CurNode;
1773
1774 // Update the context ids, which is the intersection of the ids along
1775 // all edges in the sequence.
1776 set_intersect(StackSequenceContextIds, Edge->getContextIds());
1777
1778 // If we now have no context ids for the clone, skip this call.
1779 if (StackSequenceContextIds.empty()) {
1780 Skip = true;
1781 break;
1782 }
1783 }
1784 if (Skip)
1785 continue;
1786
1787 // If some of this call's stack ids did not have corresponding nodes (due
1788 // to pruning), don't include any context ids for contexts that extend
1789 // beyond these nodes. Otherwise we would be matching part of unrelated /
1790 // not fully matching stack contexts. To do this, subtract any context ids
1791 // found in caller nodes of the last node found above.
1792 if (Ids.back() != getLastStackId(Call)) {
1793 for (const auto &PE : LastNode->CallerEdges) {
1794 set_subtract(StackSequenceContextIds, PE->getContextIds());
1795 if (StackSequenceContextIds.empty())
1796 break;
1797 }
1798 // If we now have no context ids for the clone, skip this call.
1799 if (StackSequenceContextIds.empty())
1800 continue;
1801 }
1802
1803#ifndef NDEBUG
1804 // If the prior call had the same stack ids this set would not be empty.
1805 // Check if we already have a call that "matches" because it is located
1806 // in the same function. If the Calls list was sorted properly we should
1807 // not encounter this situation as all such entries should be adjacent
1808 // and processed in bulk further below.
1809 assert(!MatchingIdsFuncSet.contains(Func));
1810
1811 MatchingIdsFuncSet.insert(Func);
1812#endif
1813
1814 // Check if the next set of stack ids is the same (since the Calls vector
1815 // of tuples is sorted by the stack ids we can just look at the next one).
1816 // If so, save them in the CallToMatchingCall map so that they get
1817 // assigned to the same context node, and skip them.
1818 bool DuplicateContextIds = false;
1819 for (unsigned J = I + 1; J < Calls.size(); J++) {
1820 auto &CallCtxInfo = Calls[J];
1821 auto &NextIds = CallCtxInfo.StackIds;
1822 if (NextIds != Ids)
1823 break;
1824 auto *NextFunc = CallCtxInfo.Func;
1825 if (NextFunc != Func) {
1826 // We have another Call with the same ids but that cannot share this
1827 // node, must duplicate ids for it.
1828 DuplicateContextIds = true;
1829 break;
1830 }
1831 auto &NextCall = CallCtxInfo.Call;
1832 CallToMatchingCall[NextCall] = Call;
1833 // Update I so that it gets incremented correctly to skip this call.
1834 I = J;
1835 }
1836
1837 // If we don't have duplicate context ids, then we can assign all the
1838 // context ids computed for the original node sequence to this call.
1839 // If there are duplicate calls with the same stack ids then we synthesize
1840 // new context ids that are duplicates of the originals. These are
1841 // assigned to SavedContextIds, which is a reference into the map entry
1842 // for this call, allowing us to access these ids later on.
1843 OldToNewContextIds.reserve(OldToNewContextIds.size() +
1844 StackSequenceContextIds.size());
1845 SavedContextIds =
1846 DuplicateContextIds
1847 ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
1848 : StackSequenceContextIds;
1849 assert(!SavedContextIds.empty());
1850
1851 if (!DuplicateContextIds) {
1852 // Update saved last node's context ids to remove those that are
1853 // assigned to other calls, so that it is ready for the next call at
1854 // this stack id.
1855 set_subtract(LastNodeContextIds, StackSequenceContextIds);
1856 if (LastNodeContextIds.empty())
1857 break;
1858 }
1859 }
1860 }
1861
1862 // Propagate the duplicate context ids over the graph.
1863 propagateDuplicateContextIds(OldToNewContextIds);
1864
1865 if (VerifyCCG)
1866 check();
1867
1868 // Now perform a post-order traversal over the graph, starting with the
1869 // allocation nodes, essentially processing nodes from callers to callees.
1870 // For any that contains an id in the map, update the graph to contain new
1871 // nodes representing any inlining at interior callsites. Note we move the
1872 // associated context ids over to the new nodes.
1873 DenseSet<const ContextNode *> Visited;
1874 for (auto &Entry : AllocationCallToContextNodeMap)
1875 assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls,
1876 CallToMatchingCall);
1877 if (VerifyCCG)
1878 check();
1879}
1880
1881uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
1882 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
1883 Call->getMetadata(LLVMContext::MD_callsite));
1884 return CallsiteContext.back();
1885}
1886
1887uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
1888 assert(isa<CallsiteInfo *>(Call));
1889 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
1890 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
1891 // Need to convert index into stack id.
1892 return Index.getStackIdAtIndex(CallsiteContext.back());
1893}
1894
1895static const std::string MemProfCloneSuffix = ".memprof.";
1896
1897static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
1898 // We use CloneNo == 0 to refer to the original version, which doesn't get
1899 // renamed with a suffix.
1900 if (!CloneNo)
1901 return Base.str();
1902 return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
1903}
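// For example, getMemProfFuncName("foo", 0) yields "foo" and
// getMemProfFuncName("foo", 2) yields "foo.memprof.2".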
1904
1905static bool isMemProfClone(const Function &F) {
1906 return F.getName().contains(MemProfCloneSuffix);
1907}
1908
1909std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
1910 const Instruction *Call,
1911 unsigned CloneNo) const {
1912 return (Twine(Call->getFunction()->getName()) + " -> " +
1913 cast<CallBase>(Call)->getCalledFunction()->getName())
1914 .str();
1915}
1916
1917std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
1918 const IndexCall &Call,
1919 unsigned CloneNo) const {
1920 auto VI = FSToVIMap.find(Func);
1921 assert(VI != FSToVIMap.end());
1922 if (isa<AllocInfo *>(Call))
1923 return (VI->second.name() + " -> alloc").str();
1924 else {
1925 auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call);
1926 return (VI->second.name() + " -> " +
1927 getMemProfFuncName(Callsite->Callee.name(),
1928 Callsite->Clones[CloneNo]))
1929 .str();
1930 }
1931}
1932
1933std::vector<uint64_t>
1934ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
1935 Instruction *Call) {
1936 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
1937 Call->getMetadata(LLVMContext::MD_callsite));
1938 return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
1939 CallsiteContext);
1940}
1941
1942std::vector<uint64_t>
1943IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
1944 assert(isa<CallsiteInfo *>(Call));
1945 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
1946 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
1947 return getStackIdsWithContextNodes<CallsiteInfo,
1948 SmallVector<unsigned>::const_iterator>(
1949 CallsiteContext);
1950}
1951
1952template <typename DerivedCCG, typename FuncTy, typename CallTy>
1953template <class NodeT, class IteratorT>
1954std::vector<uint64_t>
1955CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
1956 CallStack<NodeT, IteratorT> &CallsiteContext) {
1957 std::vector<uint64_t> StackIds;
1958 for (auto IdOrIndex : CallsiteContext) {
1959 auto StackId = getStackId(IdOrIndex);
1960 ContextNode *Node = getNodeForStackId(StackId);
1961 if (!Node)
1962 break;
1963 StackIds.push_back(StackId);
1964 }
1965 return StackIds;
1966}
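// For example (hypothetical ids), a callsite context [S1, S2, S3] where only
// S1 and S2 have context nodes returns [S1, S2]: the walk stops at the first
// id without a node and returns only that prefix.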
1967
1968ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
1969 Module &M,
1970 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
1971 : Mod(M), OREGetter(OREGetter) {
1972 for (auto &F : M) {
1973 std::vector<CallInfo> CallsWithMetadata;
1974 for (auto &BB : F) {
1975 for (auto &I : BB) {
1976 if (!isa<CallBase>(I))
1977 continue;
1978 if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
1979 CallsWithMetadata.push_back(&I);
1980 auto *AllocNode = addAllocNode(&I, &F);
1981 auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
1982 assert(CallsiteMD);
1983 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
1984 // Add all of the MIBs and their stack nodes.
1985 for (auto &MDOp : MemProfMD->operands()) {
1986 auto *MIBMD = cast<const MDNode>(MDOp);
1987 std::vector<ContextTotalSize> ContextSizeInfo;
1988 // Collect the context size information if it exists.
1989 if (MIBMD->getNumOperands() > 2) {
1990 for (unsigned I = 2; I < MIBMD->getNumOperands(); I++) {
1991 MDNode *ContextSizePair =
1992 dyn_cast<MDNode>(MIBMD->getOperand(I));
1993 assert(ContextSizePair->getNumOperands() == 2);
1994 uint64_t FullStackId = mdconst::dyn_extract<ConstantInt>(
1995 ContextSizePair->getOperand(0))
1996 ->getZExtValue();
1997 uint64_t TotalSize = mdconst::dyn_extract<ConstantInt>(
1998 ContextSizePair->getOperand(1))
1999 ->getZExtValue();
2000 ContextSizeInfo.push_back({FullStackId, TotalSize});
2001 }
2002 }
2003 MDNode *StackNode = getMIBStackNode(MIBMD);
2004 assert(StackNode);
2005 CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
2006 addStackNodesForMIB<MDNode, MDNode::op_iterator>(
2007 AllocNode, StackContext, CallsiteContext,
2008 getMIBAllocType(MIBMD), ContextSizeInfo);
2009 }
2010 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2011 // Memprof and callsite metadata on memory allocations no longer
2012 // needed.
2013 I.setMetadata(LLVMContext::MD_memprof, nullptr);
2014 I.setMetadata(LLVMContext::MD_callsite, nullptr);
2015 }
2016 // For callsite metadata, add to list for this function for later use.
2017 else if (I.getMetadata(LLVMContext::MD_callsite)) {
2018 CallsWithMetadata.push_back(&I);
2019 }
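// A simplified sketch of the metadata shape consumed above (hypothetical
// values, not from this file): a call annotated with !memprof !0 and
// !callsite !3, where !0 = !{!1}, !1 = !{!2, !"cold"}, !2 = !{i64 123, i64 456}
// and !3 = !{i64 123}; each MIB operand pairs a stack-id node with an
// allocation type string, and !callsite lists the call's own stack ids.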
2020 }
2021 }
2022 if (!CallsWithMetadata.empty())
2023 FuncToCallsWithMetadata[&F] = CallsWithMetadata;
2024 }
2025
2026 if (DumpCCG) {
2027 dbgs() << "CCG before updating call stack chains:\n";
2028 dbgs() << *this;
2029 }
2030
2031 if (ExportToDot)
2032 exportToDot("prestackupdate");
2033
2034 updateStackNodes();
2035
2036 handleCallsitesWithMultipleTargets();
2037
2038 // Strip off remaining callsite metadata, no longer needed.
2039 for (auto &FuncEntry : FuncToCallsWithMetadata)
2040 for (auto &Call : FuncEntry.second)
2041 Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
2042}
2043
2044IndexCallsiteContextGraph::IndexCallsiteContextGraph(
2045 ModuleSummaryIndex &Index,
2046 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
2047 isPrevailing)
2048 : Index(Index), isPrevailing(isPrevailing) {
2049 for (auto &I : Index) {
2050 auto VI = Index.getValueInfo(I);
2051 for (auto &S : VI.getSummaryList()) {
2052 // We should only add the prevailing nodes. Otherwise we may try to clone
2053 // in a weak copy that won't be linked (and may be different than the
2054 // prevailing version).
2055 // We only keep the memprof summary on the prevailing copy now when
2056 // building the combined index, as a space optimization, however don't
2057 // rely on this optimization. The linker doesn't resolve local linkage
2058 // values so don't check whether those are prevailing.
2059 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2060 !isPrevailing(VI.getGUID(), S.get()))
2061 continue;
2062 auto *FS = dyn_cast<FunctionSummary>(S.get());
2063 if (!FS)
2064 continue;
2065 std::vector<CallInfo> CallsWithMetadata;
2066 if (!FS->allocs().empty()) {
2067 for (auto &AN : FS->mutableAllocs()) {
2068 // This can happen because of recursion elimination handling that
2069 // currently exists in ModuleSummaryAnalysis. Skip these for now.
2070 // We still added them to the summary because we need to be able to
2071 // correlate properly in applyImport in the backends.
2072 if (AN.MIBs.empty())
2073 continue;
2074 IndexCall AllocCall(&AN);
2075 CallsWithMetadata.push_back(AllocCall);
2076 auto *AllocNode = addAllocNode(AllocCall, FS);
2077 // Pass an empty CallStack to the CallsiteContext (second)
2078 // parameter, since for ThinLTO we already collapsed out the inlined
2079 // stack ids on the allocation call during ModuleSummaryAnalysis.
2080 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
2081 EmptyContext;
2082 unsigned I = 0;
2083 assert(
2084 AN.ContextSizeInfos.empty() ||
2085 AN.ContextSizeInfos.size() == AN.MIBs.size());
2086 // Now add all of the MIBs and their stack nodes.
2087 for (auto &MIB : AN.MIBs) {
2088 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
2089 StackContext(&MIB);
2090 std::vector<ContextTotalSize> ContextSizeInfo;
2091 if (!AN.ContextSizeInfos.empty()) {
2092 for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I])
2093 ContextSizeInfo.push_back({FullStackId, TotalSize});
2094 }
2095 addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
2096 AllocNode, StackContext, EmptyContext, MIB.AllocType,
2097 ContextSizeInfo);
2098 I++;
2099 }
2100 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2101 // Initialize version 0 on the summary alloc node to the current alloc
2102 // type, unless it has both types, in which case make it the default, so
2103 // that if we aren't able to clone, the original version always ends up
2104 // with the default allocation behavior.
2105 AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
2106 }
2107 }
2108 // For callsite metadata, add to list for this function for later use.
2109 if (!FS->callsites().empty())
2110 for (auto &SN : FS->mutableCallsites()) {
2111 IndexCall StackNodeCall(&SN);
2112 CallsWithMetadata.push_back(StackNodeCall);
2113 }
2114
2115 if (!CallsWithMetadata.empty())
2116 FuncToCallsWithMetadata[FS] = CallsWithMetadata;
2117
2118 if (!FS->allocs().empty() || !FS->callsites().empty())
2119 FSToVIMap[FS] = VI;
2120 }
2121 }
2122
2123 if (DumpCCG) {
2124 dbgs() << "CCG before updating call stack chains:\n";
2125 dbgs() << *this;
2126 }
2127
2128 if (ExportToDot)
2129 exportToDot("prestackupdate");
2130
2131 updateStackNodes();
2132
2133 handleCallsitesWithMultipleTargets();
2134}
2135
2136template <typename DerivedCCG, typename FuncTy, typename CallTy>
2137void CallsiteContextGraph<DerivedCCG, FuncTy,
2138 CallTy>::handleCallsitesWithMultipleTargets() {
2139 // Look for and workaround callsites that call multiple functions.
2140 // This can happen for indirect calls, which need better handling, and in
2141 // more rare cases (e.g. macro expansion).
2142 // TODO: To fix this for indirect calls we will want to perform speculative
2143 // devirtualization using either the normal PGO info with ICP, or using the
2144 // information in the profiled MemProf contexts. We can do this prior to
2145 // this transformation for regular LTO, and for ThinLTO we can simulate that
2146 // effect in the summary and perform the actual speculative devirtualization
2147 // while cloning in the ThinLTO backend.
2148
2149 // Keep track of the new nodes synthesized for discovered tail calls missing
2150 // from the profiled contexts.
2151 MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;
2152
2153 std::vector<std::pair<CallInfo, ContextNode *>> NewCallToNode;
2154 for (auto &Entry : NonAllocationCallToContextNodeMap) {
2155 auto *Node = Entry.second;
2156 assert(Node->Clones.empty());
2157 // Check all node callees and see if they are in the same function.
2158 // We need to check all of the calls recorded in this Node, because in some
2159 // cases we may have had multiple calls with the same debug info calling
2160 // different callees. This can happen, for example, when an object is
2161 // constructed in the parameter list - the destructor call of the object has
2162 // the same debug info (line/col) as the call the object was passed to.
2163 // Here we will prune any that don't match all callee nodes.
2164 std::vector<CallInfo> AllCalls;
2165 AllCalls.reserve(Node->MatchingCalls.size() + 1);
2166 AllCalls.push_back(Node->Call);
2167 AllCalls.insert(AllCalls.end(), Node->MatchingCalls.begin(),
2168 Node->MatchingCalls.end());
2169
2170 // First see if we can partition the calls by callee function, creating new
2171 // nodes to host each set of calls calling the same callees. This is
2172 // necessary for supporting indirect calls with ThinLTO, for which we
2173 // synthesized CallsiteInfo records for each target. They will all have the
2174 // same callsite stack ids and would be sharing a context node at this
2175 // point. We need to perform separate cloning for each, which will be
2176 // applied along with speculative devirtualization in the ThinLTO backends
2177 // as needed. Note this does not currently support looking through tail
2178 // calls, it is unclear if we need that for indirect call targets.
2179 // First partition calls by callee func. Map indexed by func, value is
2180 // struct with list of matching calls, assigned node.
2181 if (partitionCallsByCallee(Node, AllCalls, NewCallToNode))
2182 continue;
2183
2184 auto It = AllCalls.begin();
2185 // Iterate through the calls until we find the first that matches.
2186 for (; It != AllCalls.end(); ++It) {
2187 auto ThisCall = *It;
2188 bool Match = true;
2189 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();
2190 ++EI) {
2191 auto Edge = *EI;
2192 if (!Edge->Callee->hasCall())
2193 continue;
2194 assert(NodeToCallingFunc.count(Edge->Callee));
2195 // Check if the called function matches that of the callee node.
2196 if (!calleesMatch(ThisCall.call(), EI, TailCallToContextNodeMap)) {
2197 Match = false;
2198 break;
2199 }
2200 }
2201 // Found a call that matches the callee nodes, we can quit now.
2202 if (Match) {
2203 // If the first match is not the primary call on the Node, update it
2204 // now. We will update the list of matching calls further below.
2205 if (Node->Call != ThisCall) {
2206 Node->setCall(ThisCall);
2207 // We need to update the NonAllocationCallToContextNodeMap, but don't
2208 // want to do this during iteration over that map, so save the calls
2209 // that need updated entries.
2210 NewCallToNode.push_back({ThisCall, Node});
2211 }
2212 break;
2213 }
2214 }
2215 // We will update this list below (or leave it cleared if there was no
2216 // match found above).
2217 Node->MatchingCalls.clear();
2218 // If we hit the end of the AllCalls vector, no call matching the callee
2219 // nodes was found, clear the call information in the node.
2220 if (It == AllCalls.end()) {
2221 RemovedEdgesWithMismatchedCallees++;
2222 // Work around by setting Node to have a null call, so it gets
2223 // skipped during cloning. Otherwise assignFunctions will assert
2224 // because its data structures are not designed to handle this case.
2225 Node->setCall(CallInfo());
2226 continue;
2227 }
2228 // Now add back any matching calls that call the same function as the
2229 // matching primary call on Node.
2230 for (++It; It != AllCalls.end(); ++It) {
2231 auto ThisCall = *It;
2232 if (!sameCallee(Node->Call.call(), ThisCall.call()))
2233 continue;
2234 Node->MatchingCalls.push_back(ThisCall);
2235 }
2236 }
2237
2238 // Remove all mismatched nodes identified in the above loop from the node map
2239 // (checking whether they have a null call which is set above). For a
2240 // MapVector like NonAllocationCallToContextNodeMap it is much more efficient
2241 // to do the removal via remove_if than by individually erasing entries above.
2242 // Also remove any entries if we updated the node's primary call above.
2243 NonAllocationCallToContextNodeMap.remove_if([](const auto &it) {
2244 return !it.second->hasCall() || it.second->Call != it.first;
2245 });
2246
2247 // Add entries for any new primary calls recorded above.
2248 for (auto &[Call, Node] : NewCallToNode)
2249 NonAllocationCallToContextNodeMap[Call] = Node;
2250
2251 // Add the new nodes after the above loop so that the iteration is not
2252 // invalidated.
2253 for (auto &[Call, Node] : TailCallToContextNodeMap)
2254 NonAllocationCallToContextNodeMap[Call] = Node;
2255}
2256
2257template <typename DerivedCCG, typename FuncTy, typename CallTy>
2258bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::partitionCallsByCallee(
2259 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
2260 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode) {
2261 // Struct to keep track of all the calls having the same callee function,
2262 // and the node we eventually assign to them. Eventually we will record the
2263 // context node assigned to this group of calls.
2264 struct CallsWithSameCallee {
2265 std::vector<CallInfo> Calls;
2266 ContextNode *Node = nullptr;
2267 };
2268
2269 // First partition calls by callee function. Build map from each function
2270 // to the list of matching calls.
2271 DenseMap<const FuncTy *, CallsWithSameCallee> CalleeFuncToCallInfo;
2272 for (auto ThisCall : AllCalls) {
2273 auto *F = getCalleeFunc(ThisCall.call());
2274 if (F)
2275 CalleeFuncToCallInfo[F].Calls.push_back(ThisCall);
2276 }
2277
2278 // Next, walk through all callee edges. For each callee node, get its
2279 // containing function and see if it was recorded in the above map (meaning we
2280 // have at least one matching call). Build another map from each callee node
2281 // with a matching call to the structure instance created above containing all
2282 // the calls.
2283 DenseMap<const ContextNode *, CallsWithSameCallee *> CalleeNodeToCallInfo;
2284 for (const auto &Edge : Node->CalleeEdges) {
2285 if (!Edge->Callee->hasCall())
2286 continue;
2287 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2288 if (CalleeFuncToCallInfo.contains(ProfiledCalleeFunc))
2289 CalleeNodeToCallInfo[Edge->Callee] =
2290 &CalleeFuncToCallInfo[ProfiledCalleeFunc];
2291 }
2292
2293 // If there are no entries in the second map, then there were no matching
2294 // calls/callees; nothing to do here. Return so we can go to the handling that
2295 // looks through tail calls.
2296 if (CalleeNodeToCallInfo.empty())
2297 return false;
2298
2299 // Walk through all callee edges again. Any and all callee edges that didn't
2300 // match any calls (callee not in the CalleeNodeToCallInfo map) are moved to a
2301 // new caller node (UnmatchedCalleesNode) which gets a null call so that it is
2302 // ignored during cloning. If it is in the map, then we use the node recorded
2303 // in that entry (creating it if needed), and move the callee edge to it.
2304 // The first callee will use the original node instead of creating a new one.
2305 // Note that any of the original calls on this node (in AllCalls) that didn't
2306 // have a callee function automatically get dropped from the node as part of
2307 // this process.
2308 ContextNode *UnmatchedCalleesNode = nullptr;
2309 // Track whether we already assigned original node to a callee.
2310 bool UsedOrigNode = false;
2311 assert(NodeToCallingFunc[Node]);
2312 // Iterate over a copy of Node's callee edges, since we may need to remove
2313 // edges in moveCalleeEdgeToNewCaller, and this simplifies the handling and
2314 // makes it less error-prone.
2315 auto CalleeEdges = Node->CalleeEdges;
2316 for (auto &Edge : CalleeEdges) {
2317 if (!Edge->Callee->hasCall())
2318 continue;
2319
2320 // Will be updated below to point to whatever (caller) node this callee edge
2321 // should be moved to.
2322 ContextNode *CallerNodeToUse = nullptr;
2323
2324 // Handle the case where there were no matching calls first. Move this
2325 // callee edge to the UnmatchedCalleesNode, creating it if needed.
2326 if (!CalleeNodeToCallInfo.contains(Edge->Callee)) {
2327 if (!UnmatchedCalleesNode)
2328 UnmatchedCalleesNode =
2329 createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2330 CallerNodeToUse = UnmatchedCalleesNode;
2331 } else {
2332 // Look up the information recorded for this callee node, and use the
2333 // recorded caller node (creating it if needed).
2334 auto *Info = CalleeNodeToCallInfo[Edge->Callee];
2335 if (!Info->Node) {
2336 // If we haven't assigned any callees to the original node use it.
2337 if (!UsedOrigNode) {
2338 Info->Node = Node;
2339 // Clear the set of matching calls which will be updated below.
2340 Node->MatchingCalls.clear();
2341 UsedOrigNode = true;
2342 } else
2343 Info->Node =
2344 createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2345 assert(!Info->Calls.empty());
2346 // The first call becomes the primary call for this caller node, and the
2347 // rest go in the matching calls list.
2348 Info->Node->setCall(Info->Calls.front());
2349 Info->Node->MatchingCalls.insert(Info->Node->MatchingCalls.end(),
2350 Info->Calls.begin() + 1,
2351 Info->Calls.end());
2352 // Save the primary call to node correspondence so that we can update
2353 // the NonAllocationCallToContextNodeMap, which is being iterated in the
2354 // caller of this function.
2355 NewCallToNode.push_back({Info->Node->Call, Info->Node});
2356 }
2357 CallerNodeToUse = Info->Node;
2358 }
2359
2360 // Don't need to move the edge if we are using the original node.
2361 if (CallerNodeToUse == Node)
2362 continue;
2363
2364 moveCalleeEdgeToNewCaller(Edge, CallerNodeToUse);
2365 }
2366 // Now that we are done moving edges, clean up any caller edges that ended
2367 // up with no type or context ids. During moveCalleeEdgeToNewCaller all
2368 // caller edges from Node are replicated onto the new callers, and it
2369 // simplifies the handling to leave them until we have moved all
2370 // edges/context ids.
2371 for (auto &I : CalleeNodeToCallInfo)
2372 removeNoneTypeCallerEdges(I.second->Node);
2373 if (UnmatchedCalleesNode)
2374 removeNoneTypeCallerEdges(UnmatchedCalleesNode);
2375 removeNoneTypeCallerEdges(Node);
2376
2377 return true;
2378}
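// Illustrative example (hypothetical functions): if the node's calls resolve
// to two different targets f and g, the calls (and callee edges) for the
// first target stay on the original node while those for the second are moved
// to a newly created node; callee edges whose callee matched no call land on
// UnmatchedCalleesNode, which gets a null call and is ignored by cloning.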
2379
2380uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2381 // In the Module (IR) case this is already the Id.
2382 return IdOrIndex;
2383}
2384
2385uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2386 // In the Index case this is an index into the stack id list in the summary
2387 // index, convert it to an Id.
2388 return Index.getStackIdAtIndex(IdOrIndex);
2389}
2390
2391template <typename DerivedCCG, typename FuncTy, typename CallTy>
2392bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
2393 CallTy Call, EdgeIter &EI,
2394 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {
2395 auto Edge = *EI;
2396 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2397 const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];
2398 // Will be populated in order of callee to caller if we find a chain of tail
2399 // calls between the profiled caller and callee.
2400 std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;
2401 if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc,
2402 FoundCalleeChain))
2403 return false;
2404
2405 // The usual case where the profiled callee matches that of the IR/summary.
2406 if (FoundCalleeChain.empty())
2407 return true;
2408
2409 auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {
2410 auto *CurEdge = Callee->findEdgeFromCaller(Caller);
2411 // If there is already an edge between these nodes, simply update it and
2412 // return.
2413 if (CurEdge) {
2414 CurEdge->ContextIds.insert(Edge->ContextIds.begin(),
2415 Edge->ContextIds.end());
2416 CurEdge->AllocTypes |= Edge->AllocTypes;
2417 return;
2418 }
2419 // Otherwise, create a new edge and insert it into the caller and callee
2420 // lists.
2421 auto NewEdge = std::make_shared<ContextEdge>(
2422 Callee, Caller, Edge->AllocTypes, Edge->ContextIds);
2423 Callee->CallerEdges.push_back(NewEdge);
2424 if (Caller == Edge->Caller) {
2425 // If we are inserting the new edge into the current edge's caller, insert
2426 // the new edge before the current iterator position, and then increment
2427 // back to the current edge.
2428 EI = Caller->CalleeEdges.insert(EI, NewEdge);
2429 ++EI;
2430 assert(*EI == Edge &&
2431 "Iterator position not restored after insert and increment");
2432 } else
2433 Caller->CalleeEdges.push_back(NewEdge);
2434 };
2435
2436 // Create new nodes for each found callee and connect in between the profiled
2437 // caller and callee.
2438 auto *CurCalleeNode = Edge->Callee;
2439 for (auto &[NewCall, Func] : FoundCalleeChain) {
2440 ContextNode *NewNode = nullptr;
2441 // First check if we have already synthesized a node for this tail call.
2442 if (TailCallToContextNodeMap.count(NewCall)) {
2443 NewNode = TailCallToContextNodeMap[NewCall];
2444 NewNode->AllocTypes |= Edge->AllocTypes;
2445 } else {
2446 FuncToCallsWithMetadata[Func].push_back({NewCall});
2447 // Create Node and record node info.
2448 NewNode = createNewNode(/*IsAllocation=*/false, Func, NewCall);
2449 TailCallToContextNodeMap[NewCall] = NewNode;
2450 NewNode->AllocTypes = Edge->AllocTypes;
2451 }
2452
2453 // Hook up node to its callee node
2454 AddEdge(NewNode, CurCalleeNode);
2455
2456 CurCalleeNode = NewNode;
2457 }
2458
2459 // Hook up edge's original caller to new callee node.
2460 AddEdge(Edge->Caller, CurCalleeNode);
2461
2462#ifndef NDEBUG
2463 // Save this because Edge's fields get cleared below when removed.
2464 auto *Caller = Edge->Caller;
2465#endif
2466
2467 // Remove old edge
2468 removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
2469
2470 // To simplify the increment of EI in the caller, subtract one from EI.
2471 // In the final AddEdge call we would have either added a new callee edge
2472 // to Edge->Caller, or found an existing one. Either way we are guaranteed
2473 // that there is at least one callee edge.
2474 assert(!Caller->CalleeEdges.empty());
2475 --EI;
2476
2477 return true;
2478}
2479
2480bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2481 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
2482 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
2483 bool &FoundMultipleCalleeChains) {
2484 // Stop recursive search if we have already explored the maximum specified
2485 // depth.
2486 if (Depth > TailCallSearchDepth)
2487 return false;
2488
2489 auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {
2490 FoundCalleeChain.push_back({Callsite, F});
2491 };
2492
2493 auto *CalleeFunc = dyn_cast<Function>(CurCallee);
2494 if (!CalleeFunc) {
2495 auto *Alias = dyn_cast<GlobalAlias>(CurCallee);
2496 assert(Alias);
2497 CalleeFunc = dyn_cast<Function>(Alias->getAliasee());
2498 assert(CalleeFunc);
2499 }
2500
2501 // Look for tail calls in this function, and check if they either call the
2502 // profiled callee directly, or indirectly (via a recursive search).
2503 // Only succeed if there is a single unique tail call chain found between the
2504 // profiled caller and callee, otherwise we could perform incorrect cloning.
2505 bool FoundSingleCalleeChain = false;
2506 for (auto &BB : *CalleeFunc) {
2507 for (auto &I : BB) {
2508 auto *CB = dyn_cast<CallBase>(&I);
2509 if (!CB || !CB->isTailCall())
2510 continue;
2511 auto *CalledValue = CB->getCalledOperand();
2512 auto *CalledFunction = CB->getCalledFunction();
2513 if (CalledValue && !CalledFunction) {
2514 CalledValue = CalledValue->stripPointerCasts();
2515 // Stripping pointer casts can reveal a called function.
2516 CalledFunction = dyn_cast<Function>(CalledValue);
2517 }
2518 // Check if this is an alias to a function. If so, get the
2519 // called aliasee for the checks below.
2520 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
2521 assert(!CalledFunction &&
2522 "Expected null called function in callsite for alias");
2523 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
2524 }
2525 if (!CalledFunction)
2526 continue;
2527 if (CalledFunction == ProfiledCallee) {
2528 if (FoundSingleCalleeChain) {
2529 FoundMultipleCalleeChains = true;
2530 return false;
2531 }
2532 FoundSingleCalleeChain = true;
2533 FoundProfiledCalleeCount++;
2534 FoundProfiledCalleeDepth += Depth;
2535 if (Depth > FoundProfiledCalleeMaxDepth)
2536 FoundProfiledCalleeMaxDepth = Depth;
2537 SaveCallsiteInfo(&I, CalleeFunc);
2538 } else if (findProfiledCalleeThroughTailCalls(
2539 ProfiledCallee, CalledFunction, Depth + 1,
2540 FoundCalleeChain, FoundMultipleCalleeChains)) {
2541 // findProfiledCalleeThroughTailCalls should not have returned
2542 // true if FoundMultipleCalleeChains.
2543 assert(!FoundMultipleCalleeChains);
2544 if (FoundSingleCalleeChain) {
2545 FoundMultipleCalleeChains = true;
2546 return false;
2547 }
2548 FoundSingleCalleeChain = true;
2549 SaveCallsiteInfo(&I, CalleeFunc);
2550 } else if (FoundMultipleCalleeChains)
2551 return false;
2552 }
2553 }
2554
2555 return FoundSingleCalleeChain;
2556}
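// Illustrative example (hypothetical functions): if the profile recorded A
// calling directly into C, but in the IR A calls B and B tail-calls C, the
// search starting from B discovers the single chain B -> C and records B's
// tail call so calleesMatch can splice a synthesized node for it between A
// and C; if two distinct chains reach C the search fails instead.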
2557
2558const Function *ModuleCallsiteContextGraph::getCalleeFunc(Instruction *Call) {
2559 auto *CB = dyn_cast<CallBase>(Call);
2560 if (!CB->getCalledOperand() || CB->isIndirectCall())
2561 return nullptr;
2562 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2563 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2564 if (Alias)
2565 return dyn_cast<Function>(Alias->getAliasee());
2566 return dyn_cast<Function>(CalleeVal);
2567}
2568
2569bool ModuleCallsiteContextGraph::calleeMatchesFunc(
2570 Instruction *Call, const Function *Func, const Function *CallerFunc,
2571 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
2572 auto *CB = dyn_cast<CallBase>(Call);
2573 if (!CB->getCalledOperand() || CB->isIndirectCall())
2574 return false;
2575 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2576 auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
2577 if (CalleeFunc == Func)
2578 return true;
2579 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2580 if (Alias && Alias->getAliasee() == Func)
2581 return true;
2582
2583 // Recursively search for the profiled callee through tail calls starting with
2584 // the actual Callee. The discovered tail call chain is saved in
2585 // FoundCalleeChain, and we will fixup the graph to include these callsites
2586 // after returning.
2587 // FIXME: We will currently redo the same recursive walk if we find the same
2588 // mismatched callee from another callsite. We can improve this with more
2589 // bookkeeping of the created chain of new nodes for each mismatch.
2590 unsigned Depth = 1;
2591 bool FoundMultipleCalleeChains = false;
2592 if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,
2593 FoundCalleeChain,
2594 FoundMultipleCalleeChains)) {
2595 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "
2596 << Func->getName() << " from " << CallerFunc->getName()
2597 << " that actually called " << CalleeVal->getName()
2598 << (FoundMultipleCalleeChains
2599 ? " (found multiple possible chains)"
2600 : "")
2601 << "\n");
2602 if (FoundMultipleCalleeChains)
2603 FoundProfiledCalleeNonUniquelyCount++;
2604 return false;
2605 }
2606
2607 return true;
2608}
2609
2610bool ModuleCallsiteContextGraph::sameCallee(Instruction *Call1,
2611 Instruction *Call2) {
2612 auto *CB1 = cast<CallBase>(Call1);
2613 if (!CB1->getCalledOperand() || CB1->isIndirectCall())
2614 return false;
2615 auto *CalleeVal1 = CB1->getCalledOperand()->stripPointerCasts();
2616 auto *CalleeFunc1 = dyn_cast<Function>(CalleeVal1);
2617 auto *CB2 = cast<CallBase>(Call2);
2618 if (!CB2->getCalledOperand() || CB2->isIndirectCall())
2619 return false;
2620 auto *CalleeVal2 = CB2->getCalledOperand()->stripPointerCasts();
2621 auto *CalleeFunc2 = dyn_cast<Function>(CalleeVal2);
2622 return CalleeFunc1 == CalleeFunc2;
2623}
2624
2625bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2626 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
2627 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
2628 bool &FoundMultipleCalleeChains) {
2629 // Stop recursive search if we have already explored the maximum specified
2630 // depth.
2631 if (Depth > TailCallSearchDepth)
2632 return false;
2633
2634 auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {
2635 // Make a CallsiteInfo for each discovered callee, if one hasn't already
2636 // been synthesized.
2637 if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||
2638 !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))
2639 // StackIds is empty (we don't have debug info available in the index for
2640 // these callsites)
2641 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
2642 std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());
2643 CallsiteInfo *NewCallsiteInfo =
2644 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
2645 FoundCalleeChain.push_back({NewCallsiteInfo, FS});
2646 };
2647
2648 // Look for tail calls in this function, and check if they either call the
2649 // profiled callee directly, or indirectly (via a recursive search).
2650 // Only succeed if there is a single unique tail call chain found between the
2651 // profiled caller and callee, otherwise we could perform incorrect cloning.
2652 bool FoundSingleCalleeChain = false;
2653 for (auto &S : CurCallee.getSummaryList()) {
2654 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2655 !isPrevailing(CurCallee.getGUID(), S.get()))
2656 continue;
2657 auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
2658 if (!FS)
2659 continue;
2660 auto FSVI = CurCallee;
2661 auto *AS = dyn_cast<AliasSummary>(S.get());
2662 if (AS)
2663 FSVI = AS->getAliaseeVI();
2664 for (auto &CallEdge : FS->calls()) {
2665 if (!CallEdge.second.hasTailCall())
2666 continue;
2667 if (CallEdge.first == ProfiledCallee) {
2668 if (FoundSingleCalleeChain) {
2669 FoundMultipleCalleeChains = true;
2670 return false;
2671 }
2672 FoundSingleCalleeChain = true;
2673 FoundProfiledCalleeCount++;
2674 FoundProfiledCalleeDepth += Depth;
2675 if (Depth > FoundProfiledCalleeMaxDepth)
2676 FoundProfiledCalleeMaxDepth = Depth;
2677 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2678 // Add FS to FSToVIMap in case it isn't already there.
2679 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2680 FSToVIMap[FS] = FSVI;
2681 } else if (findProfiledCalleeThroughTailCalls(
2682 ProfiledCallee, CallEdge.first, Depth + 1,
2683 FoundCalleeChain, FoundMultipleCalleeChains)) {
2684 // findProfiledCalleeThroughTailCalls should not have returned
2685 // true if FoundMultipleCalleeChains.
2686 assert(!FoundMultipleCalleeChains);
2687 if (FoundSingleCalleeChain) {
2688 FoundMultipleCalleeChains = true;
2689 return false;
2690 }
2691 FoundSingleCalleeChain = true;
2692 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2693 // Add FS to FSToVIMap in case it isn't already there.
2694 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2695 FSToVIMap[FS] = FSVI;
2696 } else if (FoundMultipleCalleeChains)
2697 return false;
2698 }
2699 }
2700
2701 return FoundSingleCalleeChain;
2702}
2703
2704const FunctionSummary *
2705IndexCallsiteContextGraph::getCalleeFunc(IndexCall &Call) {
2706 ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
2707 if (Callee.getSummaryList().empty())
2708 return nullptr;
2709 return dyn_cast<FunctionSummary>(Callee.getSummaryList()[0]->getBaseObject());
2710}
2711
2712bool IndexCallsiteContextGraph::calleeMatchesFunc(
2713 IndexCall &Call, const FunctionSummary *Func,
2714 const FunctionSummary *CallerFunc,
2715 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {
2716 ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
2717 // If there is no summary list then this is a call to an externally defined
2718 // symbol.
2719 AliasSummary *Alias =
2720 Callee.getSummaryList().empty()
2721 ? nullptr
2722 : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
2723 assert(FSToVIMap.count(Func));
2724 auto FuncVI = FSToVIMap[Func];
2725 if (Callee == FuncVI ||
2726 // If callee is an alias, check the aliasee, since only function
2727 // summary base objects will contain the stack node summaries and thus
2728 // get a context node.
2729 (Alias && Alias->getAliaseeVI() == FuncVI))
2730 return true;
2731
2732 // Recursively search for the profiled callee through tail calls starting with
2733 // the actual Callee. The discovered tail call chain is saved in
2734 // FoundCalleeChain, and we will fixup the graph to include these callsites
2735 // after returning.
2736 // FIXME: We will currently redo the same recursive walk if we find the same
2737 // mismatched callee from another callsite. We can improve this with more
2738 // bookkeeping of the created chain of new nodes for each mismatch.
2739 unsigned Depth = 1;
2740 bool FoundMultipleCalleeChains = false;
2741 if (!findProfiledCalleeThroughTailCalls(
2742 FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
2743 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI
2744 << " from " << FSToVIMap[CallerFunc]
2745 << " that actually called " << Callee
2746 << (FoundMultipleCalleeChains
2747 ? " (found multiple possible chains)"
2748 : "")
2749 << "\n");
2750 if (FoundMultipleCalleeChains)
2751 FoundProfiledCalleeNonUniquelyCount++;
2752 return false;
2753 }
2754
2755 return true;
2756}
2757
2758bool IndexCallsiteContextGraph::sameCallee(IndexCall &Call1, IndexCall &Call2) {
2759 ValueInfo Callee1 = dyn_cast_if_present<CallsiteInfo *>(Call1)->Callee;
2760 ValueInfo Callee2 = dyn_cast_if_present<CallsiteInfo *>(Call2)->Callee;
2761 return Callee1 == Callee2;
2762}
2763
2764template <typename DerivedCCG, typename FuncTy, typename CallTy>
2765void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
2766 const {
2767 print(dbgs());
2768 dbgs() << "\n";
2769}
2770
2771template <typename DerivedCCG, typename FuncTy, typename CallTy>
2772void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
2773 raw_ostream &OS) const {
2774 OS << "Node " << this << "\n";
2775 OS << "\t";
2776 printCall(OS);
2777 if (Recursive)
2778 OS << " (recursive)";
2779 OS << "\n";
2780 if (!MatchingCalls.empty()) {
2781 OS << "\tMatchingCalls:\n";
2782 for (auto &MatchingCall : MatchingCalls) {
2783 OS << "\t";
2784 MatchingCall.print(OS);
2785 OS << "\n";
2786 }
2787 }
2788 OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
2789 OS << "\tContextIds:";
2790 // Make a copy of the computed context ids that we can sort for stability.
2791 auto ContextIds = getContextIds();
2792 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2793 std::sort(SortedIds.begin(), SortedIds.end());
2794 for (auto Id : SortedIds)
2795 OS << " " << Id;
2796 OS << "\n";
2797 OS << "\tCalleeEdges:\n";
2798 for (auto &Edge : CalleeEdges)
2799 OS << "\t\t" << *Edge << "\n";
2800 OS << "\tCallerEdges:\n";
2801 for (auto &Edge : CallerEdges)
2802 OS << "\t\t" << *Edge << "\n";
2803 if (!Clones.empty()) {
2804 OS << "\tClones: ";
2805 ListSeparator LS;
2806 for (auto *Clone : Clones)
2807 OS << LS << Clone;
2808 OS << "\n";
2809 } else if (CloneOf) {
2810 OS << "\tClone of " << CloneOf << "\n";
2811 }
2812}
2813
2814template <typename DerivedCCG, typename FuncTy, typename CallTy>
2815void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
2816 const {
2817 print(dbgs());
2818 dbgs() << "\n";
2819}
2820
2821template <typename DerivedCCG, typename FuncTy, typename CallTy>
2822void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
2823 raw_ostream &OS) const {
2824 OS << "Edge from Callee " << Callee << " to Caller: " << Caller
2825 << " AllocTypes: " << getAllocTypeString(AllocTypes);
2826 OS << " ContextIds:";
2827 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2828 std::sort(SortedIds.begin(), SortedIds.end());
2829 for (auto Id : SortedIds)
2830 OS << " " << Id;
2831}
2832
2833template <typename DerivedCCG, typename FuncTy, typename CallTy>
2834void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
2835 print(dbgs());
2836}
2837
2838template <typename DerivedCCG, typename FuncTy, typename CallTy>
2839void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
2840 raw_ostream &OS) const {
2841 OS << "Callsite Context Graph:\n";
2842 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2843 for (const auto Node : nodes<GraphType>(this)) {
2844 if (Node->isRemoved())
2845 continue;
2846 Node->print(OS);
2847 OS << "\n";
2848 }
2849}
2850
2851template <typename DerivedCCG, typename FuncTy, typename CallTy>
2852void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
2853 raw_ostream &OS) const {
2854 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2855 for (const auto Node : nodes<GraphType>(this)) {
2856 if (Node->isRemoved())
2857 continue;
2858 if (!Node->IsAllocation)
2859 continue;
2860 DenseSet<uint32_t> ContextIds = Node->getContextIds();
2861 auto AllocTypeFromCall = getAllocationCallType(Node->Call);
2862 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2863 std::sort(SortedIds.begin(), SortedIds.end());
2864 for (auto Id : SortedIds) {
2865 auto TypeI = ContextIdToAllocationType.find(Id);
2866 assert(TypeI != ContextIdToAllocationType.end());
2867 auto CSI = ContextIdToContextSizeInfos.find(Id);
2868 if (CSI != ContextIdToContextSizeInfos.end()) {
2869 for (auto &Info : CSI->second) {
2870 OS << "MemProf hinting: "
2871 << getAllocTypeString((uint8_t)TypeI->second)
2872 << " full allocation context " << Info.FullStackId
2873 << " with total size " << Info.TotalSize << " is "
2874 << getAllocTypeString(Node->AllocTypes) << " after cloning";
2875 if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall)
2876 OS << " marked " << getAllocTypeString((uint8_t)AllocTypeFromCall)
2877 << " due to cold byte percent";
2878 OS << "\n";
2879 }
2880 }
2881 }
2882 }
2883}
2884
2885template <typename DerivedCCG, typename FuncTy, typename CallTy>
2886void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
2887 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2888 for (const auto Node : nodes<GraphType>(this)) {
2889 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
2890 for (auto &Edge : Node->CallerEdges)
2891 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
2892 }
2893}
2894
2895template <typename DerivedCCG, typename FuncTy, typename CallTy>
2896struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
2897 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2898 using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;
2899
2900 using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
2901 static NodeRef getNode(const NodePtrTy &P) { return P.get(); }
2902
2903 using nodes_iterator =
2904 mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,
2905 decltype(&getNode)>;
2906
2907 static nodes_iterator nodes_begin(GraphType G) {
2908 return nodes_iterator(G->NodeOwner.begin(), &getNode);
2909 }
2910
2911 static nodes_iterator nodes_end(GraphType G) {
2912 return nodes_iterator(G->NodeOwner.end(), &getNode);
2913 }
2914
2915 static NodeRef getEntryNode(GraphType G) {
2916 return G->NodeOwner.begin()->get();
2917 }
2918
2919 using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
2920 static const ContextNode<DerivedCCG, FuncTy, CallTy> *
2921 GetCallee(const EdgePtrTy &P) {
2922 return P->Callee;
2923 }
2924
2925 using ChildIteratorType =
2926 mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
2927 DerivedCCG, FuncTy, CallTy>>>::const_iterator,
2928 decltype(&GetCallee)>;
2929
2930 static ChildIteratorType child_begin(NodeRef N) {
2931 return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
2932 }
2933
2934 static ChildIteratorType child_end(NodeRef N) {
2935 return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
2936 }
2937};
2938
2939template <typename DerivedCCG, typename FuncTy, typename CallTy>
2940struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
2941 : public DefaultDOTGraphTraits {
2942 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
2943
2944 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2945 using GTraits = GraphTraits<GraphType>;
2946 using NodeRef = typename GTraits::NodeRef;
2947 using ChildIteratorType = typename GTraits::ChildIteratorType;
2948
2949 static std::string getNodeLabel(NodeRef Node, GraphType G) {
2950 std::string LabelString =
2951 (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
2952 Twine(Node->OrigStackOrAllocId))
2953 .str();
2954 LabelString += "\n";
2955 if (Node->hasCall()) {
2956 auto Func = G->NodeToCallingFunc.find(Node);
2957 assert(Func != G->NodeToCallingFunc.end());
2958 LabelString +=
2959 G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
2960 } else {
2961 LabelString += "null call";
2962 if (Node->Recursive)
2963 LabelString += " (recursive)";
2964 else
2965 LabelString += " (external)";
2966 }
2967 return LabelString;
2968 }
2969
2970 static std::string getNodeAttributes(NodeRef Node, GraphType) {
2971 std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
2972 getContextIds(Node->getContextIds()) + "\"")
2973 .str();
2974 AttributeString +=
2975 (Twine(",fillcolor=\"") + getColor(Node->AllocTypes) + "\"").str();
2976 AttributeString += ",style=\"filled\"";
2977 if (Node->CloneOf) {
2978 AttributeString += ",color=\"blue\"";
2979 AttributeString += ",style=\"filled,bold,dashed\"";
2980 } else
2981 AttributeString += ",style=\"filled\"";
2982 return AttributeString;
2983 }
2984
2985 static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
2986 GraphType) {
2987 auto &Edge = *(ChildIter.getCurrent());
2988 return (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" +
2989 Twine(",fillcolor=\"") + getColor(Edge->AllocTypes) + "\"")
2990 .str();
2991 }
2992
2993 // Since the NodeOwners list includes nodes that are no longer connected to
2994 // the graph, skip them here.
2995 static bool isNodeHidden(NodeRef Node, GraphType) {
2996 return Node->isRemoved();
2997 }
2998
2999private:
3000 static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
3001 std::string IdString = "ContextIds:";
3002 if (ContextIds.size() < 100) {
3003 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3004 std::sort(SortedIds.begin(), SortedIds.end());
3005 for (auto Id : SortedIds)
3006 IdString += (" " + Twine(Id)).str();
3007 } else {
3008 IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
3009 }
3010 return IdString;
3011 }
3012
3013 static std::string getColor(uint8_t AllocTypes) {
3014 if (AllocTypes == (uint8_t)AllocationType::NotCold)
3015 // Color "brown1" actually looks like a lighter red.
3016 return "brown1";
3017 if (AllocTypes == (uint8_t)AllocationType::Cold)
3018 return "cyan";
3019 if (AllocTypes ==
3020 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
3021 // Lighter purple.
3022 return "mediumorchid1";
3023 return "gray";
3024 }
3025
3026 static std::string getNodeId(NodeRef Node) {
3027 std::stringstream SStream;
3028 SStream << std::hex << "N0x" << (unsigned long long)Node;
3029 std::string Result = SStream.str();
3030 return Result;
3031 }
3032};
3033
3034template <typename DerivedCCG, typename FuncTy, typename CallTy>
3035void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
3036 std::string Label) const {
3037 WriteGraph(this, "", false, Label,
3038 DotFilePathPrefix + "ccg." + Label + ".dot");
3039}
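// Usage sketch: when DOT export is enabled and the command-line option that
// populates DotFilePathPrefix (declared earlier in this file) is set, each
// call to exportToDot() writes <prefix>ccg.<Label>.dot. Assuming a label such
// as "postbuild", the graph can then be rendered with standard Graphviz:
//
//   dot -Tpng <prefix>ccg.postbuild.dot -o ccg.postbuild.png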
3040
3041template <typename DerivedCCG, typename FuncTy, typename CallTy>
3042typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
3043CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
3044 const std::shared_ptr<ContextEdge> &Edge,
3045 DenseSet<uint32_t> ContextIdsToMove) {
3046 ContextNode *Node = Edge->Callee;
3047 assert(NodeToCallingFunc.count(Node));
3048 ContextNode *Clone =
3049 createNewNode(Node->IsAllocation, NodeToCallingFunc[Node], Node->Call);
3050 Node->addClone(Clone);
3051 Clone->MatchingCalls = Node->MatchingCalls;
3052 moveEdgeToExistingCalleeClone(Edge, Clone, /*NewClone=*/true,
3053 ContextIdsToMove);
3054 return Clone;
3055}
3056
3057template <typename DerivedCCG, typename FuncTy, typename CallTy>
3058void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3059 moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
3060 ContextNode *NewCallee, bool NewClone,
3061 DenseSet<uint32_t> ContextIdsToMove) {
3062 // NewCallee and Edge's current callee must be clones of the same original
3063 // node (Edge's current callee may be the original node too).
3064 assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
3065
3066 ContextNode *OldCallee = Edge->Callee;
3067
3068 // We might already have an edge to the new callee from earlier cloning for a
3069 // different allocation. If one exists we will reuse it.
3070 auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller);
3071
3072 // Callers will pass an empty ContextIdsToMove set when they want to move the
3073 // edge. Copy in Edge's ids for simplicity.
3074 if (ContextIdsToMove.empty())
3075 ContextIdsToMove = Edge->getContextIds();
3076
3077 // If we are moving all of Edge's ids, then just move the whole Edge.
3078 // Otherwise only move the specified subset, to a new edge if needed.
3079 if (Edge->getContextIds().size() == ContextIdsToMove.size()) {
3080 // First, update the alloc types on New Callee from Edge.
3081 // Do this before we potentially clear Edge's fields below!
3082 NewCallee->AllocTypes |= Edge->AllocTypes;
3083 // Moving the whole Edge.
3084 if (ExistingEdgeToNewCallee) {
3085 // Since we already have an edge to NewCallee, simply move the ids
3086 // onto it, and remove the existing Edge.
3087 ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(),
3088 ContextIdsToMove.end());
3089 ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes;
3090 assert(Edge->ContextIds == ContextIdsToMove);
3091 removeEdgeFromGraph(Edge.get());
3092 } else {
3093 // Otherwise just reconnect Edge to NewCallee.
3094 Edge->Callee = NewCallee;
3095 NewCallee->CallerEdges.push_back(Edge);
3096 // Remove it from callee where it was previously connected.
3097 OldCallee->eraseCallerEdge(Edge.get());
3098 // Don't need to update Edge's context ids since we are simply
3099 // reconnecting it.
3100 }
3101 } else {
3102 // Only moving a subset of Edge's ids.
3103 // Compute the alloc type of the subset of ids being moved.
3104 auto CallerEdgeAllocType = computeAllocType(ContextIdsToMove);
3105 if (ExistingEdgeToNewCallee) {
3106 // Since we already have an edge to NewCallee, simply move the ids
3107 // onto it.
3108 ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(),
3109 ContextIdsToMove.end());
3110 ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType;
3111 } else {
3112 // Otherwise, create a new edge to NewCallee for the ids being moved.
3113 auto NewEdge = std::make_shared<ContextEdge>(
3114 NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove);
3115 Edge->Caller->CalleeEdges.push_back(NewEdge);
3116 NewCallee->CallerEdges.push_back(NewEdge);
3117 }
3118 // In either case, need to update the alloc types on NewCallee, and remove
3119 // those ids and update the alloc type on the original Edge.
3120 NewCallee->AllocTypes |= CallerEdgeAllocType;
3121 set_subtract(Edge->ContextIds, ContextIdsToMove);
3122 Edge->AllocTypes = computeAllocType(Edge->ContextIds);
3123 }
3124 // Now walk the old callee node's callee edges and move Edge's context ids
3125 // over to the corresponding edge into the clone (which is created here if
3126 // this is a newly created clone).
3127 for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
3128 // The context ids moving to the new callee are the subset of this edge's
3129 // context ids and the context ids on the caller edge being moved.
3130 DenseSet<uint32_t> EdgeContextIdsToMove =
3131 set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove);
3132 set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
3133 OldCalleeEdge->AllocTypes =
3134 computeAllocType(OldCalleeEdge->getContextIds());
3135 if (!NewClone) {
3136 // Update context ids / alloc type on corresponding edge to NewCallee.
3137 // There is a chance this may not exist if we are reusing an existing
3138 // clone, specifically during function assignment, where we would have
3139 // removed none type edges after creating the clone. If we can't find
3140 // a corresponding edge there, fall through to the cloning below.
3141 if (auto *NewCalleeEdge =
3142 NewCallee->findEdgeFromCallee(OldCalleeEdge->Callee)) {
3143 NewCalleeEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),
3144 EdgeContextIdsToMove.end());
3145 NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
3146 continue;
3147 }
3148 }
3149 auto NewEdge = std::make_shared<ContextEdge>(
3150 OldCalleeEdge->Callee, NewCallee,
3151 computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove);
3152 NewCallee->CalleeEdges.push_back(NewEdge);
3153 NewEdge->Callee->CallerEdges.push_back(NewEdge);
3154 }
3155 // Recompute the node alloc type now that its callee edges have been
3156 // updated (since we will compute from those edges).
3157 OldCallee->AllocTypes = OldCallee->computeAllocType();
3158 // OldCallee alloc type should be None iff its context id set is now empty.
3159 assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
3160 OldCallee->emptyContextIds());
3161 if (VerifyCCG) {
3162 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
3163 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
3164 for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
3165 checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
3166 /*CheckEdges=*/false);
3167 for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
3168 checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
3169 /*CheckEdges=*/false);
3170 }
3171}
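// Worked example of the context-id bookkeeping above (a minimal standalone
// sketch using std:: containers instead of the LLVM ADTs; all names are
// hypothetical): if Edge carries ContextIds {1,2,3} and only {1,2} are moved
// to NewCallee, the (new or existing) edge on NewCallee gains {1,2}, the
// original Edge keeps {3}, and alloc types are recomputed from what remains.
#if 0
#include <cassert>
#include <set>

int main() {
  std::set<unsigned> EdgeIds = {1, 2, 3}; // Edge->ContextIds
  std::set<unsigned> ToMove = {1, 2};     // ContextIdsToMove

  // Mirrors inserting ContextIdsToMove into the edge to NewCallee.
  std::set<unsigned> NewCalleeEdgeIds;
  NewCalleeEdgeIds.insert(ToMove.begin(), ToMove.end());

  // Mirrors set_subtract(Edge->ContextIds, ContextIdsToMove).
  for (unsigned Id : ToMove)
    EdgeIds.erase(Id);

  assert(NewCalleeEdgeIds == std::set<unsigned>({1, 2}));
  assert(EdgeIds == std::set<unsigned>({3}));
  return 0;
}
#endif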
3172
3173template <typename DerivedCCG, typename FuncTy, typename CallTy>
3174void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3175 moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
3176 ContextNode *NewCaller) {
3177
3178 ContextNode *OldCaller = Edge->Caller;
3179 OldCaller->eraseCalleeEdge(Edge.get());
3180
3181 // We might already have an edge to the new caller. If one exists we will
3182 // reuse it.
3183 auto ExistingEdgeToNewCaller = NewCaller->findEdgeFromCallee(Edge->Callee);
3184
3185 if (ExistingEdgeToNewCaller) {
3186 // Since we already have an edge to NewCaller, simply move the ids
3187 // onto it, and remove the existing Edge.
3188 ExistingEdgeToNewCaller->getContextIds().insert(
3189 Edge->getContextIds().begin(), Edge->getContextIds().end());
3190 ExistingEdgeToNewCaller->AllocTypes |= Edge->AllocTypes;
3191 Edge->ContextIds.clear();
3192 Edge->AllocTypes = (uint8_t)AllocationType::None;
3193 Edge->Callee->eraseCallerEdge(Edge.get());
3194 } else {
3195 // Otherwise just reconnect Edge to NewCaller.
3196 Edge->Caller = NewCaller;
3197 NewCaller->CalleeEdges.push_back(Edge);
3198 // Don't need to update Edge's context ids since we are simply
3199 // reconnecting it.
3200 }
3201 // In either case, need to update the alloc types on New Caller.
3202 NewCaller->AllocTypes |= Edge->AllocTypes;
3203
3204 // Now walk the old caller node's caller edges and move Edge's context ids
3205 // over to the corresponding edge into the node (which is created here if
3206 // this is a newly created node). We can tell whether this is a newly created
3207 // node by seeing if it has any caller edges yet.
3208#ifndef NDEBUG
3209 bool IsNewNode = NewCaller->CallerEdges.empty();
3210#endif
3211 for (auto &OldCallerEdge : OldCaller->CallerEdges) {
3212 // The context ids moving to the new caller are the subset of this edge's
3213 // context ids and the context ids on the callee edge being moved.
3214 DenseSet<uint32_t> EdgeContextIdsToMove =
3215 set_intersection(OldCallerEdge->getContextIds(), Edge->getContextIds());
3216 set_subtract(OldCallerEdge->getContextIds(), EdgeContextIdsToMove);
3217 OldCallerEdge->AllocTypes =
3218 computeAllocType(OldCallerEdge->getContextIds());
3219 // In this function we expect that any pre-existing node already has edges
3220 // from the same callers as the old node. That should be true in the current
3221 // use case, where we will remove None-type edges after copying over all
3222 // caller edges from the callee.
3223 auto *ExistingCallerEdge =
3224 NewCaller->findEdgeFromCaller(OldCallerEdge->Caller);
3225 assert(IsNewNode || ExistingCallerEdge);
3226 if (ExistingCallerEdge) {
3227 ExistingCallerEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),
3228 EdgeContextIdsToMove.end());
3229 ExistingCallerEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
3230 continue;
3231 }
3232 auto NewEdge = std::make_shared<ContextEdge>(
3233 NewCaller, OldCallerEdge->Caller,
3234 computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove);
3235 NewCaller->CallerEdges.push_back(NewEdge);
3236 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
3237 }
3238 // Recompute the node alloc type now that its caller edges have been
3239 // updated (since we will compute from those edges).
3240 OldCaller->AllocTypes = OldCaller->computeAllocType();
3241 // OldCaller alloc type should be None iff its context id set is now empty.
3242 assert((OldCaller->AllocTypes == (uint8_t)AllocationType::None) ==
3243 OldCaller->emptyContextIds());
3244 if (VerifyCCG) {
3245 checkNode<DerivedCCG, FuncTy, CallTy>(OldCaller, /*CheckEdges=*/false);
3246 checkNode<DerivedCCG, FuncTy, CallTy>(NewCaller, /*CheckEdges=*/false);
3247 for (const auto &OldCallerEdge : OldCaller->CallerEdges)
3248 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallerEdge->Caller,
3249 /*CheckEdges=*/false);
3250 for (const auto &NewCallerEdge : NewCaller->CallerEdges)
3251 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallerEdge->Caller,
3252 /*CheckEdges=*/false);
3253 }
3254}
3255
3256template <typename DerivedCCG, typename FuncTy, typename CallTy>
3257void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3258 recursivelyRemoveNoneTypeCalleeEdges(
3259 ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
3260 auto Inserted = Visited.insert(Node);
3261 if (!Inserted.second)
3262 return;
3263
3264 removeNoneTypeCalleeEdges(Node);
3265
3266 for (auto *Clone : Node->Clones)
3267 recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited);
3268
3269 // The recursive call may remove some of this Node's caller edges.
3270 // Iterate over a copy and skip any that were removed.
3271 auto CallerEdges = Node->CallerEdges;
3272 for (auto &Edge : CallerEdges) {
3273 // Skip any that have been removed by an earlier recursive call.
3274 if (Edge->isRemoved()) {
3275 assert(!is_contained(Node->CallerEdges, Edge));
3276 continue;
3277 }
3278 recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited);
3279 }
3280}
3281
3282template <typename DerivedCCG, typename FuncTy, typename CallTy>
3283 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
3284 DenseSet<const ContextNode *> Visited;
3285 for (auto &Entry : AllocationCallToContextNodeMap) {
3286 Visited.clear();
3287 identifyClones(Entry.second, Visited, Entry.second->getContextIds());
3288 }
3289 Visited.clear();
3290 for (auto &Entry : AllocationCallToContextNodeMap)
3291 recursivelyRemoveNoneTypeCalleeEdges(Entry.second, Visited);
3292 if (VerifyCCG)
3293 check();
3294}
3295
3296 // Helper function to check whether an AllocType is cold, notcold, or both.
3297 bool checkColdOrNotCold(uint8_t AllocType) {
3298 return (AllocType == (uint8_t)AllocationType::Cold) ||
3299 (AllocType == (uint8_t)AllocationType::NotCold) ||
3300 (AllocType ==
3301 ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
3302 }
3303
3304template <typename DerivedCCG, typename FuncTy, typename CallTy>
3305void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
3306 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3307 const DenseSet<uint32_t> &AllocContextIds) {
3308 if (VerifyNodes)
3309 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3310 assert(!Node->CloneOf);
3311
3312 // If Node has a null call, then either it wasn't found in the module (regular
3313 // LTO) or summary index (ThinLTO), or there were other conditions blocking
3314 // cloning (e.g. recursion, calls multiple targets, etc).
3315 // Do this here so that we don't try to recursively clone callers below, which
3316 // isn't useful at least for this node.
3317 if (!Node->hasCall())
3318 return;
3319
3320#ifndef NDEBUG
3321 auto Insert =
3322#endif
3323 Visited.insert(Node);
3324 // We should not have visited this node yet.
3325 assert(Insert.second);
3326 // The recursive call to identifyClones may delete the current edge from the
3327 // CallerEdges vector. Make a copy and iterate on that, simpler than passing
3328 // in an iterator and having recursive call erase from it. Other edges may
3329 // also get removed during the recursion, which will have null Callee and
3330 // Caller pointers (and are deleted later), so we skip those below.
3331 {
3332 auto CallerEdges = Node->CallerEdges;
3333 for (auto &Edge : CallerEdges) {
3334 // Skip any that have been removed by an earlier recursive call.
3335 if (Edge->isRemoved()) {
3336 assert(!is_contained(Node->CallerEdges, Edge));
3337 continue;
3338 }
3339 // Ignore any caller we previously visited via another edge.
3340 if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
3341 identifyClones(Edge->Caller, Visited, AllocContextIds);
3342 }
3343 }
3344 }
3345
3346 // Check if we reached an unambiguous call or have only a single caller.
3347 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
3348 return;
3349
3350 // We need to clone.
3351
3352 // Try to keep the original version as alloc type NotCold. This will make
3353 // cases with indirect calls or any other situation with an unknown call to
3354 // the original function get the default behavior. We do this by sorting the
3355 // CallerEdges of the Node we will clone by alloc type.
3356 //
3357 // Give NotCold edge the lowest sort priority so those edges are at the end of
3358 // the caller edges vector, and stay on the original version (since the below
3359 // code clones greedily until it finds all remaining edges have the same type
3360 // and leaves the remaining ones on the original Node).
3361 //
3362 // We shouldn't actually have any None type edges, so the sorting priority for
3363 // that is arbitrary, and we assert in that case below.
3364 const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
3365 /*Cold*/ 1,
3366 /*NotColdCold*/ 2};
3367 std::stable_sort(Node->CallerEdges.begin(), Node->CallerEdges.end(),
3368 [&](const std::shared_ptr<ContextEdge> &A,
3369 const std::shared_ptr<ContextEdge> &B) {
3370 // Nodes with non-empty context ids should be sorted before
3371 // those with empty context ids.
3372 if (A->ContextIds.empty())
3373 // Either B ContextIds are non-empty (in which case we
3374 // should return false because B < A), or B ContextIds
3375 // are empty, in which case they are equal, and we should
3376 // maintain the original relative ordering.
3377 return false;
3378 if (B->ContextIds.empty())
3379 return true;
3380
3381 if (A->AllocTypes == B->AllocTypes)
3382 // Use the first context id for each edge as a
3383 // tie-breaker.
3384 return *A->ContextIds.begin() < *B->ContextIds.begin();
3385 return AllocTypeCloningPriority[A->AllocTypes] <
3386 AllocTypeCloningPriority[B->AllocTypes];
3387 });
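  // For example, with the priorities above, caller edges with alloc types
  // {NotCold, Cold, NotColdCold} sort as [Cold, NotColdCold, NotCold]: the
  // cold-only edges are peeled off onto clones first, and the NotCold edge
  // sorts last so it stays on the original (un-cloned) node, keeping
  // default behavior for indirect or otherwise unknown callers of the
  // original function.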
3388
3389 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3390
3391 DenseSet<uint32_t> RecursiveContextIds;
3392 // If we are allowing recursive callsites, but have also disabled recursive
3393 // contexts, look for context ids that show up in multiple caller edges.
3394 if (AllowRecursiveCallsites && !AllowRecursiveContexts) {
3395 DenseSet<uint32_t> AllCallerContextIds;
3396 for (auto &CE : Node->CallerEdges) {
3397 // Resize to the largest set of caller context ids, since we know the
3398 // final set will be at least that large.
3399 AllCallerContextIds.reserve(CE->getContextIds().size());
3400 for (auto Id : CE->getContextIds())
3401 if (!AllCallerContextIds.insert(Id).second)
3402 RecursiveContextIds.insert(Id);
3403 }
3404 }
3405
3406 // Iterate until we find no more opportunities for disambiguating the alloc
3407 // types via cloning. In most cases this loop will terminate once the Node
3408 // has a single allocation type, in which case no more cloning is needed.
3409 // Iterate over a copy of Node's caller edges, since we may need to remove
3410 // edges in the moveEdgeTo* methods, and this simplifies the handling and
3411 // makes it less error-prone.
3412 auto CallerEdges = Node->CallerEdges;
3413 for (auto &CallerEdge : CallerEdges) {
3414 // See if cloning the prior caller edge left this node with a single alloc
3415 // type or a single caller. In that case no more cloning of Node is needed.
3416 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
3417 break;
3418
3419 // If the caller was not successfully matched to a call in the IR/summary,
3420 // there is no point in trying to clone for it as we can't update that call.
3421 if (!CallerEdge->Caller->hasCall())
3422 continue;
3423
3424 // Only need to process the ids along this edge pertaining to the given
3425 // allocation.
3426 auto CallerEdgeContextsForAlloc =
3427 set_intersection(CallerEdge->getContextIds(), AllocContextIds);
3428 if (!RecursiveContextIds.empty())
3429 CallerEdgeContextsForAlloc =
3430 set_difference(CallerEdgeContextsForAlloc, RecursiveContextIds);
3431 if (CallerEdgeContextsForAlloc.empty())
3432 continue;
3433
3434 auto CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
3435
3436 // Compute the node callee edge alloc types corresponding to the context ids
3437 // for this caller edge.
3438 std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
3439 CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
3440 for (auto &CalleeEdge : Node->CalleeEdges)
3441 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
3442 CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
3443
3444 // Don't clone if doing so will not disambiguate any alloc types amongst
3445 // caller edges (including the callee edges that would be cloned).
3446 // Otherwise we will simply move all edges to the clone.
3447 //
3448 // First check if by cloning we will disambiguate the caller allocation
3449 // type from node's allocation type. Query allocTypeToUse so that we don't
3450 // bother cloning to distinguish NotCold+Cold from NotCold. Note that
3451 // neither of these should be None type.
3452 //
3453 // Then check if by cloning node at least one of the callee edges will be
3454 // disambiguated by splitting out different context ids.
3455 assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
3456 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3457 if (allocTypeToUse(CallerAllocTypeForAlloc) ==
3458 allocTypeToUse(Node->AllocTypes) &&
3459 allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
3460 CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges))
3461 continue;
3462
3463 // First see if we can use an existing clone. Check each clone and its
3464 // callee edges for matching alloc types.
3465 ContextNode *Clone = nullptr;
3466 for (auto *CurClone : Node->Clones) {
3467 if (allocTypeToUse(CurClone->AllocTypes) !=
3468 allocTypeToUse(CallerAllocTypeForAlloc))
3469 continue;
3470
3471 bool BothSingleAlloc = hasSingleAllocType(CurClone->AllocTypes) &&
3472 hasSingleAllocType(CallerAllocTypeForAlloc);
3473 // The above check should mean that if both have single alloc types that
3474 // they should be equal.
3475 assert(!BothSingleAlloc ||
3476 CurClone->AllocTypes == CallerAllocTypeForAlloc);
3477
3478 // If either both have a single alloc type (which are the same), or if the
3479 // clone's callee edges have the same alloc types as those for the current
3480 // allocation on Node's callee edges (CalleeEdgeAllocTypesForCallerEdge),
3481 // then we can reuse this clone.
3482 if (BothSingleAlloc || allocTypesMatchClone<DerivedCCG, FuncTy, CallTy>(
3483 CalleeEdgeAllocTypesForCallerEdge, CurClone)) {
3484 Clone = CurClone;
3485 break;
3486 }
3487 }
3488
3489 // The edge iterator is adjusted when we move the CallerEdge to the clone.
3490 if (Clone)
3491 moveEdgeToExistingCalleeClone(CallerEdge, Clone, /*NewClone=*/false,
3492 CallerEdgeContextsForAlloc);
3493 else
3494 Clone = moveEdgeToNewCalleeClone(CallerEdge, CallerEdgeContextsForAlloc);
3495
3496 // Sanity check that no alloc types on clone or its edges are None.
3497 assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
3498 }
3499
3500 // We should still have some context ids on the original Node.
3501 assert(!Node->emptyContextIds());
3502
3503 // Sanity check that no alloc types on node or edges are None.
3504 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3505
3506 if (VerifyNodes)
3507 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3508}
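// A small end-to-end illustration of the decisions above (hypothetical
// graph): suppose allocation node N has caller edges E1 (Cold, ids {1}) and
// E2 (NotCold, ids {2}). N's alloc type is then NotColdCold, so cloning
// proceeds: E1 is moved to a new clone N' (now Cold-only) while E2 stays on
// N (now NotCold-only). A later Cold-only caller edge for the same
// allocation can then reuse N' via the existing-clone check rather than
// creating a second clone.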
3509
3510 void ModuleCallsiteContextGraph::updateAllocationCall(
3511 CallInfo &Call, AllocationType AllocType) {
3512 std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
3513 auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
3514 "memprof", AllocTypeString);
3515 cast<CallBase>(Call.call())->addFnAttr(A);
3516 OREGetter(Call.call()->getFunction())
3517 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
3518 << ore::NV("AllocationCall", Call.call()) << " in clone "
3519 << ore::NV("Caller", Call.call()->getFunction())
3520 << " marked with memprof allocation attribute "
3521 << ore::NV("Attribute", AllocTypeString));
3522}
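// After this runs for regular LTO, the allocation call in the IR simply
// carries a string function attribute. A sketch of what the annotated call
// might look like (exact attribute groups and callee will vary):
//
//   %call = call noalias ptr @_Znwm(i64 32) #2
//   ...
//   attributes #2 = { builtin allocsize(0) "memprof"="cold" }
//
// A later transformation (as described in the file header comment) consumes
// the "memprof"="cold" (or "notcold") attribute added above.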
3523
3524 void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
3525 AllocationType AllocType) {
3526 auto *AI = cast<AllocInfo *>(Call.call());
3527 assert(AI);
3528 assert(AI->Versions.size() > Call.cloneNo());
3529 AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
3530}
3531
3532 AllocationType
3533 ModuleCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
3534 const auto *CB = cast<CallBase>(Call.call());
3535 if (!CB->getAttributes().hasFnAttr("memprof"))
3536 return AllocationType::None;
3537 return CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
3538 ? AllocationType::Cold
3539 : AllocationType::NotCold;
3540}
3541
3542 AllocationType
3543 IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
3544 const auto *AI = cast<AllocInfo *>(Call.call());
3545 assert(AI->Versions.size() > Call.cloneNo());
3546 return (AllocationType)AI->Versions[Call.cloneNo()];
3547}
3548
3549void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
3550 FuncInfo CalleeFunc) {
3551 if (CalleeFunc.cloneNo() > 0)
3552 cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
3553 OREGetter(CallerCall.call()->getFunction())
3554 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
3555 << ore::NV("Call", CallerCall.call()) << " in clone "
3556 << ore::NV("Caller", CallerCall.call()->getFunction())
3557 << " assigned to call function clone "
3558 << ore::NV("Callee", CalleeFunc.func()));
3559}
3560
3561void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
3562 FuncInfo CalleeFunc) {
3563 auto *CI = cast<CallsiteInfo *>(CallerCall.call());
3564 assert(CI &&
3565 "Caller cannot be an allocation which should not have profiled calls");
3566 assert(CI->Clones.size() > CallerCall.cloneNo());
3567 CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();
3568}
3569
3570CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
3571 Instruction *>::FuncInfo
3572ModuleCallsiteContextGraph::cloneFunctionForCallsite(
3573 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
3574 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
3575 // Use existing LLVM facilities for cloning and obtaining Call in clone
3576 ValueToValueMapTy VMap;
3577 auto *NewFunc = CloneFunction(Func.func(), VMap);
3578 std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
3579 assert(!Func.func()->getParent()->getFunction(Name));
3580 NewFunc->setName(Name);
3581 for (auto &Inst : CallsWithMetadataInFunc) {
3582 // This map always has the initial version in it.
3583 assert(Inst.cloneNo() == 0);
3584 CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
3585 }
3586 OREGetter(Func.func())
3587 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
3588 << "created clone " << ore::NV("NewFunction", NewFunc));
3589 return {NewFunc, CloneNo};
3590}
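// Illustrative naming (assuming getMemProfFuncName appends a ".memprof.<N>"
// style suffix, as used elsewhere in the MemProf support code): cloning
// "_Z3foov" for clone number 1 would produce a function named
// "_Z3foov.memprof.1", with VMap used to locate each cloned call instruction
// so CallMap can map the original CallInfo (clone 0) to its new clone.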
3591
3592CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
3593 IndexCall>::FuncInfo
3594IndexCallsiteContextGraph::cloneFunctionForCallsite(
3595 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
3596 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
3597 // Check how many clones we have of Call (and therefore function).
3598 // The next clone number is the current size of versions array.
3599 // Confirm this matches the CloneNo provided by the caller, which is based on
3600 // the number of function clones we have.
3601 assert(CloneNo ==
3602 (isa<AllocInfo *>(Call.call())
3603 ? Call.call().dyn_cast<AllocInfo *>()->Versions.size()
3604 : Call.call().dyn_cast<CallsiteInfo *>()->Clones.size()));
3605 // Walk all the instructions in this function. Create a new version for
3606 // each (by adding an entry to the Versions/Clones summary array), and copy
3607 // over the version being called for the function clone being cloned here.
3608 // Additionally, add an entry to the CallMap for the new function clone,
3609 // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
3610 // to the new call clone.
3611 for (auto &Inst : CallsWithMetadataInFunc) {
3612 // This map always has the initial version in it.
3613 assert(Inst.cloneNo() == 0);
3614 if (auto *AI = dyn_cast<AllocInfo *>(Inst.call())) {
3615 assert(AI->Versions.size() == CloneNo);
3616 // We assign the allocation type later (in updateAllocationCall), just add
3617 // an entry for it here.
3618 AI->Versions.push_back(0);
3619 } else {
3620 auto *CI = cast<CallsiteInfo *>(Inst.call());
3621 assert(CI && CI->Clones.size() == CloneNo);
3622 // We assign the clone number later (in updateCall), just add an entry for
3623 // it here.
3624 CI->Clones.push_back(0);
3625 }
3626 CallMap[Inst] = {Inst.call(), CloneNo};
3627 }
3628 return {Func.func(), CloneNo};
3629}
3630
3631// This method assigns cloned callsites to functions, cloning the functions as
3632// needed. The assignment is greedy and proceeds roughly as follows:
3633//
3634// For each function Func:
3635// For each call with graph Node having clones:
3636// Initialize ClonesWorklist to Node and its clones
3637// Initialize NodeCloneCount to 0
3638// While ClonesWorklist is not empty:
3639// Clone = pop front ClonesWorklist
3640// NodeCloneCount++
3641// If Func has been cloned less than NodeCloneCount times:
3642// If NodeCloneCount is 1:
3643// Assign Clone to original Func
3644// Continue
3645// Create a new function clone
3646// If other callers not assigned to call a function clone yet:
3647// Assign them to call new function clone
3648// Continue
3649// Assign any other caller calling the cloned version to new clone
3650//
3651// For each caller of Clone:
3652// If caller is assigned to call a specific function clone:
3653// If we cannot assign Clone to that function clone:
3654// Create new callsite Clone NewClone
3655// Add NewClone to ClonesWorklist
3656// Continue
3657// Assign Clone to existing caller's called function clone
3658// Else:
3659// If Clone not already assigned to a function clone:
3660// Assign to first function clone without assignment
3661// Assign caller to selected function clone
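//
// Worked example of the greedy assignment (hypothetical): function foo()
// contains callsite C whose graph node ended up with two clones, C and C.1.
// Processing C first assigns it to the original foo(). Processing C.1 finds
// foo() already taken, so a new function clone of foo() is created via
// cloneFunctionForCallsite(); C.1 is assigned to it, and every caller that
// reaches C.1 is recorded in CallsiteToCalleeFuncCloneMap as calling the new
// clone, so updateCall() can later retarget those call instructions (regular
// LTO) or record the clone number in the summary (ThinLTO).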
3662template <typename DerivedCCG, typename FuncTy, typename CallTy>
3663bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
3664 bool Changed = false;
3665
3666 // Keep track of the assignment of nodes (callsites) to function clones they
3667 // call.
3668 DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
3669
3670 // Update caller node to call function version CalleeFunc, by recording the
3671 // assignment in CallsiteToCalleeFuncCloneMap.
3672 auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
3673 const FuncInfo &CalleeFunc) {
3674 assert(Caller->hasCall());
3675 CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
3676 };
3677
3678 // Walk all functions for which we saw calls with memprof metadata, and
3679 // handle cloning for each such call.
3680 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
3681 FuncInfo OrigFunc(Func);
3682 // Map from each clone of OrigFunc to a map of remappings of each call of
3683 // interest (from original uncloned call to the corresponding cloned call in
3684 // that function clone).
3685 std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap;
3686 for (auto &Call : CallsWithMetadata) {
3687 ContextNode *Node = getNodeForInst(Call);
3688 // Skip call if we do not have a node for it (all uses of its stack ids
3689 // were either on inlined chains or pruned from the MIBs), or if we did
3690 // not create any clones for it.
3691 if (!Node || Node->Clones.empty())
3692 continue;
3693 assert(Node->hasCall() &&
3694 "Not having a call should have prevented cloning");
3695
3696 // Track the assignment of function clones to clones of the current
3697 // callsite Node being handled.
3698 std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
3699
3700 // Assign callsite version CallsiteClone to function version FuncClone,
3701 // and also assign (possibly cloned) Call to CallsiteClone.
3702 auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
3703 CallInfo &Call,
3704 ContextNode *CallsiteClone,
3705 bool IsAlloc) {
3706 // Record the clone of callsite node assigned to this function clone.
3707 FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
3708
3709 assert(FuncClonesToCallMap.count(FuncClone));
3710 std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone];
3711 CallInfo CallClone(Call);
3712 if (CallMap.count(Call))
3713 CallClone = CallMap[Call];
3714 CallsiteClone->setCall(CallClone);
3715 // Need to do the same for all matching calls.
3716 for (auto &MatchingCall : Node->MatchingCalls) {
3717 CallInfo CallClone(MatchingCall);
3718 if (CallMap.count(MatchingCall))
3719 CallClone = CallMap[MatchingCall];
3720 // Updates the call in the list.
3721 MatchingCall = CallClone;
3722 }
3723 };
3724
3725 // Keep track of the clones of callsite Node that need to be assigned to
3726 // function clones. This list may be expanded in the loop body below if we
3727 // find additional cloning is required.
3728 std::deque<ContextNode *> ClonesWorklist;
3729 // Ignore original Node if we moved all of its contexts to clones.
3730 if (!Node->emptyContextIds())
3731 ClonesWorklist.push_back(Node);
3732 ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(),
3733 Node->Clones.end());
3734
3735 // Now walk through all of the clones of this callsite Node that we need,
3736 // and determine the assignment to a corresponding clone of the current
3737 // function (creating new function clones as needed).
3738 unsigned NodeCloneCount = 0;
3739 while (!ClonesWorklist.empty()) {
3740 ContextNode *Clone = ClonesWorklist.front();
3741 ClonesWorklist.pop_front();
3742 NodeCloneCount++;
3743 if (VerifyNodes)
3744 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
3745
3746 // Need to create a new function clone if we have more callsite clones
3747 // than existing function clones, which would have been assigned to an
3748 // earlier clone in the list (we assign callsite clones to function
3749 // clones greedily).
3750 if (FuncClonesToCallMap.size() < NodeCloneCount) {
3751 // If this is the first callsite copy, assign to original function.
3752 if (NodeCloneCount == 1) {
3753 // Since FuncClonesToCallMap is empty in this case, no clones have
3754 // been created for this function yet, and no callers should have
3755 // been assigned a function clone for this callee node yet.
3756 assert(llvm::none_of(
3757 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
3758 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
3759 }));
3760 // Initialize with empty call map, assign Clone to original function
3761 // and its callers, and skip to the next clone.
3762 FuncClonesToCallMap[OrigFunc] = {};
3763 AssignCallsiteCloneToFuncClone(
3764 OrigFunc, Call, Clone,
3765 AllocationCallToContextNodeMap.count(Call));
3766 for (auto &CE : Clone->CallerEdges) {
3767 // Ignore any caller that does not have a recorded callsite Call.
3768 if (!CE->Caller->hasCall())
3769 continue;
3770 RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
3771 }
3772 continue;
3773 }
3774
3775 // First locate which copy of OrigFunc to clone again. If a caller
3776 // of this callsite clone was already assigned to call a particular
3777 // function clone, we need to redirect all of those callers to the
3778 // new function clone, and update their other callees within this
3779 // function.
3780 FuncInfo PreviousAssignedFuncClone;
3781 auto EI = llvm::find_if(
3782 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
3783 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
3784 });
3785 bool CallerAssignedToCloneOfFunc = false;
3786 if (EI != Clone->CallerEdges.end()) {
3787 const std::shared_ptr<ContextEdge> &Edge = *EI;
3788 PreviousAssignedFuncClone =
3789 CallsiteToCalleeFuncCloneMap[Edge->Caller];
3790 CallerAssignedToCloneOfFunc = true;
3791 }
3792
3793 // Clone function and save it along with the CallInfo map created
3794 // during cloning in the FuncClonesToCallMap.
3795 std::map<CallInfo, CallInfo> NewCallMap;
3796 unsigned CloneNo = FuncClonesToCallMap.size();
3797 assert(CloneNo > 0 && "Clone 0 is the original function, which "
3798 "should already exist in the map");
3799 FuncInfo NewFuncClone = cloneFunctionForCallsite(
3800 OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
3801 FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap));
3802 FunctionClonesAnalysis++;
3803 Changed = true;
3804
3805 // If no caller callsites were already assigned to a clone of this
3806 // function, we can simply assign this clone to the new func clone
3807 // and update all callers to it, then skip to the next clone.
3808 if (!CallerAssignedToCloneOfFunc) {
3809 AssignCallsiteCloneToFuncClone(
3810 NewFuncClone, Call, Clone,
3811 AllocationCallToContextNodeMap.count(Call));
3812 for (auto &CE : Clone->CallerEdges) {
3813 // Ignore any caller that does not have a recorded callsite Call.
3814 if (!CE->Caller->hasCall())
3815 continue;
3816 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
3817 }
3818 continue;
3819 }
3820
3821 // We may need to do additional node cloning in this case.
3822 // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
3823 // that were previously assigned to call PreviousAssignedFuncClone,
3824 // to record that they now call NewFuncClone.
3825 // The none type edge removal may remove some of this Clone's caller
3826 // edges, if it is reached via another of its caller's callees.
3827 // Iterate over a copy and skip any that were removed.
3828 auto CallerEdges = Clone->CallerEdges;
3829 for (auto CE : CallerEdges) {
3830 // Skip any that have been removed on an earlier iteration.
3831 if (CE->isRemoved()) {
3832 assert(!is_contained(Clone->CallerEdges, CE));
3833 continue;
3834 }
3835 assert(CE);
3836 // Ignore any caller that does not have a recorded callsite Call.
3837 if (!CE->Caller->hasCall())
3838 continue;
3839
3840 if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
3841 // We subsequently fall through to later handling that
3842 // will perform any additional cloning required for
3843 // callers that were calling other function clones.
3844 CallsiteToCalleeFuncCloneMap[CE->Caller] !=
3845 PreviousAssignedFuncClone)
3846 continue;
3847
3848 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
3849
3850 // If we are cloning a function that was already assigned to some
3851 // callers, then essentially we are creating new callsite clones
3852 // of the other callsites in that function that are reached by those
3853 // callers. Clone the other callees of the current callsite's caller
3854 // that were already assigned to PreviousAssignedFuncClone
3855 // accordingly. This is important since we subsequently update the
3856 // calls from the nodes in the graph and their assignments to callee
3857 // functions recorded in CallsiteToCalleeFuncCloneMap.
3858 // The none type edge removal may remove some of this caller's
3859 // callee edges, if it is reached via another of its callees.
3860 // Iterate over a copy and skip any that were removed.
3861 auto CalleeEdges = CE->Caller->CalleeEdges;
3862 for (auto CalleeEdge : CalleeEdges) {
3863 // Skip any that have been removed on an earlier iteration when
3864 // cleaning up newly None type callee edges.
3865 if (CalleeEdge->isRemoved()) {
3866 assert(!is_contained(CE->Caller->CalleeEdges, CalleeEdge));
3867 continue;
3868 }
3869 assert(CalleeEdge);
3870 ContextNode *Callee = CalleeEdge->Callee;
3871 // Skip the current callsite, we are looking for other
3872 // callsites Caller calls, as well as any that does not have a
3873 // recorded callsite Call.
3874 if (Callee == Clone || !Callee->hasCall())
3875 continue;
3876 ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge);
3877 removeNoneTypeCalleeEdges(NewClone);
3878 // Moving the edge may have resulted in some none type
3879 // callee edges on the original Callee.
3880 removeNoneTypeCalleeEdges(Callee);
3881 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
3882 // If the Callee node was already assigned to call a specific
3883 // function version, make sure its new clone is assigned to call
3884 // that same function clone.
3885 if (CallsiteToCalleeFuncCloneMap.count(Callee))
3886 RecordCalleeFuncOfCallsite(
3887 NewClone, CallsiteToCalleeFuncCloneMap[Callee]);
3888 // Update NewClone with the new Call clone of this callsite's Call
3889 // created for the new function clone created earlier.
3890 // Recall that we have already ensured when building the graph
3891 // that each caller can only call callsites within the same
3892 // function, so we are guaranteed that Callee Call is in the
3893 // current OrigFunc.
3894 // CallMap is set up as indexed by original Call at clone 0.
3895 CallInfo OrigCall(Callee->getOrigNode()->Call);
3896 OrigCall.setCloneNo(0);
3897 std::map<CallInfo, CallInfo> &CallMap =
3898 FuncClonesToCallMap[NewFuncClone];
3899 assert(CallMap.count(OrigCall));
3900 CallInfo NewCall(CallMap[OrigCall]);
3901 assert(NewCall);
3902 NewClone->setCall(NewCall);
3903 // Need to do the same for all matching calls.
3904 for (auto &MatchingCall : NewClone->MatchingCalls) {
3905 CallInfo OrigMatchingCall(MatchingCall);
3906 OrigMatchingCall.setCloneNo(0);
3907 assert(CallMap.count(OrigMatchingCall));
3908 CallInfo NewCall(CallMap[OrigMatchingCall]);
3909 assert(NewCall);
3910 // Updates the call in the list.
3911 MatchingCall = NewCall;
3912 }
3913 }
3914 }
3915 // Fall through to handling below to perform the recording of the
3916 // function for this callsite clone. This enables handling of cases
3917 // where the callers were assigned to different clones of a function.
3918 }
3919
3920 // See if we can use existing function clone. Walk through
3921 // all caller edges to see if any have already been assigned to
3922 // a clone of this callsite's function. If we can use it, do so. If not,
3923 // because that function clone is already assigned to a different clone
3924 // of this callsite, then we need to clone again.
3925 // Basically, this checking is needed to handle the case where different
3926 // caller functions/callsites may need versions of this function
3927 // containing different mixes of callsite clones across the different
3928 // callsites within the function. If that happens, we need to create
3929 // additional function clones to handle the various combinations.
3930 //
3931 // Keep track of any new clones of this callsite created by the
3932 // following loop, as well as any existing clone that we decided to
3933 // assign this clone to.
3934 std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
3935 FuncInfo FuncCloneAssignedToCurCallsiteClone;
3936 // Iterate over a copy of Clone's caller edges, since we may need to
3937 // remove edges in the moveEdgeTo* methods, and this simplifies the
3938 // handling and makes it less error-prone.
3939 auto CloneCallerEdges = Clone->CallerEdges;
3940 for (auto &Edge : CloneCallerEdges) {
3941 // Ignore any caller that does not have a recorded callsite Call.
3942 if (!Edge->Caller->hasCall())
3943 continue;
3944 // If this caller already assigned to call a version of OrigFunc, need
3945 // to ensure we can assign this callsite clone to that function clone.
3946 if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
3947 FuncInfo FuncCloneCalledByCaller =
3948 CallsiteToCalleeFuncCloneMap[Edge->Caller];
3949 // First we need to confirm that this function clone is available
3950 // for use by this callsite node clone.
3951 //
3952 // While FuncCloneToCurNodeCloneMap is built only for this Node and
3953 // its callsite clones, one of those callsite clones X could have
3954 // been assigned to the same function clone called by Edge's caller
3955 // - if Edge's caller calls another callsite within Node's original
3956 // function, and that callsite has another caller reaching clone X.
3957 // We need to clone Node again in this case.
3958 if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
3959 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
3960 Clone) ||
3961 // Detect when we have multiple callers of this callsite that
3962 // have already been assigned to specific, and different, clones
3963 // of OrigFunc (due to other unrelated callsites in Func they
3964 // reach via call contexts). Is this Clone of callsite Node
3965 // assigned to a different clone of OrigFunc? If so, clone Node
3966 // again.
3967 (FuncCloneAssignedToCurCallsiteClone &&
3968 FuncCloneAssignedToCurCallsiteClone !=
3969 FuncCloneCalledByCaller)) {
3970 // We need to use a different newly created callsite clone, in
3971 // order to assign it to another new function clone on a
3972 // subsequent iteration over the Clones array (adjusted below).
3973 // Note we specifically do not reset the
3974 // CallsiteToCalleeFuncCloneMap entry for this caller, so that
3975 // when this new clone is processed later we know which version of
3976 // the function to copy (so that other callsite clones we have
3977 // assigned to that function clone are properly cloned over). See
3978 // comments in the function cloning handling earlier.
3979
3980 // Check if we already have cloned this callsite again while
3981 // walking through caller edges, for a caller calling the same
3982 // function clone. If so, we can move this edge to that new clone
3983 // rather than creating yet another new clone.
3984 if (FuncCloneToNewCallsiteCloneMap.count(
3985 FuncCloneCalledByCaller)) {
3986 ContextNode *NewClone =
3987 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
3988 moveEdgeToExistingCalleeClone(Edge, NewClone);
3989 // Cleanup any none type edges cloned over.
3990 removeNoneTypeCalleeEdges(NewClone);
3991 } else {
3992 // Create a new callsite clone.
3993 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge);
3994 removeNoneTypeCalleeEdges(NewClone);
3995 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
3996 NewClone;
3997 // Add to list of clones and process later.
3998 ClonesWorklist.push_back(NewClone);
3999 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
4000 }
4001 // Moving the caller edge may have resulted in some none type
4002 // callee edges.
4003 removeNoneTypeCalleeEdges(Clone);
4004 // We will handle the newly created callsite clone in a subsequent
4005 // iteration over this Node's Clones.
4006 continue;
4007 }
4008
4009 // Otherwise, we can use the function clone already assigned to this
4010 // caller.
4011 if (!FuncCloneAssignedToCurCallsiteClone) {
4012 FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
4013 // Assign Clone to FuncCloneCalledByCaller
4014 AssignCallsiteCloneToFuncClone(
4015 FuncCloneCalledByCaller, Call, Clone,
4016 AllocationCallToContextNodeMap.count(Call));
4017 } else
4018 // Don't need to do anything - callsite is already calling this
4019 // function clone.
4020 assert(FuncCloneAssignedToCurCallsiteClone ==
4021 FuncCloneCalledByCaller);
4022
4023 } else {
4024 // We have not already assigned this caller to a version of
4025 // OrigFunc. Do the assignment now.
4026
4027 // First check if we have already assigned this callsite clone to a
4028 // clone of OrigFunc for another caller during this iteration over
4029 // its caller edges.
4030 if (!FuncCloneAssignedToCurCallsiteClone) {
4031 // Find first function in FuncClonesToCallMap without an assigned
4032 // clone of this callsite Node. We should always have one
4033 // available at this point due to the earlier cloning when the
4034 // FuncClonesToCallMap size was smaller than the clone number.
4035 for (auto &CF : FuncClonesToCallMap) {
4036 if (!FuncCloneToCurNodeCloneMap.count(CF.first)) {
4037 FuncCloneAssignedToCurCallsiteClone = CF.first;
4038 break;
4039 }
4040 }
4041 assert(FuncCloneAssignedToCurCallsiteClone);
4042 // Assign Clone to FuncCloneAssignedToCurCallsiteClone
4043 AssignCallsiteCloneToFuncClone(
4044 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
4045 AllocationCallToContextNodeMap.count(Call));
4046 } else
4047 assert(FuncCloneToCurNodeCloneMap
4048 [FuncCloneAssignedToCurCallsiteClone] == Clone);
4049 // Update callers to record function version called.
4050 RecordCalleeFuncOfCallsite(Edge->Caller,
4051 FuncCloneAssignedToCurCallsiteClone);
4052 }
4053 }
4054 }
4055 if (VerifyCCG) {
4056 checkNode<DerivedCCG, FuncTy, CallTy>(Node);
4057 for (const auto &PE : Node->CalleeEdges)
4058 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
4059 for (const auto &CE : Node->CallerEdges)
4060 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
4061 for (auto *Clone : Node->Clones) {
4062 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
4063 for (const auto &PE : Clone->CalleeEdges)
4064 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
4065 for (const auto &CE : Clone->CallerEdges)
4066 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
4067 }
4068 }
4069 }
4070 }
4071
4072 uint8_t BothTypes =
4073 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
4074
4075 auto UpdateCalls = [&](ContextNode *Node,
4076 DenseSet<const ContextNode *> &Visited,
4077 auto &&UpdateCalls) {
4078 auto Inserted = Visited.insert(Node);
4079 if (!Inserted.second)
4080 return;
4081
4082 for (auto *Clone : Node->Clones)
4083 UpdateCalls(Clone, Visited, UpdateCalls);
4084
4085 for (auto &Edge : Node->CallerEdges)
4086 UpdateCalls(Edge->Caller, Visited, UpdateCalls);
4087
4088 // Skip if either no call to update, or if we ended up with no context ids
4089 // (we moved all edges onto other clones).
4090 if (!Node->hasCall() || Node->emptyContextIds())
4091 return;
4092
4093 if (Node->IsAllocation) {
4094 auto AT = allocTypeToUse(Node->AllocTypes);
4095 // If the allocation type is ambiguous, and more aggressive hinting
4096 // has been enabled via the MinClonedColdBytePercent flag, see if this
4097 // allocation should be hinted cold anyway because its fraction of cold bytes
4098 // allocated is at least the given threshold.
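// Illustrative example (the percentage and byte counts below are hypothetical):
// with MinClonedColdBytePercent set to 80, an ambiguous allocation whose
// contexts total 1000 bytes, 900 of them from cold contexts, satisfies
// 900 * 100 >= 1000 * 80 in the check below and is therefore hinted cold.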
4099 if (Node->AllocTypes == BothTypes && MinClonedColdBytePercent < 100 &&
4100 !ContextIdToContextSizeInfos.empty()) {
4101 uint64_t TotalCold = 0;
4102 uint64_t Total = 0;
4103 for (auto Id : Node->getContextIds()) {
4104 auto TypeI = ContextIdToAllocationType.find(Id);
4105 assert(TypeI != ContextIdToAllocationType.end());
4106 auto CSI = ContextIdToContextSizeInfos.find(Id);
4107 if (CSI != ContextIdToContextSizeInfos.end()) {
4108 for (auto &Info : CSI->second) {
4109 Total += Info.TotalSize;
4110 if (TypeI->second == AllocationType::Cold)
4111 TotalCold += Info.TotalSize;
4112 }
4113 }
4114 }
4115 if (TotalCold * 100 >= Total * MinClonedColdBytePercent)
4116 AT = AllocationType::Cold;
4117 }
4118 updateAllocationCall(Node->Call, AT);
4119 assert(Node->MatchingCalls.empty());
4120 return;
4121 }
4122
4123 if (!CallsiteToCalleeFuncCloneMap.count(Node))
4124 return;
4125
4126 auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
4127 updateCall(Node->Call, CalleeFunc);
4128 // Update all the matching calls as well.
4129 for (auto &Call : Node->MatchingCalls)
4130 updateCall(Call, CalleeFunc);
4131 };
4132
4133 // Performs DFS traversal starting from allocation nodes to update calls to
4134 // reflect cloning decisions recorded earlier. For regular LTO this will
4135 // update the actual calls in the IR to call the appropriate function clone
4136 // (and add attributes to allocation calls), whereas for ThinLTO the decisions
4137 // are recorded in the summary entries.
4138 DenseSet<const ContextNode *> Visited;
4139 for (auto &Entry : AllocationCallToContextNodeMap)
4140 UpdateCalls(Entry.second, Visited, UpdateCalls);
4141
4142 return Changed;
4143}
4144
4145static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
4146 Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
4147 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
4148 &FuncToAliasMap) {
4149 // The first "clone" is the original copy; we should only call this if we
4150 // needed to create new clones.
4151 assert(NumClones > 1);
4152 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
4153 VMaps.reserve(NumClones - 1);
4154 FunctionsClonedThinBackend++;
4155 for (unsigned I = 1; I < NumClones; I++) {
4156 VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
4157 auto *NewF = CloneFunction(&F, *VMaps.back());
4158 FunctionClonesThinBackend++;
4159 // Strip memprof and callsite metadata from clone as they are no longer
4160 // needed.
4161 for (auto &BB : *NewF) {
4162 for (auto &Inst : BB) {
4163 Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
4164 Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
4165 }
4166 }
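// Each clone keeps the original body but gets a distinct, predictable name
// from getMemProfFuncName so that summary-driven callsite updates elsewhere
// can reference it (e.g. clone 1 of "foo" becomes "foo.memprof.1", using
// MemProfCloneSuffix; the name here is only an example).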
4167 std::string Name = getMemProfFuncName(F.getName(), I);
4168 auto *PrevF = M.getFunction(Name);
4169 if (PrevF) {
4170 // We might have created this when adjusting a callsite in another
4171 // function. It should be a declaration.
4172 assert(PrevF->isDeclaration());
4173 NewF->takeName(PrevF);
4174 PrevF->replaceAllUsesWith(NewF);
4175 PrevF->eraseFromParent();
4176 } else
4177 NewF->setName(Name);
4178 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
4179 << "created clone " << ore::NV("NewFunction", NewF));
4180
4181 // Now handle aliases to this function, and clone those as well.
4182 if (!FuncToAliasMap.count(&F))
4183 continue;
4184 for (auto *A : FuncToAliasMap[&F]) {
4185 std::string Name = getMemProfFuncName(A->getName(), I);
4186 auto *PrevA = M.getNamedAlias(Name);
4187 auto *NewA = GlobalAlias::create(A->getValueType(),
4188 A->getType()->getPointerAddressSpace(),
4189 A->getLinkage(), Name, NewF);
4190 NewA->copyAttributesFrom(A);
4191 if (PrevA) {
4192 // We might have created this when adjusting a callsite in another
4193 // function. It should be a declaration.
4194 assert(PrevA->isDeclaration());
4195 NewA->takeName(PrevA);
4196 PrevA->replaceAllUsesWith(NewA);
4197 PrevA->eraseFromParent();
4198 }
4199 }
4200 }
4201 return VMaps;
4202}
4203
4204// Locate the summary for F. This is complicated by the fact that it might
4205// have been internalized or promoted.
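// For example, a function that was local in its source module may have been
// promoted and renamed during ThinLTO, or internalized so that its GUID can no
// longer be derived from its current name alone; the lookups below therefore
// try the current GUID, the unadjusted name, and the original pre-promotion
// identifier in turn.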
4206static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
4207 const ModuleSummaryIndex *ImportSummary) {
4208 // FIXME: Ideally we would retain the original GUID in some fashion on the
4209 // function (e.g. as metadata), but for now do our best to locate the
4210 // summary without that information.
4211 ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
4212 if (!TheFnVI)
4213 // See if theFn was internalized, by checking index directly with
4214 // original name (this avoids the name adjustment done by getGUID() for
4215 // internal symbols).
4216 TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(F.getName()));
4217 if (TheFnVI)
4218 return TheFnVI;
4219 // Now query with the original name before any promotion was performed.
4220 StringRef OrigName =
4221 ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
4222 std::string OrigId = GlobalValue::getGlobalIdentifier(
4223 OrigName, GlobalValue::InternalLinkage, M.getSourceFileName());
4224 TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(OrigId));
4225 if (TheFnVI)
4226 return TheFnVI;
4227 // Could be a promoted local imported from another module. We need to pass
4228 // down more info here to find the original module id. For now, try with
4229 // the OrigName which might have been stored in the OidGuidMap in the
4230 // index. This would not work if there were same-named locals in multiple
4231 // modules, however.
4232 auto OrigGUID =
4233 ImportSummary->getGUIDFromOriginalID(GlobalValue::getGUID(OrigName));
4234 if (OrigGUID)
4235 TheFnVI = ImportSummary->getValueInfo(OrigGUID);
4236 return TheFnVI;
4237}
4238
4239bool MemProfContextDisambiguation::initializeIndirectCallPromotionInfo(
4240 Module &M) {
4241 ICallAnalysis = std::make_unique<ICallPromotionAnalysis>();
4242 Symtab = std::make_unique<InstrProfSymtab>();
4243 // Don't add canonical names, to avoid adding multiple functions to the symtab
4244 // when they both have the same root name with "." suffixes stripped.
4245 // If we pick the wrong one then this could lead to incorrect ICP and calling
4246 // a memprof clone that we don't actually create (resulting in linker unsats).
4247 // What this means is that the GUID of the function (or its PGOFuncName
4248 // metadata) *must* match that in the VP metadata to allow promotion.
4249 // In practice this should not be a limitation, since local functions should
4250 // have PGOFuncName metadata and global function names shouldn't need any
4251 // special handling (they should not get the ".llvm.*" suffix that the
4252 // canonicalization handling is attempting to strip).
4253 if (Error E = Symtab->create(M, /*InLTO=*/true, /*AddCanonical=*/false)) {
4254 std::string SymtabFailure = toString(std::move(E));
4255 M.getContext().emitError("Failed to create symtab: " + SymtabFailure);
4256 return false;
4257 }
4258 return true;
4259}
4260
4261bool MemProfContextDisambiguation::applyImport(Module &M) {
4262 assert(ImportSummary);
4263 bool Changed = false;
4264
4265 // We also need to clone any aliases that reference cloned functions, because
4266 // the modified callsites may invoke via the alias. Keep track of the aliases
4267 // for each function.
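// For example (names are hypothetical): if "bar" is an alias of "foo" and a
// rewritten callsite ends up referencing clone 1 through the alias name,
// createFunctionClones also emits a "bar.memprof.1" alias pointing at foo's
// clone so that the reference still resolves.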
4268 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
4269 FuncToAliasMap;
4270 for (auto &A : M.aliases()) {
4271 auto *Aliasee = A.getAliaseeObject();
4272 if (auto *F = dyn_cast<Function>(Aliasee))
4273 FuncToAliasMap[F].insert(&A);
4274 }
4275
4276 if (!initializeIndirectCallPromotionInfo(M))
4277 return false;
4278
4279 for (auto &F : M) {
4280 if (F.isDeclaration() || isMemProfClone(F))
4281 continue;
4282
4283 OptimizationRemarkEmitter ORE(&F);
4284
4285 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
4286 bool ClonesCreated = false;
4287 unsigned NumClonesCreated = 0;
4288 auto CloneFuncIfNeeded = [&](unsigned NumClones) {
4289 // We should at least have version 0 which is the original copy.
4290 assert(NumClones > 0);
4291 // If only one copy needed use original.
4292 if (NumClones == 1)
4293 return;
4294 // If we already performed cloning of this function, confirm that the
4295 // requested number of clones matches (the thin link should ensure the
4296 // number of clones for each constituent callsite is consistent within
4297 // each function), before returning.
4298 if (ClonesCreated) {
4299 assert(NumClonesCreated == NumClones);
4300 return;
4301 }
4302 VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap);
4303 // The first "clone" is the original copy, which doesn't have a VMap.
4304 assert(VMaps.size() == NumClones - 1);
4305 Changed = true;
4306 ClonesCreated = true;
4307 NumClonesCreated = NumClones;
4308 };
4309
4310 auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
4311 Function *CalledFunction) {
4312 // Perform cloning if not yet done.
4313 CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());
4314
4315 assert(!isMemProfClone(*CalledFunction));
4316
4317 // Update the calls per the summary info.
4318 // Save orig name since it gets updated in the first iteration
4319 // below.
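// For example (clone numbers are hypothetical): with StackNode.Clones = {0, 2},
// the call in the original function is left alone, while the corresponding call
// in function clone 1 (found through VMaps[0]) is redirected to callee clone 2.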
4320 auto CalleeOrigName = CalledFunction->getName();
4321 for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
4322 // Do nothing if this version calls the original version of its
4323 // callee.
4324 if (!StackNode.Clones[J])
4325 continue;
4326 auto NewF = M.getOrInsertFunction(
4327 getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
4328 CalledFunction->getFunctionType());
4329 CallBase *CBClone;
4330 // Copy 0 is the original function.
4331 if (!J)
4332 CBClone = CB;
4333 else
4334 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
4335 CBClone->setCalledFunction(NewF);
4336 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
4337 << ore::NV("Call", CBClone) << " in clone "
4338 << ore::NV("Caller", CBClone->getFunction())
4339 << " assigned to call function clone "
4340 << ore::NV("Callee", NewF.getCallee()));
4341 }
4342 };
4343
4344 // Locate the summary for F.
4345 ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
4346 // If not found, this could be an imported local (see comment in
4347 // findValueInfoForFunc). Skip for now as it will be cloned in its original
4348 // module (where it would have been promoted to global scope so should
4349 // satisfy any reference in this module).
4350 if (!TheFnVI)
4351 continue;
4352
4353 auto *GVSummary =
4354 ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
4355 if (!GVSummary) {
4356 // Must have been imported, use the summary which matches the definition
4357 // (might be multiple if this was a linkonce_odr).
4358 auto SrcModuleMD = F.getMetadata("thinlto_src_module");
4359 assert(SrcModuleMD &&
4360 "enable-import-metadata is needed to emit thinlto_src_module");
4361 StringRef SrcModule =
4362 dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString();
4363 for (auto &GVS : TheFnVI.getSummaryList()) {
4364 if (GVS->modulePath() == SrcModule) {
4365 GVSummary = GVS.get();
4366 break;
4367 }
4368 }
4369 assert(GVSummary && GVSummary->modulePath() == SrcModule);
4370 }
4371
4372 // If this was an imported alias skip it as we won't have the function
4373 // summary, and it should be cloned in the original module.
4374 if (isa<AliasSummary>(GVSummary))
4375 continue;
4376
4377 auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
4378
4379 if (FS->allocs().empty() && FS->callsites().empty())
4380 continue;
4381
4382 auto SI = FS->callsites().begin();
4383 auto AI = FS->allocs().begin();
4384
4385 // To handle callsite infos synthesized for tail calls which have missing
4386 // frames in the profiled context, map callee VI to the synthesized callsite
4387 // info.
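// For example, a tail call here may have had no frame in the profiled context
// and thus no callsite metadata to match against; its summary record was
// synthesized with an empty stack id list, so the only way to find it again is
// by the callee's ValueInfo, which is what this map provides.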
4388 DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite;
4389 // Iterate the callsites for this function in reverse, since we place all
4390 // those synthesized for tail calls at the end.
4391 for (auto CallsiteIt = FS->callsites().rbegin();
4392 CallsiteIt != FS->callsites().rend(); CallsiteIt++) {
4393 auto &Callsite = *CallsiteIt;
4394 // Stop as soon as we see a non-synthesized callsite info (see comment
4395 // above loop). All the entries added for discovered tail calls have empty
4396 // stack ids.
4397 if (!Callsite.StackIdIndices.empty())
4398 break;
4399 MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
4400 }
4401
4402 // Keeps track of needed ICP for the function.
4403 SmallVector<ICallAnalysisData> ICallAnalysisInfo;
4404
4405 // Assume for now that the instructions are in the exact same order
4406 // as when the summary was created, but confirm this is correct by
4407 // matching the stack ids.
4408 for (auto &BB : F) {
4409 for (auto &I : BB) {
4410 auto *CB = dyn_cast<CallBase>(&I);
4411 // Same handling as when creating module summary.
4412 if (!mayHaveMemprofSummary(CB))
4413 continue;
4414
4415 auto *CalledValue = CB->getCalledOperand();
4416 auto *CalledFunction = CB->getCalledFunction();
4417 if (CalledValue && !CalledFunction) {
4418 CalledValue = CalledValue->stripPointerCasts();
4419 // Stripping pointer casts can reveal a called function.
4420 CalledFunction = dyn_cast<Function>(CalledValue);
4421 }
4422 // Check if this is an alias to a function. If so, get the
4423 // called aliasee for the checks below.
4424 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
4425 assert(!CalledFunction &&
4426 "Expected null called function in callsite for alias");
4427 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
4428 }
4429
4430 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
4431 I.getMetadata(LLVMContext::MD_callsite));
4432 auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
4433
4434 // Include allocs that were already assigned a memprof function
4435 // attribute in the statistics.
4436 if (CB->getAttributes().hasFnAttr("memprof")) {
4437 assert(!MemProfMD);
4438 CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
4439 ? AllocTypeColdThinBackend++
4440 : AllocTypeNotColdThinBackend++;
4441 OrigAllocsThinBackend++;
4442 AllocVersionsThinBackend++;
4443 if (!MaxAllocVersionsThinBackend)
4444 MaxAllocVersionsThinBackend = 1;
4445 continue;
4446 }
4447
4448 if (MemProfMD) {
4449 // Consult the next alloc node.
4450 assert(AI != FS->allocs().end());
4451 auto &AllocNode = *(AI++);
4452
4453 // Sanity check that the MIB stack ids match between the summary and
4454 // instruction metadata.
4455 auto MIBIter = AllocNode.MIBs.begin();
4456 for (auto &MDOp : MemProfMD->operands()) {
4457 assert(MIBIter != AllocNode.MIBs.end());
4458 LLVM_ATTRIBUTE_UNUSED auto StackIdIndexIter =
4459 MIBIter->StackIdIndices.begin();
4460 auto *MIBMD = cast<const MDNode>(MDOp);
4461 MDNode *StackMDNode = getMIBStackNode(MIBMD);
4462 assert(StackMDNode);
4463 CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
4464 auto ContextIterBegin =
4465 StackContext.beginAfterSharedPrefix(CallsiteContext);
4466 // Skip the checking on the first iteration.
4467 uint64_t LastStackContextId =
4468 (ContextIterBegin != StackContext.end() &&
4469 *ContextIterBegin == 0)
4470 ? 1
4471 : 0;
4472 for (auto ContextIter = ContextIterBegin;
4473 ContextIter != StackContext.end(); ++ContextIter) {
4474 // If this is a direct recursion, simply skip the duplicate
4475 // entries, to be consistent with how the summary ids were
4476 // generated during ModuleSummaryAnalysis.
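// E.g., metadata for a directly recursive context A -> A -> B repeats A's
// stack id, while the summary kept only one copy, so consecutive duplicate
// ids are collapsed here before comparing against the summary's stack id
// indices.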
4477 if (LastStackContextId == *ContextIter)
4478 continue;
4479 LastStackContextId = *ContextIter;
4480 assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
4481 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
4482 *ContextIter);
4483 StackIdIndexIter++;
4484 }
4485 MIBIter++;
4486 }
4487
4488 // Perform cloning if not yet done.
4489 CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size());
4490
4491 OrigAllocsThinBackend++;
4492 AllocVersionsThinBackend += AllocNode.Versions.size();
4493 if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
4494 MaxAllocVersionsThinBackend = AllocNode.Versions.size();
4495
4496 // If there is only one version that means we didn't end up
4497 // considering this function for cloning, and in that case the alloc
4498 // will still be none type or should have gotten the default NotCold.
4499 // Skip that after calling clone helper since that does some sanity
4500 // checks that confirm we haven't decided yet that we need cloning.
4501 // We might have a single version that is cold due to the
4502 // MinClonedColdBytePercent heuristic, so make sure we don't skip in that
4503 // case.
4504 if (AllocNode.Versions.size() == 1 &&
4505 (AllocationType)AllocNode.Versions[0] != AllocationType::Cold) {
4506 assert((AllocationType)AllocNode.Versions[0] ==
4507 AllocationType::NotCold ||
4508 (AllocationType)AllocNode.Versions[0] ==
4509 AllocationType::None);
4510 UnclonableAllocsThinBackend++;
4511 continue;
4512 }
4513
4514 // All versions should have a singular allocation type.
4515 assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
4516 return Type == ((uint8_t)AllocationType::NotCold |
4517 (uint8_t)AllocationType::Cold);
4518 }));
4519
4520 // Update the allocation types per the summary info.
4521 for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
4522 // Ignore any that didn't get an assigned allocation type.
4523 if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
4524 continue;
4525 AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
4526 AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
4527 : AllocTypeNotColdThinBackend++;
4528 std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
4529 auto A = llvm::Attribute::get(F.getContext(), "memprof",
4530 AllocTypeString);
4531 CallBase *CBClone;
4532 // Copy 0 is the original function.
4533 if (!J)
4534 CBClone = CB;
4535 else
4536 // Since VMaps are only created for new clones, we index with
4537 // clone J-1 (J==0 is the original clone and does not have a VMaps
4538 // entry).
4539 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
4540 CBClone->addFnAttr(A);
4541 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
4542 << ore::NV("AllocationCall", CBClone) << " in clone "
4543 << ore::NV("Caller", CBClone->getFunction())
4544 << " marked with memprof allocation attribute "
4545 << ore::NV("Attribute", AllocTypeString));
4546 }
4547 } else if (!CallsiteContext.empty()) {
4548 if (!CalledFunction) {
4549#ifndef NDEBUG
4550 // We should have skipped inline assembly calls.
4551 auto *CI = dyn_cast<CallInst>(CB);
4552 assert(!CI || !CI->isInlineAsm());
4553#endif
4554 // We should have skipped direct calls via a Constant.
4555 assert(CalledValue && !isa<Constant>(CalledValue));
4556
4557 // This is an indirect call, see if we have profile information and
4558 // whether any clones were recorded for the profiled targets (that
4559 // we synthesized CallsiteInfo summary records for when building the
4560 // index).
4561 auto NumClones =
4562 recordICPInfo(CB, FS->callsites(), SI, ICallAnalysisInfo);
4563
4564 // Perform cloning if not yet done. This is done here in case
4565 // we don't need to do ICP, but might need to clone this
4566 // function as it is the target of other cloned calls.
4567 if (NumClones)
4568 CloneFuncIfNeeded(NumClones);
4569 }
4570
4571 else {
4572 // Consult the next callsite node.
4573 assert(SI != FS->callsites().end());
4574 auto &StackNode = *(SI++);
4575
4576#ifndef NDEBUG
4577 // Sanity check that the stack ids match between the summary and
4578 // instruction metadata.
4579 auto StackIdIndexIter = StackNode.StackIdIndices.begin();
4580 for (auto StackId : CallsiteContext) {
4581 assert(StackIdIndexIter != StackNode.StackIdIndices.end());
4582 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
4583 StackId);
4584 StackIdIndexIter++;
4585 }
4586#endif
4587
4588 CloneCallsite(StackNode, CB, CalledFunction);
4589 }
4590 } else if (CB->isTailCall() && CalledFunction) {
4591 // Locate the synthesized callsite info for the callee VI, if any was
4592 // created, and use that for cloning.
4593 ValueInfo CalleeVI =
4594 findValueInfoForFunc(*CalledFunction, M, ImportSummary);
4595 if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
4596 auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
4597 assert(Callsite != MapTailCallCalleeVIToCallsite.end());
4598 CloneCallsite(Callsite->second, CB, CalledFunction);
4599 }
4600 }
4601 }
4602 }
4603
4604 // Now do any promotion required for cloning.
4605 performICP(M, FS->callsites(), VMaps, ICallAnalysisInfo, ORE);
4606 }
4607
4608 // We skip some of the functions and instructions above, so remove all the
4609 // metadata in a single sweep here.
4610 for (auto &F : M) {
4611 // We can skip memprof clones because createFunctionClones already strips
4612 // the metadata from the newly created clones.
4613 if (F.isDeclaration() || isMemProfClone(F))
4614 continue;
4615 for (auto &BB : F) {
4616 for (auto &I : BB) {
4617 if (!isa<CallBase>(I))
4618 continue;
4619 I.setMetadata(LLVMContext::MD_memprof, nullptr);
4620 I.setMetadata(LLVMContext::MD_callsite, nullptr);
4621 }
4622 }
4623 }
4624
4625 return Changed;
4626}
4627
4628unsigned MemProfContextDisambiguation::recordICPInfo(
4629 CallBase *CB, ArrayRef<CallsiteInfo> AllCallsites,
4630 ArrayRef<CallsiteInfo>::iterator &SI,
4631 SmallVector<ICallAnalysisData> &ICallAnalysisInfo) {
4632 // First see if we have profile information for this indirect call.
4633 uint32_t NumCandidates;
4634 uint64_t TotalCount;
4635 auto CandidateProfileData =
4636 ICallAnalysis->getPromotionCandidatesForInstruction(CB, TotalCount,
4637 NumCandidates);
4638 if (CandidateProfileData.empty())
4639 return 0;
4640
4641 // Iterate through all of the candidate profiled targets along with the
4642 // CallsiteInfo summary records synthesized for them when building the index,
4643 // and see if any are cloned and/or refer to clones.
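// For example, an indirect callsite with two profiled targets has two
// consecutive CallsiteInfo records starting at SI; both are consumed here, and
// ICP is only queued if at least one of them names a non-zero clone.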
4644 bool ICPNeeded = false;
4645 unsigned NumClones = 0;
4646 size_t CallsiteInfoStartIndex = std::distance(AllCallsites.begin(), SI);
4647 for (const auto &Candidate : CandidateProfileData) {
4648#ifndef NDEBUG
4649 auto CalleeValueInfo =
4650#endif
4651 ImportSummary->getValueInfo(Candidate.Value);
4652 // We might not have a ValueInfo if this is a distributed
4653 // ThinLTO backend and decided not to import that function.
4654 assert(!CalleeValueInfo || SI->Callee == CalleeValueInfo);
4655 assert(SI != AllCallsites.end());
4656 auto &StackNode = *(SI++);
4657 // See if any of the clones of the indirect callsite for this
4658 // profiled target should call a cloned version of the profiled
4659 // target. We only need to do the ICP here if so.
4660 ICPNeeded |= llvm::any_of(StackNode.Clones,
4661 [](unsigned CloneNo) { return CloneNo != 0; });
4662 // Every callsite in the same function should have been cloned the same
4663 // number of times.
4664 assert(!NumClones || NumClones == StackNode.Clones.size());
4665 NumClones = StackNode.Clones.size();
4666 }
4667 if (!ICPNeeded)
4668 return NumClones;
4669 // Save information for ICP, which is performed later to avoid messing up the
4670 // current function traversal.
4671 ICallAnalysisInfo.push_back({CB, CandidateProfileData.vec(), NumCandidates,
4672 TotalCount, CallsiteInfoStartIndex});
4673 return NumClones;
4674}
4675
4676void MemProfContextDisambiguation::performICP(
4677 Module &M, ArrayRef<CallsiteInfo> AllCallsites,
4678 ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
4679 ArrayRef<ICallAnalysisData> ICallAnalysisInfo,
4680 OptimizationRemarkEmitter &ORE) {
4681 // Now do any promotion required for cloning. Specifically, for each
4682 // recorded ICP candidate (which was only recorded because one clone of that
4683 // candidate should call a cloned target), we perform ICP (speculative
4684 // devirtualization) for each clone of the callsite, and update its callee
4685 // to the appropriate clone. Note that the ICP compares against the original
4686 // version of the target, which is what is in the vtable.
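// For example (clone numbers are hypothetical): for a callsite whose summary
// records Clones = {0, 2} for profiled target T, the copy in the original
// function keeps its promoted direct call to T, while the copy in caller
// clone 1 is promoted with a comparison against the original T and the
// resulting direct call is then retargeted to T's clone 2.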
4687 for (auto &Info : ICallAnalysisInfo) {
4688 auto *CB = Info.CB;
4689 auto CallsiteIndex = Info.CallsiteInfoStartIndex;
4690 auto TotalCount = Info.TotalCount;
4691 unsigned NumPromoted = 0;
4692 unsigned NumClones = 0;
4693
4694 for (auto &Candidate : Info.CandidateProfileData) {
4695 auto &StackNode = AllCallsites[CallsiteIndex++];
4696
4697 // All calls in the same function must have the same number of clones.
4698 assert(!NumClones || NumClones == StackNode.Clones.size());
4699 NumClones = StackNode.Clones.size();
4700
4701 // See if the target is in the module. If it wasn't imported, it is
4702 // possible that this profile could have been collected on a different
4703 // target (or version of the code), and we need to be conservative
4704 // (similar to what is done in the ICP pass).
4705 Function *TargetFunction = Symtab->getFunction(Candidate.Value);
4706 if (TargetFunction == nullptr ||
4707 // Any ThinLTO global dead symbol removal should have already
4708 // occurred, so it should be safe to promote when the target is a
4709 // declaration.
4710 // TODO: Remove internal option once more fully tested.
4711 (MemProfRequireDefinitionForPromotion &&
4712 TargetFunction->isDeclaration())) {
4713 ORE.emit([&]() {
4714 return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", CB)
4715 << "Memprof cannot promote indirect call: target with md5sum "
4716 << ore::NV("target md5sum", Candidate.Value) << " not found";
4717 });
4718 // FIXME: See if we can use the new declaration importing support to
4719 // at least get the declarations imported for this case. Hot indirect
4720 // targets should have been imported normally, however.
4721 continue;
4722 }
4723
4724 // Check if legal to promote
4725 const char *Reason = nullptr;
4726 if (!isLegalToPromote(*CB, TargetFunction, &Reason)) {
4727 ORE.emit([&]() {
4728 return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", CB)
4729 << "Memprof cannot promote indirect call to "
4730 << ore::NV("TargetFunction", TargetFunction)
4731 << " with count of " << ore::NV("TotalCount", TotalCount)
4732 << ": " << Reason;
4733 });
4734 continue;
4735 }
4736
4737 assert(!isMemProfClone(*TargetFunction));
4738
4739 // Handle each call clone, applying ICP so that each clone directly
4740 // calls the specified callee clone, guarded by the appropriate ICP
4741 // check.
4742 CallBase *CBClone = CB;
4743 for (unsigned J = 0; J < NumClones; J++) {
4744 // Copy 0 is the original function.
4745 if (J > 0)
4746 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
4747 // We do the promotion using the original name, so that the comparison
4748 // is against the name in the vtable. Then just below, change the new
4749 // direct call to call the cloned function.
4750 auto &DirectCall =
4751 pgo::promoteIndirectCall(*CBClone, TargetFunction, Candidate.Count,
4752 TotalCount, isSamplePGO, &ORE);
4753 auto *TargetToUse = TargetFunction;
4754 // Call original if this version calls the original version of its
4755 // callee.
4756 if (StackNode.Clones[J]) {
4757 TargetToUse =
4758 cast<Function>(M.getOrInsertFunction(
4759 getMemProfFuncName(TargetFunction->getName(),
4760 StackNode.Clones[J]),
4761 TargetFunction->getFunctionType())
4762 .getCallee());
4763 }
4764 DirectCall.setCalledFunction(TargetToUse);
4765 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
4766 << ore::NV("Call", CBClone) << " in clone "
4767 << ore::NV("Caller", CBClone->getFunction())
4768 << " promoted and assigned to call function clone "
4769 << ore::NV("Callee", TargetToUse));
4770 }
4771
4772 // Update TotalCount (all clones should get same count above)
4773 TotalCount -= Candidate.Count;
4774 NumPromoted++;
4775 }
4776 // Adjust the MD.prof metadata for all clones, now that we have the new
4777 // TotalCount and the number promoted.
4778 CallBase *CBClone = CB;
4779 for (unsigned J = 0; J < NumClones; J++) {
4780 // Copy 0 is the original function.
4781 if (J > 0)
4782 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
4783 // First delete the old one.
4784 CBClone->setMetadata(LLVMContext::MD_prof, nullptr);
4785 // If all promoted, we don't need the MD.prof metadata.
4786 // Otherwise we need update with the un-promoted records back.
4787 if (TotalCount != 0)
4788 annotateValueSite(
4789 M, *CBClone, ArrayRef(Info.CandidateProfileData).slice(NumPromoted),
4790 TotalCount, IPVK_IndirectCallTarget, Info.NumCandidates);
4791 }
4792 }
4793}
4794
4795template <typename DerivedCCG, typename FuncTy, typename CallTy>
4796bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
4797 if (DumpCCG) {
4798 dbgs() << "CCG before cloning:\n";
4799 dbgs() << *this;
4800 }
4801 if (ExportToDot)
4802 exportToDot("postbuild");
4803
4804 if (VerifyCCG) {
4805 check();
4806 }
4807
4808 identifyClones();
4809
4810 if (VerifyCCG) {
4811 check();
4812 }
4813
4814 if (DumpCCG) {
4815 dbgs() << "CCG after cloning:\n";
4816 dbgs() << *this;
4817 }
4818 if (ExportToDot)
4819 exportToDot("cloned");
4820
4821 bool Changed = assignFunctions();
4822
4823 if (DumpCCG) {
4824 dbgs() << "CCG after assigning function clones:\n";
4825 dbgs() << *this;
4826 }
4827 if (ExportToDot)
4828 exportToDot("clonefuncassign");
4829
4831 printTotalSizes(errs());
4832
4833 return Changed;
4834}
4835
4836bool MemProfContextDisambiguation::processModule(
4837 Module &M,
4838 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
4839
4840 // If we have an import summary, then the cloning decisions were made during
4841 // the thin link on the index. Apply them and return.
4842 if (ImportSummary)
4843 return applyImport(M);
4844
4845 // TODO: If/when other types of memprof cloning are enabled beyond just for
4846 // hot and cold, we will need to change this to individually control the
4847 // AllocationType passed to addStackNodesForMIB during CCG construction.
4848 // Note that we specifically check this after applying imports above, so that
4849 // the option isn't needed to be passed to distributed ThinLTO backend
4850 // clang processes, which won't necessarily have visibility into the linker
4851 // dependences. Instead the information is communicated from the LTO link to
4852 // the backends via the combined summary index.
4853 if (!SupportsHotColdNew)
4854 return false;
4855
4856 ModuleCallsiteContextGraph CCG(M, OREGetter);
4857 return CCG.process();
4858}
4859
4860MemProfContextDisambiguation::MemProfContextDisambiguation(
4861 const ModuleSummaryIndex *Summary, bool isSamplePGO)
4862 : ImportSummary(Summary), isSamplePGO(isSamplePGO) {
4863 if (ImportSummary) {
4864 // The MemProfImportSummary should only be used for testing ThinLTO
4865 // distributed backend handling via opt, in which case we don't have a
4866 // summary from the pass pipeline.
4867 assert(MemProfImportSummary.empty());
4868 return;
4869 }
4870 if (MemProfImportSummary.empty())
4871 return;
4872
4873 auto ReadSummaryFile =
4874 errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
4875 if (!ReadSummaryFile) {
4876 logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
4877 "Error loading file '" + MemProfImportSummary +
4878 "': ");
4879 return;
4880 }
4881 auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
4882 if (!ImportSummaryForTestingOrErr) {
4883 logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
4884 "Error parsing file '" + MemProfImportSummary +
4885 "': ");
4886 return;
4887 }
4888 ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
4889 ImportSummary = ImportSummaryForTesting.get();
4890}
4891
4892PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
4893 ModuleAnalysisManager &AM) {
4894 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
4895 auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
4896 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
4897 };
4898 if (!processModule(M, OREGetter))
4899 return PreservedAnalyses::all();
4900 return PreservedAnalyses::none();
4901}
4902
4903void MemProfContextDisambiguation::run(
4904 ModuleSummaryIndex &Index,
4905 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
4906 isPrevailing) {
4907 // TODO: If/when other types of memprof cloning are enabled beyond just for
4908 // hot and cold, we will need to change this to individually control the
4909 // AllocationType passed to addStackNodesForMIB during CCG construction.
4910 // The index was set from the option, so these should be in sync.
4911 assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
4912 if (!SupportsHotColdNew)
4913 return;
4914
4915 IndexCallsiteContextGraph CCG(Index, isPrevailing);
4916 CCG.process();
4917}