LLVM 17.0.0git
MemProfContextDisambiguation.cpp
1//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements support for context disambiguation of allocation
10// calls for profile guided heap optimization. Specifically, it uses Memprof
11// profiles which indicate context specific allocation behavior (currently
12// distinguishing cold vs hot memory allocations). Cloning is performed to
13// expose the cold allocation call contexts, and the allocation calls are
14// subsequently annotated with an attribute for later transformation.
15//
16// The transformations can be performed either directly on IR (regular LTO), or
17// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
18// Both types of LTO operate on the same base graph representation, which
19// uses CRTP to support either IR or Index formats.
20//
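// For orientation, a simplified sketch of the input this pass consumes is
// shown below (illustrative only; real stack ids are 64-bit frame hashes and
// contexts are typically much deeper). Each MIB in the !memprof metadata pairs
// a call stack with an allocation type, and !callsite holds the allocation
// call's own inline stack:
//
//   %call = call ptr @_Znam(i64 8), !memprof !0, !callsite !5
//   !0 = !{!1, !3}            ; list of MIBs
//   !1 = !{!2, !"cold"}       ; MIB: call stack + allocation type
//   !2 = !{i64 123, i64 456}  ; stack ids, innermost frame first
//   !3 = !{!4, !"notcold"}
//   !4 = !{i64 123, i64 789}
//   !5 = !{i64 123}
//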
21//===----------------------------------------------------------------------===//
22
24#include "llvm/ADT/DenseMap.h"
25#include "llvm/ADT/DenseSet.h"
26#include "llvm/ADT/MapVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
36#include "llvm/IR/Constants.h"
38#include "llvm/IR/Module.h"
40#include "llvm/Pass.h"
45#include "llvm/Transforms/IPO.h"
47#include <sstream>
48#include <vector>
49using namespace llvm;
50using namespace llvm::memprof;
51
52#define DEBUG_TYPE "memprof-context-disambiguation"
53
54STATISTIC(FunctionClonesAnalysis,
55 "Number of function clones created during whole program analysis");
56STATISTIC(FunctionClonesThinBackend,
57 "Number of function clones created during ThinLTO backend");
58STATISTIC(FunctionsClonedThinBackend,
59 "Number of functions that had clones created during ThinLTO backend");
60STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
61 "cloned) during whole program analysis");
62STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
63 "during whole program analysis");
64STATISTIC(AllocTypeNotColdThinBackend,
65 "Number of not cold static allocations (possibly cloned) during "
66 "ThinLTO backend");
67STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
68 "(possibly cloned) during ThinLTO backend");
69STATISTIC(OrigAllocsThinBackend,
70 "Number of original (not cloned) allocations with memprof profiles "
71 "during ThinLTO backend");
72STATISTIC(
73 AllocVersionsThinBackend,
74 "Number of allocation versions (including clones) during ThinLTO backend");
75STATISTIC(MaxAllocVersionsThinBackend,
76 "Maximum number of allocation versions created for an original "
77 "allocation during ThinLTO backend");
78STATISTIC(UnclonableAllocsThinBackend,
79 "Number of unclonable ambigous allocations during ThinLTO backend");
80
82 "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
83 cl::value_desc("filename"),
84 cl::desc("Specify the path prefix of the MemProf dot files."));
85
86static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
88 cl::desc("Export graph to dot files."));
89
90static cl::opt<bool>
91 DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
92 cl::desc("Dump CallingContextGraph to stdout after each stage."));
93
94static cl::opt<bool>
95 VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
96 cl::desc("Perform verification checks on CallingContextGraph."));
97
98static cl::opt<bool>
99 VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
100 cl::desc("Perform frequent verification checks on nodes."));
101
103 "memprof-import-summary",
104 cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
105 cl::Hidden);
106
107// Indicate we are linking with an allocator that supports hot/cold operator
108// new interfaces.
110 "supports-hot-cold-new", cl::init(false), cl::Hidden,
111 cl::desc("Linking with hot/cold operator new interfaces"));
112
113/// CRTP base for graphs built from either IR or ThinLTO summary index.
114///
115/// The graph represents the call contexts in all memprof metadata on allocation
116/// calls, with nodes for the allocations themselves, as well as for the calls
117/// in each context. The graph is initially built from the allocation memprof
118/// metadata (or summary) MIBs. It is then updated to match calls with callsite
119/// metadata onto the nodes, updating it to reflect any inlining performed on
120/// those calls.
121///
122/// Each MIB (representing an allocation's call context with allocation
123/// behavior) is assigned a unique context id during the graph build. The edges
124/// and nodes in the graph are decorated with the context ids they carry. This
125/// is used to correctly update the graph when cloning is performed so that we
126/// can uniquify the context for a single (possibly cloned) allocation.
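///
/// Two concrete graphs derive from this base via CRTP (defined later in this
/// file): ModuleCallsiteContextGraph (FuncTy = Function, CallTy =
/// Instruction *) for regular LTO, and IndexCallsiteContextGraph
/// (FuncTy = FunctionSummary, CallTy = IndexCall) for ThinLTO summaries.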
127template <typename DerivedCCG, typename FuncTy, typename CallTy>
128class CallsiteContextGraph {
129public:
133
134 /// Main entry point to perform analysis and transformations on graph.
135 bool process();
136
137 /// Perform cloning on the graph necessary to uniquely identify the allocation
138 /// behavior of an allocation based on its context.
140
141 /// Assign callsite clones to functions, cloning functions as needed to
142 /// accommodate the combinations of their callsite clones reached by callers.
143 /// For regular LTO this clones functions and callsites in the IR, but for
144 /// ThinLTO the cloning decisions are noted in the summaries and later applied
145 /// in applyImport.
147
148 void dump() const;
149 void print(raw_ostream &OS) const;
150
152 const CallsiteContextGraph &CCG) {
153 CCG.print(OS);
154 return OS;
155 }
156
157 friend struct GraphTraits<
158 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
159 friend struct DOTGraphTraits<
160 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
161
162 void exportToDot(std::string Label) const;
163
164 /// Represents a function clone via FuncTy pointer and clone number pair.
165 struct FuncInfo final
166 : public std::pair<FuncTy *, unsigned /*Clone number*/> {
167 using Base = std::pair<FuncTy *, unsigned>;
168 FuncInfo(const Base &B) : Base(B) {}
169 FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
170 explicit operator bool() const { return this->first != nullptr; }
171 FuncTy *func() const { return this->first; }
172 unsigned cloneNo() const { return this->second; }
173 };
174
175 /// Represents a callsite clone via CallTy and clone number pair.
176 struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
177 using Base = std::pair<CallTy, unsigned>;
178 CallInfo(const Base &B) : Base(B) {}
179 CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
180 : Base(Call, CloneNo) {}
181 explicit operator bool() const { return (bool)this->first; }
182 CallTy call() const { return this->first; }
183 unsigned cloneNo() const { return this->second; }
184 void setCloneNo(unsigned N) { this->second = N; }
185 void print(raw_ostream &OS) const {
186 if (!operator bool()) {
187 assert(!cloneNo());
188 OS << "null Call";
189 return;
190 }
191 call()->print(OS);
192 OS << "\t(clone " << cloneNo() << ")";
193 }
194 void dump() const {
195 print(dbgs());
196 dbgs() << "\n";
197 }
199 Call.print(OS);
200 return OS;
201 }
202 };
203
204 struct ContextEdge;
205
206 /// Node in the Callsite Context Graph
207 struct ContextNode {
208 // Keep this for now since in the IR case where we have an Instruction* it
209 // is not as immediately discoverable. Used for printing richer information
210 // when dumping graph.
212
213 // Keeps track of when the Call was reset to null because there was
214 // recursion.
215 bool Recursive = false;
216
217 // The corresponding allocation or interior call.
219
220 // For alloc nodes this is a unique id assigned when constructed, and for
221 // callsite stack nodes it is the original stack id when the node is
222 // constructed from the memprof MIB metadata on the alloc nodes. Note that
223 // this is only used when matching callsite metadata onto the stack nodes
224 // created when processing the allocation memprof MIBs, and for labeling
225 // nodes in the dot graph. Therefore we don't bother to assign a value for
226 // clones.
228
229 // This will be formed by ORing together the AllocationType enum values
230 // for contexts including this node.
231 uint8_t AllocTypes = 0;
232
233 // Edges to all callees in the profiled call stacks.
234 // TODO: Should this be a map (from Callee node) for more efficient lookup?
235 std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;
236
237 // Edges to all callers in the profiled call stacks.
238 // TODO: Should this be a map (from Caller node) for more efficient lookup?
239 std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
240
241 // The set of IDs for contexts including this node.
243
244 // List of clones of this ContextNode, initially empty.
245 std::vector<ContextNode *> Clones;
246
247 // If a clone, points to the original uncloned node.
249
251
254
255 void addClone(ContextNode *Clone) {
256 if (CloneOf) {
257 CloneOf->Clones.push_back(Clone);
258 Clone->CloneOf = CloneOf;
259 } else {
260 Clones.push_back(Clone);
261 assert(!Clone->CloneOf);
262 Clone->CloneOf = this;
263 }
264 }
265
267 if (!CloneOf)
268 return this;
269 return CloneOf;
270 }
271
273 unsigned int ContextId);
274
277 void eraseCalleeEdge(const ContextEdge *Edge);
278 void eraseCallerEdge(const ContextEdge *Edge);
279
280 void setCall(CallInfo C) { Call = C; }
281
282 bool hasCall() const { return (bool)Call.call(); }
283
284 void printCall(raw_ostream &OS) const { Call.print(OS); }
285
286 // True if this node was effectively removed from the graph, in which case
287 // its context id set, caller edges, and callee edges should all be empty.
288 bool isRemoved() const {
289 assert(ContextIds.empty() ==
290 (CalleeEdges.empty() && CallerEdges.empty()));
291 return ContextIds.empty();
292 }
293
294 void dump() const;
295 void print(raw_ostream &OS) const;
296
298 Node.print(OS);
299 return OS;
300 }
301 };
302
303 /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
304 /// callee.
305 struct ContextEdge {
306 ContextNode *Callee;
307 ContextNode *Caller;
308
309 // This will be formed by ORing together the AllocationType enum values
310 // for contexts including this edge.
311 uint8_t AllocTypes = 0;
312
313 // The set of IDs for contexts including this edge.
315
320
322
323 void dump() const;
324 void print(raw_ostream &OS) const;
325
327 Edge.print(OS);
328 return OS;
329 }
330 };
331
332 /// Helper to remove callee edges that have allocation type None (due to not
333 /// carrying any context ids) after transformations.
335
336protected:
337 /// Get a list of nodes corresponding to the stack ids in the given callsite
338 /// context.
339 template <class NodeT, class IteratorT>
340 std::vector<uint64_t>
342
343 /// Adds nodes for the given allocation and any stack ids on its memprof MIB
344 /// metadata (or summary).
345 ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);
346
347 /// Adds nodes for the given MIB stack ids.
348 template <class NodeT, class IteratorT>
350 CallStack<NodeT, IteratorT> &StackContext,
351 CallStack<NodeT, IteratorT> &CallsiteContext,
353
354 /// Matches all callsite metadata (or summary) to the nodes created for
355 /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
356 /// inlining performed on those callsite instructions.
358
359 /// Update graph to conservatively handle any callsite stack nodes that target
360 /// multiple different callee target functions.
362
363 /// Save lists of calls with MemProf metadata in each function, for faster
364 /// iteration.
365 std::vector<std::pair<FuncTy *, std::vector<CallInfo>>>
367
368 /// Map from callsite node to the enclosing caller function.
369 std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
370
371private:
372 using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;
373
374 using CallContextInfo = std::tuple<CallTy, std::vector<uint64_t>,
375 const FuncTy *, DenseSet<uint32_t>>;
376
377 /// Assigns the given Node to calls at or inlined into the location with
378 /// the Node's stack id, after post order traversing and processing its
379 /// caller nodes. Uses the call information recorded in the given
380 /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
381 /// as needed. Called by updateStackNodes which sets up the given
382 /// StackIdToMatchingCalls map.
383 void assignStackNodesPostOrder(
385 DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls);
386
387 /// Duplicates the given set of context ids, updating the provided
388 /// map from each original id with the newly generated context ids,
389 /// and returning the new duplicated id set.
390 DenseSet<uint32_t> duplicateContextIds(
391 const DenseSet<uint32_t> &StackSequenceContextIds,
392 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
393
394 /// Propagates all duplicated context ids across the graph.
395 void propagateDuplicateContextIds(
396 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
397
398 /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
399 /// else to its callers. Also updates OrigNode's edges to remove any context
400 /// ids moved to the newly created edge.
401 void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
402 bool TowardsCallee);
403
404 /// Get the stack id corresponding to the given Id or Index (for IR this will
405 /// return itself, for a summary index this will return the id recorded in the
406 /// index for that stack id index value).
407 uint64_t getStackId(uint64_t IdOrIndex) const {
408 return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
409 }
410
411 /// Returns true if the given call targets the given function.
412 bool calleeMatchesFunc(CallTy Call, const FuncTy *Func) {
413 return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(Call, Func);
414 }
415
416 /// Get a list of nodes corresponding to the stack ids in the given
417 /// callsite's context.
418 std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
419 return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
420 Call);
421 }
422
423 /// Get the last stack id in the context for callsite.
424 uint64_t getLastStackId(CallTy Call) {
425 return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
426 }
427
428 /// Update the allocation call to record type of allocated memory.
429 void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
430 AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
431 static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
432 }
433
434 /// Update non-allocation call to invoke (possibly cloned) function
435 /// CalleeFunc.
436 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
437 static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
438 }
439
440 /// Clone the given function for the given callsite, recording the mapping of
441 /// all of the function's tracked calls to their new versions in the CallMap.
442 /// Assigns new clones to clone number CloneNo.
443 FuncInfo cloneFunctionForCallsite(
444 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
445 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
446 return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
447 Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
448 }
449
450 /// Gets a label to use in the dot graph for the given call clone in the given
451 /// function.
452 std::string getLabel(const FuncTy *Func, const CallTy Call,
453 unsigned CloneNo) const {
454 return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
455 }
456
457 /// Helpers to find the node corresponding to the given call or stackid.
458 ContextNode *getNodeForInst(const CallInfo &C);
459 ContextNode *getNodeForAlloc(const CallInfo &C);
460 ContextNode *getNodeForStackId(uint64_t StackId);
461
462 /// Removes the node information recorded for the given call.
463 void unsetNodeForInst(const CallInfo &C);
464
465 /// Computes the alloc type corresponding to the given context ids, by
466 /// unioning their recorded alloc types.
467 uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds);
468
469 /// Returns the allocation type of the intersection of the contexts of two
470 /// nodes (based on their provided context id sets), optimized for the case
471 /// when Node1Ids is smaller than Node2Ids.
472 uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
473 const DenseSet<uint32_t> &Node2Ids);
474
475 /// Returns the allocation type of the intersection of the contexts of two
476 /// nodes (based on their provided context id sets).
477 uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
478 const DenseSet<uint32_t> &Node2Ids);
479
480 /// Create a clone of Edge's callee and move Edge to that new callee node,
481 /// performing the necessary context id and allocation type updates.
482 /// If callee's caller edge iterator is supplied, it is updated when removing
483 /// the edge from that list.
485 moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
486 EdgeIter *CallerEdgeI = nullptr);
487
488 /// Change the callee of Edge to existing callee clone NewCallee, performing
489 /// the necessary context id and allocation type updates.
490 /// If callee's caller edge iterator is supplied, it is updated when removing
491 /// the edge from that list.
492 void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
493 ContextNode *NewCallee,
494 EdgeIter *CallerEdgeI = nullptr,
495 bool NewClone = false);
496
497 /// Recursively perform cloning on the graph for the given Node and its
498 /// callers, in order to uniquely identify the allocation behavior of an
499 /// allocation given its context.
502
503 /// Map from each context ID to the AllocationType assigned to that context.
504 std::map<uint32_t, AllocationType> ContextIdToAllocationType;
505
506 /// Identifies the context node created for a stack id when adding the MIB
507 /// contexts to the graph. This is used to locate the context nodes when
508 /// trying to assign the corresponding callsites with those stack ids to these
509 /// nodes.
510 std::map<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;
511
512 /// Maps to track the calls to their corresponding nodes in the graph.
513 MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
514 MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;
515
516 /// Owner of all ContextNode unique_ptrs.
517 std::vector<std::unique_ptr<ContextNode>> NodeOwner;
518
519 /// Perform sanity checks on graph when requested.
520 void check() const;
521
522 /// Keeps track of the last unique context id assigned.
523 unsigned int LastContextId = 0;
524};
525
526template <typename DerivedCCG, typename FuncTy, typename CallTy>
529template <typename DerivedCCG, typename FuncTy, typename CallTy>
532template <typename DerivedCCG, typename FuncTy, typename CallTy>
533using FuncInfo =
535template <typename DerivedCCG, typename FuncTy, typename CallTy>
536using CallInfo =
538
539/// CRTP derived class for graphs built from IR (regular LTO).
540class ModuleCallsiteContextGraph
541 : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
542 Instruction *> {
543public:
545 Module &M,
547
548private:
550 Instruction *>;
551
552 uint64_t getStackId(uint64_t IdOrIndex) const;
553 bool calleeMatchesFunc(Instruction *Call, const Function *Func);
554 uint64_t getLastStackId(Instruction *Call);
555 std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
556 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
557 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
560 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
561 std::map<CallInfo, CallInfo> &CallMap,
562 std::vector<CallInfo> &CallsWithMetadataInFunc,
563 unsigned CloneNo);
564 std::string getLabel(const Function *Func, const Instruction *Call,
565 unsigned CloneNo) const;
566
567 const Module &Mod;
569};
570
571/// Represents a call in the summary index graph, which can either be an
572/// allocation or an interior callsite node in an allocation's context.
573/// Holds a pointer to the corresponding data structure in the index.
574struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
576 IndexCall(std::nullptr_t) : IndexCall() {}
577 IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
578 IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
580
581 IndexCall *operator->() { return this; }
582
584
585 void print(raw_ostream &OS) const {
586 if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(getBase())) {
587 OS << *AI;
588 } else {
589 auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(getBase());
590 assert(CI);
591 OS << *CI;
592 }
593 }
594};
595
596/// CRTP derived class for graphs built from summary index (ThinLTO).
597class IndexCallsiteContextGraph
598 : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
599 IndexCall> {
600public:
604 isPrevailing);
605
606private:
608 IndexCall>;
609
610 uint64_t getStackId(uint64_t IdOrIndex) const;
611 bool calleeMatchesFunc(IndexCall &Call, const FunctionSummary *Func);
612 uint64_t getLastStackId(IndexCall &Call);
613 std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
614 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
615 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
618 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
619 std::map<CallInfo, CallInfo> &CallMap,
620 std::vector<CallInfo> &CallsWithMetadataInFunc,
621 unsigned CloneNo);
622 std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
623 unsigned CloneNo) const;
624
625 // Saves mapping from function summaries containing memprof records back to
626 // their VIs, for use in checking and debugging.
627 std::map<const FunctionSummary *, ValueInfo> FSToVIMap;
628
630};
631
632namespace llvm {
633template <>
637template <>
640 : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
641template <>
643 : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
644} // end namespace llvm
645
646namespace {
647
648struct FieldSeparator {
649 bool Skip = true;
650 const char *Sep;
651
652 FieldSeparator(const char *Sep = ", ") : Sep(Sep) {}
653};
654
655raw_ostream &operator<<(raw_ostream &OS, FieldSeparator &FS) {
656 if (FS.Skip) {
657 FS.Skip = false;
658 return OS;
659 }
660 return OS << FS.Sep;
661}
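
// Usage sketch: streaming the same FieldSeparator between items prints the
// separator before every item except the first, e.g.
//   FieldSeparator FS;
//   OS << FS << "a" << FS << "b" << FS << "c";  // prints "a, b, c"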
662
663// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
664// type we should actually use on the corresponding allocation.
665// If we can't clone a node that has NotCold+Cold alloc type, we will fall
666// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
667// from NotCold.
668AllocationType allocTypeToUse(uint8_t AllocTypes) {
669 assert(AllocTypes != (uint8_t)AllocationType::None);
670 if (AllocTypes ==
671 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
672 return AllocationType::NotCold;
673 else
674 return (AllocationType)AllocTypes;
675}
676
677// Helper to check if the alloc types for all edges recorded in the
678// InAllocTypes vector match the alloc types for all edges in the Edges
679// vector.
680template <typename DerivedCCG, typename FuncTy, typename CallTy>
681bool allocTypesMatch(
682 const std::vector<uint8_t> &InAllocTypes,
683 const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
684 &Edges) {
685 return std::equal(
686 InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(),
687 [](const uint8_t &l,
688 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
689 // Can share if one of the edges is None type - don't
690 // care about the type along that edge as it doesn't
691 // exist for those context ids.
692 if (l == (uint8_t)AllocationType::None ||
693 r->AllocTypes == (uint8_t)AllocationType::None)
694 return true;
695 return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
696 });
697}
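
// For example, an InAllocTypes entry of NotCold|Cold matches an edge whose
// AllocTypes is NotCold, since allocTypeToUse maps both to NotCold, while an
// entry of None on either side matches unconditionally.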
698
699} // end anonymous namespace
700
701template <typename DerivedCCG, typename FuncTy, typename CallTy>
704 const CallInfo &C) {
705 ContextNode *Node = getNodeForAlloc(C);
706 if (Node)
707 return Node;
708
709 auto NonAllocCallNode = NonAllocationCallToContextNodeMap.find(C);
710 if (NonAllocCallNode != NonAllocationCallToContextNodeMap.end()) {
711 return NonAllocCallNode->second;
712 }
713 return nullptr;
714}
715
716template <typename DerivedCCG, typename FuncTy, typename CallTy>
719 const CallInfo &C) {
720 auto AllocCallNode = AllocationCallToContextNodeMap.find(C);
721 if (AllocCallNode != AllocationCallToContextNodeMap.end()) {
722 return AllocCallNode->second;
723 }
724 return nullptr;
725}
726
727template <typename DerivedCCG, typename FuncTy, typename CallTy>
730 uint64_t StackId) {
731 auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
732 if (StackEntryNode != StackEntryIdToContextNodeMap.end())
733 return StackEntryNode->second;
734 return nullptr;
735}
736
737template <typename DerivedCCG, typename FuncTy, typename CallTy>
739 const CallInfo &C) {
740 AllocationCallToContextNodeMap.erase(C) ||
741 NonAllocationCallToContextNodeMap.erase(C);
742 assert(!AllocationCallToContextNodeMap.count(C) &&
743 !NonAllocationCallToContextNodeMap.count(C));
744}
745
746template <typename DerivedCCG, typename FuncTy, typename CallTy>
749 unsigned int ContextId) {
750 for (auto &Edge : CallerEdges) {
751 if (Edge->Caller == Caller) {
752 Edge->AllocTypes |= (uint8_t)AllocType;
753 Edge->getContextIds().insert(ContextId);
754 return;
755 }
756 }
757 std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
758 this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
759 CallerEdges.push_back(Edge);
760 Caller->CalleeEdges.push_back(Edge);
761}
762
763template <typename DerivedCCG, typename FuncTy, typename CallTy>
765 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
766 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
767 auto Edge = *EI;
768 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
769 assert(Edge->ContextIds.empty());
770 Edge->Callee->eraseCallerEdge(Edge.get());
771 EI = Node->CalleeEdges.erase(EI);
772 } else
773 ++EI;
774 }
775}
776
777template <typename DerivedCCG, typename FuncTy, typename CallTy>
781 for (const auto &Edge : CalleeEdges)
782 if (Edge->Callee == Callee)
783 return Edge.get();
784 return nullptr;
785}
786
787template <typename DerivedCCG, typename FuncTy, typename CallTy>
790 findEdgeFromCaller(const ContextNode *Caller) {
791 for (const auto &Edge : CallerEdges)
792 if (Edge->Caller == Caller)
793 return Edge.get();
794 return nullptr;
795}
796
797template <typename DerivedCCG, typename FuncTy, typename CallTy>
799 eraseCalleeEdge(const ContextEdge *Edge) {
800 auto EI =
801 std::find_if(CalleeEdges.begin(), CalleeEdges.end(),
802 [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
803 return CalleeEdge.get() == Edge;
804 });
805 assert(EI != CalleeEdges.end());
806 CalleeEdges.erase(EI);
807}
808
809template <typename DerivedCCG, typename FuncTy, typename CallTy>
811 eraseCallerEdge(const ContextEdge *Edge) {
812 auto EI =
813 std::find_if(CallerEdges.begin(), CallerEdges.end(),
814 [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
815 return CallerEdge.get() == Edge;
816 });
817 assert(EI != CallerEdges.end());
818 CallerEdges.erase(EI);
819}
820
821template <typename DerivedCCG, typename FuncTy, typename CallTy>
823 DenseSet<uint32_t> &ContextIds) {
824 uint8_t BothTypes =
825 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
826 uint8_t AllocType = (uint8_t)AllocationType::None;
827 for (auto Id : ContextIds) {
828 AllocType |= (uint8_t)ContextIdToAllocationType[Id];
829 // Bail early if alloc type reached both, no further refinement.
830 if (AllocType == BothTypes)
831 return AllocType;
832 }
833 return AllocType;
834}
835
836template <typename DerivedCCG, typename FuncTy, typename CallTy>
837uint8_t
839 const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
840 uint8_t BothTypes =
841 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
842 uint8_t AllocType = (uint8_t)AllocationType::None;
843 for (auto Id : Node1Ids) {
844 if (!Node2Ids.count(Id))
845 continue;
846 AllocType |= (uint8_t)ContextIdToAllocationType[Id];
847 // Bail early if alloc type reached both, no further refinement.
848 if (AllocType == BothTypes)
849 return AllocType;
850 }
851 return AllocType;
852}
853
854template <typename DerivedCCG, typename FuncTy, typename CallTy>
856 const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
857 if (Node1Ids.size() < Node2Ids.size())
858 return intersectAllocTypesImpl(Node1Ids, Node2Ids);
859 else
860 return intersectAllocTypesImpl(Node2Ids, Node1Ids);
861}
862
863template <typename DerivedCCG, typename FuncTy, typename CallTy>
866 CallInfo Call, const FuncTy *F) {
867 assert(!getNodeForAlloc(Call));
868 NodeOwner.push_back(
869 std::make_unique<ContextNode>(/*IsAllocation=*/true, Call));
870 ContextNode *AllocNode = NodeOwner.back().get();
871 AllocationCallToContextNodeMap[Call] = AllocNode;
872 NodeToCallingFunc[AllocNode] = F;
874 // Use LastContextId as a unique id for MIB allocation nodes.
874 AllocNode->OrigStackOrAllocId = LastContextId;
875 // Alloc type should be updated as we add in the MIBs. We should assert
876 // afterwards that it is not still None.
877 AllocNode->AllocTypes = (uint8_t)AllocationType::None;
878
879 return AllocNode;
880}
881
882template <typename DerivedCCG, typename FuncTy, typename CallTy>
883template <class NodeT, class IteratorT>
885 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
887 // Treating the hot alloc type as NotCold before the disambiguation for "hot"
888 // is done.
889 if (AllocType == AllocationType::Hot)
890 AllocType = AllocationType::NotCold;
891
892 ContextIdToAllocationType[++LastContextId] = AllocType;
893
894 // Update alloc type and context ids for this MIB.
895 AllocNode->AllocTypes |= (uint8_t)AllocType;
896 AllocNode->ContextIds.insert(LastContextId);
897
898 // Now add or update nodes for each stack id in alloc's context.
899 // Later when processing the stack ids on non-alloc callsites we will adjust
900 // for any inlining in the context.
901 ContextNode *PrevNode = AllocNode;
902 // Look for recursion (direct recursion should have been collapsed by
903 // module summary analysis, here we should just be detecting mutual
904 // recursion). Mark these nodes so we don't try to clone.
905 SmallSet<uint64_t, 8> StackIdSet;
906 // Skip any on the allocation call (inlining).
907 for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
908 ContextIter != StackContext.end(); ++ContextIter) {
909 auto StackId = getStackId(*ContextIter);
910 ContextNode *StackNode = getNodeForStackId(StackId);
911 if (!StackNode) {
912 NodeOwner.push_back(
913 std::make_unique<ContextNode>(/*IsAllocation=*/false));
914 StackNode = NodeOwner.back().get();
915 StackEntryIdToContextNodeMap[StackId] = StackNode;
916 StackNode->OrigStackOrAllocId = StackId;
917 }
918 auto Ins = StackIdSet.insert(StackId);
919 if (!Ins.second)
920 StackNode->Recursive = true;
921 StackNode->ContextIds.insert(LastContextId);
922 StackNode->AllocTypes |= (uint8_t)AllocType;
923 PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
924 PrevNode = StackNode;
925 }
926}
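
// For example, given a MIB whose (pruned) context is the stack <F1, F2, F3>,
// with F1 the frame closest to the allocation, addStackNodesForMIB assigns a
// new context id to the allocation node and to the nodes for F1, F2 and F3,
// and adds or updates caller edges from the allocation node to F1, from F1 to
// F2, and from F2 to F3, each carrying that context id and the MIB's alloc
// type.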
927
928template <typename DerivedCCG, typename FuncTy, typename CallTy>
931 const DenseSet<uint32_t> &StackSequenceContextIds,
932 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
933 DenseSet<uint32_t> NewContextIds;
934 for (auto OldId : StackSequenceContextIds) {
935 NewContextIds.insert(++LastContextId);
936 OldToNewContextIds[OldId].insert(LastContextId);
937 assert(ContextIdToAllocationType.count(OldId));
938 // The new context has the same allocation type as original.
939 ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
940 }
941 return NewContextIds;
942}
943
944template <typename DerivedCCG, typename FuncTy, typename CallTy>
947 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
948 // Build a set of duplicated context ids corresponding to the input id set.
949 auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
950 DenseSet<uint32_t> NewIds;
951 for (auto Id : ContextIds)
952 if (auto NewId = OldToNewContextIds.find(Id);
953 NewId != OldToNewContextIds.end())
954 NewIds.insert(NewId->second.begin(), NewId->second.end());
955 return NewIds;
956 };
957
958 // Recursively update context ids sets along caller edges.
959 auto UpdateCallers = [&](ContextNode *Node,
961 auto &&UpdateCallers) -> void {
962 for (const auto &Edge : Node->CallerEdges) {
963 auto Inserted = Visited.insert(Edge.get());
964 if (!Inserted.second)
965 continue;
966 ContextNode *NextNode = Edge->Caller;
967 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
968 // Only need to recursively iterate to NextNode via this caller edge if
969 // it resulted in any added ids to NextNode.
970 if (!NewIdsToAdd.empty()) {
971 Edge->getContextIds().insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
972 NextNode->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
973 UpdateCallers(NextNode, Visited, UpdateCallers);
974 }
975 }
976 };
977
979 for (auto &Entry : AllocationCallToContextNodeMap) {
980 auto *Node = Entry.second;
981 // Update ids on the allocation nodes before calling the recursive
982 // update along caller edges, since this simplifies the logic during
983 // that traversal.
984 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Node->ContextIds);
985 Node->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
986 UpdateCallers(Node, Visited, UpdateCallers);
987 }
988}
989
990template <typename DerivedCCG, typename FuncTy, typename CallTy>
992 ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee) {
993 // Make a copy of the context ids, since this will be adjusted below as they
994 // are moved.
995 DenseSet<uint32_t> RemainingContextIds = NewNode->ContextIds;
996 auto &OrigEdges =
997 TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
998 // Increment iterator in loop so that we can remove edges as needed.
999 for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
1000 auto Edge = *EI;
1001 // Remove any matching context ids from Edge and return the set that were
1002 // found and removed; these become the new edge's context ids. Also update
1003 // the remaining (not found) ids.
1004 DenseSet<uint32_t> NewEdgeContextIds, NotFoundContextIds;
1005 set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
1006 NotFoundContextIds);
1007 RemainingContextIds.swap(NotFoundContextIds);
1008 // If no matching context ids for this edge, skip it.
1009 if (NewEdgeContextIds.empty()) {
1010 ++EI;
1011 continue;
1012 }
1013 if (TowardsCallee) {
1014 auto NewEdge = std::make_shared<ContextEdge>(
1015 Edge->Callee, NewNode, computeAllocType(NewEdgeContextIds),
1016 NewEdgeContextIds);
1017 NewNode->CalleeEdges.push_back(NewEdge);
1018 NewEdge->Callee->CallerEdges.push_back(NewEdge);
1019 } else {
1020 auto NewEdge = std::make_shared<ContextEdge>(
1021 NewNode, Edge->Caller, computeAllocType(NewEdgeContextIds),
1022 NewEdgeContextIds);
1023 NewNode->CallerEdges.push_back(NewEdge);
1024 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
1025 }
1026 // Remove old edge if context ids empty.
1027 if (Edge->getContextIds().empty()) {
1028 if (TowardsCallee) {
1029 Edge->Callee->eraseCallerEdge(Edge.get());
1030 EI = OrigNode->CalleeEdges.erase(EI);
1031 } else {
1032 Edge->Caller->eraseCalleeEdge(Edge.get());
1033 EI = OrigNode->CallerEdges.erase(EI);
1034 }
1035 continue;
1036 }
1037 ++EI;
1038 }
1039}
1040
1041template <typename DerivedCCG, typename FuncTy, typename CallTy>
1045 DenseMap<uint64_t, std::vector<CallContextInfo>>
1046 &StackIdToMatchingCalls) {
1047 auto Inserted = Visited.insert(Node);
1048 if (!Inserted.second)
1049 return;
1050 // Post order traversal. Iterate over a copy since we may add nodes and
1051 // therefore new callers during the recursive call, invalidating any
1052 // iterator over the original edge vector. We don't need to process these
1053 // new nodes as they were already processed on creation.
1054 auto CallerEdges = Node->CallerEdges;
1055 for (auto &Edge : CallerEdges) {
1056 // Skip any that have been removed during the recursion.
1057 if (!Edge)
1058 continue;
1059 assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls);
1060 }
1061
1062 // If this node's stack id is in the map, update the graph to contain new
1063 // nodes representing any inlining at interior callsites. Note we move the
1064 // associated context ids over to the new nodes.
1065
1066 // Ignore this node if it is for an allocation or we didn't record any
1067 // stack id lists ending at it.
1068 if (Node->IsAllocation ||
1069 !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
1070 return;
1071
1072 auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
1073 // Handle the simple case first. A single call with a single stack id.
1074 // In this case there is no need to create any new context nodes, simply
1075 // assign the context node for stack id to this Call.
1076 if (Calls.size() == 1) {
1077 auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
1078 if (Ids.size() == 1) {
1079 assert(SavedContextIds.empty());
1080 // It should be this Node
1081 assert(Node == getNodeForStackId(Ids[0]));
1082 if (Node->Recursive)
1083 return;
1084 Node->setCall(Call);
1085 NonAllocationCallToContextNodeMap[Call] = Node;
1087 return;
1088 }
1089 }
1090
1091 // Find the node for the last stack id, which should be the same
1092 // across all calls recorded for this id, and is this node's id.
1093 uint64_t LastId = Node->OrigStackOrAllocId;
1094 ContextNode *LastNode = getNodeForStackId(LastId);
1095 // We should only have kept stack ids that had nodes.
1096 assert(LastNode);
1097
1098 for (unsigned I = 0; I < Calls.size(); I++) {
1099 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1100 // Skip any for which we didn't assign any ids, these don't get a node in
1101 // the graph.
1102 if (SavedContextIds.empty())
1103 continue;
1104
1105 assert(LastId == Ids.back());
1106
1107 ContextNode *FirstNode = getNodeForStackId(Ids[0]);
1108 assert(FirstNode);
1109
1110 // Recompute the context ids for this stack id sequence (the
1111 // intersection of the context ids of the corresponding nodes).
1112 // Start with the ids we saved in the map for this call, which could be
1113 // duplicated context ids. We have to recompute as we might have overlap
1114 // between the saved context ids for different last nodes, and
1115 // removed them already during the post order traversal.
1116 set_intersect(SavedContextIds, FirstNode->ContextIds);
1117 ContextNode *PrevNode = nullptr;
1118 for (auto Id : Ids) {
1119 ContextNode *CurNode = getNodeForStackId(Id);
1120 // We should only have kept stack ids that had nodes and weren't
1121 // recursive.
1122 assert(CurNode);
1123 assert(!CurNode->Recursive);
1124 if (!PrevNode) {
1125 PrevNode = CurNode;
1126 continue;
1127 }
1128 auto *Edge = CurNode->findEdgeFromCallee(PrevNode);
1129 if (!Edge) {
1130 SavedContextIds.clear();
1131 break;
1132 }
1133 PrevNode = CurNode;
1134 set_intersect(SavedContextIds, Edge->getContextIds());
1135
1136 // If we now have no context ids for clone, skip this call.
1137 if (SavedContextIds.empty())
1138 break;
1139 }
1140 if (SavedContextIds.empty())
1141 continue;
1142
1143 // Create new context node.
1144 NodeOwner.push_back(
1145 std::make_unique<ContextNode>(/*IsAllocation=*/false, Call));
1146 ContextNode *NewNode = NodeOwner.back().get();
1147 NodeToCallingFunc[NewNode] = Func;
1148 NonAllocationCallToContextNodeMap[Call] = NewNode;
1149 NewNode->ContextIds = SavedContextIds;
1150 NewNode->AllocTypes = computeAllocType(NewNode->ContextIds);
1151
1152 // Connect to callees of innermost stack frame in inlined call chain.
1153 // This updates context ids for FirstNode's callees to reflect those
1154 // moved to NewNode.
1155 connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true);
1156
1157 // Connect to callers of outermost stack frame in inlined call chain.
1158 // This updates context ids for LastNode's callers to reflect those
1159 // moved to NewNode.
1160 connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false);
1161
1162 // Now we need to remove context ids from edges/nodes between First and
1163 // Last Node.
1164 PrevNode = nullptr;
1165 for (auto Id : Ids) {
1166 ContextNode *CurNode = getNodeForStackId(Id);
1167 // We should only have kept stack ids that had nodes.
1168 assert(CurNode);
1169
1170 // Remove the context ids moved to NewNode from CurNode, and the
1171 // edge from the prior node.
1172 set_subtract(CurNode->ContextIds, NewNode->ContextIds);
1173 if (PrevNode) {
1174 auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
1175 assert(PrevEdge);
1176 set_subtract(PrevEdge->getContextIds(), NewNode->ContextIds);
1177 if (PrevEdge->getContextIds().empty()) {
1178 PrevNode->eraseCallerEdge(PrevEdge);
1179 CurNode->eraseCalleeEdge(PrevEdge);
1180 }
1181 }
1182 PrevNode = CurNode;
1183 }
1184 }
1185}
1186
1187template <typename DerivedCCG, typename FuncTy, typename CallTy>
1189 // Map of stack id to all calls with that as the last (outermost caller)
1190 // callsite id that has a context node (some might not due to pruning
1191 // performed during matching of the allocation profile contexts).
1192 // The CallContextInfo contains the Call and a list of its stack ids with
1193 // ContextNodes, the function containing Call, and the set of context ids
1194 // the analysis will eventually identify for use in any new node created
1195 // for that callsite.
1197 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
1198 for (auto &Call : CallsWithMetadata) {
1199 // Ignore allocations, already handled.
1200 if (AllocationCallToContextNodeMap.count(Call))
1201 continue;
1202 auto StackIdsWithContextNodes =
1203 getStackIdsWithContextNodesForCall(Call.call());
1204 // If there were no nodes created for MIBs on allocs (maybe this was in
1205 // the unambiguous part of the MIB stack that was pruned), ignore.
1206 if (StackIdsWithContextNodes.empty())
1207 continue;
1208 // Otherwise, record this Call along with the list of ids for the last
1209 // (outermost caller) stack id with a node.
1210 StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
1211 {Call.call(), StackIdsWithContextNodes, Func, {}});
1212 }
1213 }
1214
1215 // First make a pass through all stack ids that correspond to a call,
1216 // as identified in the above loop. Compute the context ids corresponding to
1217 // each of these calls when they correspond to multiple stack ids due to
1218 // inlining. Perform any duplication of context ids required when
1219 // there is more than one call with the same stack ids. Their (possibly newly
1220 // duplicated) context ids are saved in the StackIdToMatchingCalls map.
1221 DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
1222 for (auto &It : StackIdToMatchingCalls) {
1223 auto &Calls = It.getSecond();
1224 // Skip single calls with a single stack id. These don't need a new node.
1225 if (Calls.size() == 1) {
1226 auto &Ids = std::get<1>(Calls[0]);
1227 if (Ids.size() == 1)
1228 continue;
1229 }
1230 // In order to do the best and maximal matching of inlined calls to context
1231 // node sequences we will sort the vectors of stack ids in descending order
1232 // of length, and within each length, lexicographically by stack id. The
1233 // latter is so that we can specially handle calls that have identical stack
1234 // id sequences (either due to cloning or artificially because of the MIB
1235 // context pruning).
1236 std::stable_sort(Calls.begin(), Calls.end(),
1237 [](const CallContextInfo &A, const CallContextInfo &B) {
1238 auto &IdsA = std::get<1>(A);
1239 auto &IdsB = std::get<1>(B);
1240 return IdsA.size() > IdsB.size() ||
1241 (IdsA.size() == IdsB.size() && IdsA < IdsB);
1242 });
1243
1244 // Find the node for the last stack id, which should be the same
1245 // across all calls recorded for this id, and is the id for this
1246 // entry in the StackIdToMatchingCalls map.
1247 uint64_t LastId = It.getFirst();
1248 ContextNode *LastNode = getNodeForStackId(LastId);
1249 // We should only have kept stack ids that had nodes.
1250 assert(LastNode);
1251
1252 if (LastNode->Recursive)
1253 continue;
1254
1255 // Initialize the context ids with the last node's. We will subsequently
1256 // refine the context ids by computing the intersection along all edges.
1257 DenseSet<uint32_t> LastNodeContextIds = LastNode->ContextIds;
1258 assert(!LastNodeContextIds.empty());
1259
1260 for (unsigned I = 0; I < Calls.size(); I++) {
1261 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1262 assert(SavedContextIds.empty());
1263 assert(LastId == Ids.back());
1264
1265 // First compute the context ids for this stack id sequence (the
1266 // intersection of the context ids of the corresponding nodes).
1267 // Start with the remaining saved ids for the last node.
1268 assert(!LastNodeContextIds.empty());
1269 DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
1270
1271 ContextNode *PrevNode = LastNode;
1272 ContextNode *CurNode = LastNode;
1273 bool Skip = false;
1274
1275 // Iterate backwards through the stack Ids, starting after the last Id
1276 // in the list, which was handled once outside for all Calls.
1277 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1278 auto Id = *IdIter;
1279 CurNode = getNodeForStackId(Id);
1280 // We should only have kept stack ids that had nodes.
1281 assert(CurNode);
1282
1283 if (CurNode->Recursive) {
1284 Skip = true;
1285 break;
1286 }
1287
1288 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1289 // If there is no edge then the nodes belong to different MIB contexts,
1290 // and we should skip this inlined context sequence. For example, this
1291 // particular inlined context may include stack ids A->B, and we may
1292 // indeed have nodes for both A and B, but it is possible that they were
1293 // never profiled in sequence in a single MIB for any allocation (i.e.
1294 // we might have profiled an allocation that involves the callsite A,
1295 // but through a different one of its callee callsites, and we might
1296 // have profiled an allocation that involves callsite B, but reached
1297 // from a different caller callsite).
1298 if (!Edge) {
1299 Skip = true;
1300 break;
1301 }
1302 PrevNode = CurNode;
1303
1304 // Update the context ids, which is the intersection of the ids along
1305 // all edges in the sequence.
1306 set_intersect(StackSequenceContextIds, Edge->getContextIds());
1307
1308 // If we now have no context ids for clone, skip this call.
1309 if (StackSequenceContextIds.empty()) {
1310 Skip = true;
1311 break;
1312 }
1313 }
1314 if (Skip)
1315 continue;
1316
1317 // If some of this call's stack ids did not have corresponding nodes (due
1318 // to pruning), don't include any context ids for contexts that extend
1319 // beyond these nodes. Otherwise we would be matching part of unrelated /
1320 // not fully matching stack contexts. To do this, subtract any context ids
1321 // found in caller nodes of the last node found above.
1322 if (Ids.back() != getLastStackId(Call)) {
1323 for (const auto &PE : CurNode->CallerEdges) {
1324 set_subtract(StackSequenceContextIds, PE->getContextIds());
1325 if (StackSequenceContextIds.empty())
1326 break;
1327 }
1328 // If we now have no context ids for clone, skip this call.
1329 if (StackSequenceContextIds.empty())
1330 continue;
1331 }
1332
1333 // Check if the next set of stack ids is the same (since the Calls vector
1334 // of tuples is sorted by the stack ids we can just look at the next one).
1335 bool DuplicateContextIds = false;
1336 if (I + 1 < Calls.size()) {
1337 auto NextIds = std::get<1>(Calls[I + 1]);
1338 DuplicateContextIds = Ids == NextIds;
1339 }
1340
1341 // If we don't have duplicate context ids, then we can assign all the
1342 // context ids computed for the original node sequence to this call.
1343 // If there are duplicate calls with the same stack ids then we synthesize
1344 // new context ids that are duplicates of the originals. These are
1345 // assigned to SavedContextIds, which is a reference into the map entry
1346 // for this call, allowing us to access these ids later on.
1347 OldToNewContextIds.reserve(OldToNewContextIds.size() +
1348 StackSequenceContextIds.size());
1349 SavedContextIds =
1350 DuplicateContextIds
1351 ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
1352 : StackSequenceContextIds;
1353 assert(!SavedContextIds.empty());
1354
1355 if (!DuplicateContextIds) {
1356 // Update saved last node's context ids to remove those that are
1357 // assigned to other calls, so that it is ready for the next call at
1358 // this stack id.
1359 set_subtract(LastNodeContextIds, StackSequenceContextIds);
1360 if (LastNodeContextIds.empty())
1361 break;
1362 }
1363 }
1364 }
1365
1366 // Propagate the duplicate context ids over the graph.
1367 propagateDuplicateContextIds(OldToNewContextIds);
1368
1369 if (VerifyCCG)
1370 check();
1371
1372 // Now perform a post-order traversal over the graph, starting with the
1373 // allocation nodes, essentially processing nodes from callers to callees.
1374 // For any that contains an id in the map, update the graph to contain new
1375 // nodes representing any inlining at interior callsites. Note we move the
1376 // associated context ids over to the new nodes.
1378 for (auto &Entry : AllocationCallToContextNodeMap)
1379 assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls);
1380}
1381
1382uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
1383 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
1384 Call->getMetadata(LLVMContext::MD_callsite));
1385 return CallsiteContext.back();
1386}
1387
1388uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
1389 assert(isa<CallsiteInfo *>(Call.getBase()));
1391 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
1392 // Need to convert index into stack id.
1393 return Index.getStackIdAtIndex(CallsiteContext.back());
1394}
1395
1396static const std::string MemProfCloneSuffix = ".memprof.";
1397
1398static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
1399 // We use CloneNo == 0 to refer to the original version, which doesn't get
1400 // renamed with a suffix.
1401 if (!CloneNo)
1402 return Base.str();
1403 return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
1404}
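
// For example, getMemProfFuncName("foo", 0) returns "foo", while
// getMemProfFuncName("foo", 2) returns "foo.memprof.2".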
1405
1406std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
1407 const Instruction *Call,
1408 unsigned CloneNo) const {
1409 return (Twine(Call->getFunction()->getName()) + " -> " +
1410 cast<CallBase>(Call)->getCalledFunction()->getName())
1411 .str();
1412}
1413
1414std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
1415 const IndexCall &Call,
1416 unsigned CloneNo) const {
1417 auto VI = FSToVIMap.find(Func);
1418 assert(VI != FSToVIMap.end());
1419 if (isa<AllocInfo *>(Call.getBase()))
1420 return (VI->second.name() + " -> alloc").str();
1421 else {
1422 auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call.getBase());
1423 return (VI->second.name() + " -> " +
1424 getMemProfFuncName(Callsite->Callee.name(),
1425 Callsite->Clones[CloneNo]))
1426 .str();
1427 }
1428}
1429
1430std::vector<uint64_t>
1431ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
1432 Instruction *Call) {
1433 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
1434 Call->getMetadata(LLVMContext::MD_callsite));
1435 return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
1436 CallsiteContext);
1437}
1438
1439std::vector<uint64_t>
1440IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
1441 assert(isa<CallsiteInfo *>(Call.getBase()));
1443 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
1446 CallsiteContext);
1447}
1448
1449template <typename DerivedCCG, typename FuncTy, typename CallTy>
1450template <class NodeT, class IteratorT>
1451std::vector<uint64_t>
1453 CallStack<NodeT, IteratorT> &CallsiteContext) {
1454 std::vector<uint64_t> StackIds;
1455 for (auto IdOrIndex : CallsiteContext) {
1456 auto StackId = getStackId(IdOrIndex);
1457 ContextNode *Node = getNodeForStackId(StackId);
1458 if (!Node)
1459 break;
1460 StackIds.push_back(StackId);
1461 }
1462 return StackIds;
1463}
1464
1467 : Mod(M), OREGetter(OREGetter) {
1468 for (auto &F : M) {
1469 std::vector<CallInfo> CallsWithMetadata;
1470 for (auto &BB : F) {
1471 for (auto &I : BB) {
1472 if (!isa<CallBase>(I))
1473 continue;
1474 if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
1475 CallsWithMetadata.push_back(&I);
1476 auto *AllocNode = addAllocNode(&I, &F);
1477 auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
1478 assert(CallsiteMD);
1479 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
1480 // Add all of the MIBs and their stack nodes.
1481 for (auto &MDOp : MemProfMD->operands()) {
1482 auto *MIBMD = cast<const MDNode>(MDOp);
1483 MDNode *StackNode = getMIBStackNode(MIBMD);
1484 assert(StackNode);
1485 CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
1486 addStackNodesForMIB<MDNode, MDNode::op_iterator>(
1487 AllocNode, StackContext, CallsiteContext,
1488 getMIBAllocType(MIBMD));
1489 }
1490 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
1491 // Memprof and callsite metadata on memory allocations no longer
1492 // needed.
1493 I.setMetadata(LLVMContext::MD_memprof, nullptr);
1494 I.setMetadata(LLVMContext::MD_callsite, nullptr);
1495 }
1496 // For callsite metadata, add to list for this function for later use.
1497 else if (I.getMetadata(LLVMContext::MD_callsite))
1498 CallsWithMetadata.push_back(&I);
1499 }
1500 }
1501 if (!CallsWithMetadata.empty())
1502 FuncToCallsWithMetadata.push_back({&F, CallsWithMetadata});
1503 }
1504
1505 if (DumpCCG) {
1506 dbgs() << "CCG before updating call stack chains:\n";
1507 dbgs() << *this;
1508 }
1509
1510 if (ExportToDot)
1511 exportToDot("prestackupdate");
1512
1513 updateStackNodes();
1514
1515 handleCallsitesWithMultipleTargets();
1516
1517 // Strip off remaining callsite metadata, no longer needed.
1518 for (auto &FuncEntry : FuncToCallsWithMetadata)
1519 for (auto &Call : FuncEntry.second)
1520 Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
1521}
1522
1526 isPrevailing)
1527 : Index(Index) {
1528 for (auto &I : Index) {
1529 auto VI = Index.getValueInfo(I);
1530 for (auto &S : VI.getSummaryList()) {
1531 // We should only add the prevailing nodes. Otherwise we may try to clone
1532 // in a weak copy that won't be linked (and may be different than the
1533 // prevailing version).
1534 // We only keep the memprof summary on the prevailing copy now when
1535 // building the combined index, as a space optimization; however, don't
1536 // rely on this optimization. The linker doesn't resolve local linkage
1537 // values so don't check whether those are prevailing.
1538 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
1539 !isPrevailing(VI.getGUID(), S.get()))
1540 continue;
1541 auto *FS = dyn_cast<FunctionSummary>(S.get());
1542 if (!FS)
1543 continue;
1544 std::vector<CallInfo> CallsWithMetadata;
1545 if (!FS->allocs().empty()) {
1546 for (auto &AN : FS->mutableAllocs()) {
1547 // This can happen because of recursion elimination handling that
1548 // currently exists in ModuleSummaryAnalysis. Skip these for now.
1549 // We still added them to the summary because we need to be able to
1550 // correlate properly in applyImport in the backends.
1551 if (AN.MIBs.empty())
1552 continue;
1553 CallsWithMetadata.push_back({&AN});
1554 auto *AllocNode = addAllocNode({&AN}, FS);
1555 // Pass an empty CallStack to the CallsiteContext (second)
1556 // parameter, since for ThinLTO we already collapsed out the inlined
1557 // stack ids on the allocation call during ModuleSummaryAnalysis.
1559 EmptyContext;
1560 // Now add all of the MIBs and their stack nodes.
1561 for (auto &MIB : AN.MIBs) {
1563 StackContext(&MIB);
1564 addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
1565 AllocNode, StackContext, EmptyContext, MIB.AllocType);
1566 }
1567 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
1568 // Initialize version 0 on the summary alloc node to the current alloc
1569 // type, unless it has both types, in which case make it default, so
1570 // that in the case where we aren't able to clone, the original version
1571 // always ends up with the default allocation behavior.
1572 AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
1573 }
1574 }
1575 // For callsite metadata, add to list for this function for later use.
1576 if (!FS->callsites().empty())
1577 for (auto &SN : FS->mutableCallsites())
1578 CallsWithMetadata.push_back({&SN});
1579
1580 if (!CallsWithMetadata.empty())
1581 FuncToCallsWithMetadata.push_back({FS, CallsWithMetadata});
1582
1583 if (!FS->allocs().empty() || !FS->callsites().empty())
1584 FSToVIMap[FS] = VI;
1585 }
1586 }
1587
1588 if (DumpCCG) {
1589 dbgs() << "CCG before updating call stack chains:\n";
1590 dbgs() << *this;
1591 }
1592
1593 if (ExportToDot)
1594 exportToDot("prestackupdate");
1595
1596 updateStackNodes();
1597
1598 handleCallsitesWithMultipleTargets();
1599}
1600
1601template <typename DerivedCCG, typename FuncTy, typename CallTy>
1602void CallsiteContextGraph<DerivedCCG, FuncTy,
1603 CallTy>::handleCallsitesWithMultipleTargets() {
1604 // Look for and work around callsites that call multiple functions.
1605 // This can happen for indirect calls, which need better handling, and in
1606 // rarer cases (e.g. macro expansion).
1607 // TODO: To fix this for indirect calls we will want to perform speculative
1608 // devirtualization using either the normal PGO info with ICP, or using the
1609 // information in the profiled MemProf contexts. We can do this prior to
1610 // this transformation for regular LTO, and for ThinLTO we can simulate that
1611 // effect in the summary and perform the actual speculative devirtualization
1612 // while cloning in the ThinLTO backend.
1613 for (auto Entry = NonAllocationCallToContextNodeMap.begin();
1614 Entry != NonAllocationCallToContextNodeMap.end();) {
1615 auto *Node = Entry->second;
1616 assert(Node->Clones.empty());
1617 // Check all node callees and see if in the same function.
1618 bool Removed = false;
1619 auto Call = Node->Call.call();
1620 for (auto &Edge : Node->CalleeEdges) {
1621 if (!Edge->Callee->hasCall())
1622 continue;
1623 assert(NodeToCallingFunc.count(Edge->Callee));
1624 // Check if the called function matches that of the callee node.
1625 if (calleeMatchesFunc(Call, NodeToCallingFunc[Edge->Callee]))
1626 continue;
1627 // Work around by setting Node to have a null call, so it gets
1628 // skipped during cloning. Otherwise assignFunctions will assert
1629 // because its data structures are not designed to handle this case.
1630 Entry = NonAllocationCallToContextNodeMap.erase(Entry);
1631 Node->setCall(CallInfo());
1632 Removed = true;
1633 break;
1634 }
1635 if (!Removed)
1636 Entry++;
1637 }
1638}
1639
1640uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
1641 // In the Module (IR) case this is already the Id.
1642 return IdOrIndex;
1643}
1644
1645uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
1646 // In the Index case this is an index into the stack id list in the summary
1647 // index, convert it to an Id.
1648 return Index.getStackIdAtIndex(IdOrIndex);
1649}
1650
1651bool ModuleCallsiteContextGraph::calleeMatchesFunc(Instruction *Call,
1652 const Function *Func) {
1653 auto *CB = dyn_cast<CallBase>(Call);
1654 if (!CB->getCalledOperand())
1655 return false;
1656 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
1657 auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
1658 if (CalleeFunc == Func)
1659 return true;
1660 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
1661 return Alias && Alias->getAliasee() == Func;
1662}
1663
1664bool IndexCallsiteContextGraph::calleeMatchesFunc(IndexCall &Call,
1665 const FunctionSummary *Func) {
1666 ValueInfo Callee =
1667 dyn_cast_if_present<CallsiteInfo *>(Call.getBase())->Callee;
1668 // If there is no summary list then this is a call to an externally defined
1669 // symbol.
1670 AliasSummary *Alias =
1671 Callee.getSummaryList().empty()
1672 ? nullptr
1673 : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
1674 assert(FSToVIMap.count(Func));
1675 return Callee == FSToVIMap[Func] ||
1676 // If callee is an alias, check the aliasee, since only function
1677 // summary base objects will contain the stack node summaries and thus
1678 // get a context node.
1679 (Alias && Alias->getAliaseeVI() == FSToVIMap[Func]);
1680}
1681
1682static std::string getAllocTypeString(uint8_t AllocTypes) {
1683 if (!AllocTypes)
1684 return "None";
1685 std::string Str;
1686 if (AllocTypes & (uint8_t)AllocationType::NotCold)
1687 Str += "NotCold";
1688 if (AllocTypes & (uint8_t)AllocationType::Cold)
1689 Str += "Cold";
1690 return Str;
1691}
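// For example (illustrative only), NotCold and Cold are distinct bits, so a
// node or edge carrying contexts of both kinds prints as the concatenation of
// the two names:
//
//   uint8_t Both = (uint8_t)AllocationType::NotCold |
//                  (uint8_t)AllocationType::Cold;
//   getAllocTypeString(Both); // "NotColdCold"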
1692
1693template <typename DerivedCCG, typename FuncTy, typename CallTy>
1694 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
1695 const {
1696 print(dbgs());
1697 dbgs() << "\n";
1698}
1699
1700template <typename DerivedCCG, typename FuncTy, typename CallTy>
1701 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
1702 raw_ostream &OS) const {
1703 OS << "Node " << this << "\n";
1704 OS << "\t";
1705 printCall(OS);
1706 if (Recursive)
1707 OS << " (recursive)";
1708 OS << "\n";
1709 OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
1710 OS << "\tContextIds:";
1711 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
1712 std::sort(SortedIds.begin(), SortedIds.end());
1713 for (auto Id : SortedIds)
1714 OS << " " << Id;
1715 OS << "\n";
1716 OS << "\tCalleeEdges:\n";
1717 for (auto &Edge : CalleeEdges)
1718 OS << "\t\t" << *Edge << "\n";
1719 OS << "\tCallerEdges:\n";
1720 for (auto &Edge : CallerEdges)
1721 OS << "\t\t" << *Edge << "\n";
1722 if (!Clones.empty()) {
1723 OS << "\tClones: ";
1724 FieldSeparator FS;
1725 for (auto *Clone : Clones)
1726 OS << FS << Clone;
1727 OS << "\n";
1728 } else if (CloneOf) {
1729 OS << "\tClone of " << CloneOf << "\n";
1730 }
1731}
1732
1733template <typename DerivedCCG, typename FuncTy, typename CallTy>
1734 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
1735 const {
1736 print(dbgs());
1737 dbgs() << "\n";
1738}
1739
1740template <typename DerivedCCG, typename FuncTy, typename CallTy>
1741 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
1742 raw_ostream &OS) const {
1743 OS << "Edge from Callee " << Callee << " to Caller: " << Caller
1744 << " AllocTypes: " << getAllocTypeString(AllocTypes);
1745 OS << " ContextIds:";
1746 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
1747 std::sort(SortedIds.begin(), SortedIds.end());
1748 for (auto Id : SortedIds)
1749 OS << " " << Id;
1750}
1751
1752template <typename DerivedCCG, typename FuncTy, typename CallTy>
1753 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
1754 print(dbgs());
1755}
1756
1757template <typename DerivedCCG, typename FuncTy, typename CallTy>
1758 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
1759 raw_ostream &OS) const {
1760 OS << "Callsite Context Graph:\n";
1761 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
1762 for (const auto Node : nodes<GraphType>(this)) {
1763 if (Node->isRemoved())
1764 continue;
1765 Node->print(OS);
1766 OS << "\n";
1767 }
1768}
1769
1770template <typename DerivedCCG, typename FuncTy, typename CallTy>
1771static void checkEdge(
1772 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
1773 // Confirm that alloc type is not None and that we have at least one context
1774 // id.
1775 assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
1776 assert(!Edge->ContextIds.empty());
1777}
1778
1779template <typename DerivedCCG, typename FuncTy, typename CallTy>
1780 static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
1781 bool CheckEdges = true) {
1782 if (Node->isRemoved())
1783 return;
1784 // Node's context ids should be the union of both its callee and caller edge
1785 // context ids.
1786 if (Node->CallerEdges.size()) {
1787 auto EI = Node->CallerEdges.begin();
1788 auto &FirstEdge = *EI;
1789 EI++;
1790 DenseSet<uint32_t> CallerEdgeContextIds(FirstEdge->ContextIds);
1791 for (; EI != Node->CallerEdges.end(); EI++) {
1792 const auto &Edge = *EI;
1793 if (CheckEdges)
1794 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1795 set_union(CallerEdgeContextIds, Edge->ContextIds);
1796 }
1797 // Node can have more context ids than callers if some contexts terminate at
1798 // node and some are longer.
1799 assert(Node->ContextIds == CallerEdgeContextIds ||
1800 set_is_subset(CallerEdgeContextIds, Node->ContextIds));
1801 }
1802 if (Node->CalleeEdges.size()) {
1803 auto EI = Node->CalleeEdges.begin();
1804 auto &FirstEdge = *EI;
1805 EI++;
1806 DenseSet<uint32_t> CalleeEdgeContextIds(FirstEdge->ContextIds);
1807 for (; EI != Node->CalleeEdges.end(); EI++) {
1808 const auto &Edge = *EI;
1809 if (CheckEdges)
1810 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1811 set_union(CalleeEdgeContextIds, Edge->ContextIds);
1812 }
1813 assert(Node->ContextIds == CalleeEdgeContextIds);
1814 }
1815}
1816
1817template <typename DerivedCCG, typename FuncTy, typename CallTy>
1818 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
1819 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
1820 for (const auto Node : nodes<GraphType>(this)) {
1821 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
1822 for (auto &Edge : Node->CallerEdges)
1823 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1824 }
1825}
1826
1827template <typename DerivedCCG, typename FuncTy, typename CallTy>
1828struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
1829 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
1830 using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;
1831
1832 using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
1833 static NodeRef getNode(const NodePtrTy &P) { return P.get(); }
1834
1835 using nodes_iterator =
1836 mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,
1837 decltype(&getNode)>;
1838
1839 static nodes_iterator nodes_begin(GraphType G) {
1840 return nodes_iterator(G->NodeOwner.begin(), &getNode);
1841 }
1842
1843 static nodes_iterator nodes_end(GraphType G) {
1844 return nodes_iterator(G->NodeOwner.end(), &getNode);
1845 }
1846
1847 static NodeRef getEntryNode(GraphType G) {
1848 return G->NodeOwner.begin()->get();
1849 }
1850
1851 using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
1852 static const ContextNode<DerivedCCG, FuncTy, CallTy> *
1853 GetCallee(const EdgePtrTy &P) {
1854 return P->Callee;
1855 }
1856
1857 using ChildIteratorType =
1858 mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
1859 DerivedCCG, FuncTy, CallTy>>>::const_iterator,
1860 decltype(&GetCallee)>;
1861
1862 static ChildIteratorType child_begin(NodeRef N) {
1863 return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
1864 }
1865
1866 static ChildIteratorType child_end(NodeRef N) {
1867 return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
1868 }
1869};
1870
1871template <typename DerivedCCG, typename FuncTy, typename CallTy>
1872struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
1873 : public DefaultDOTGraphTraits {
1874 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
1875
1876 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
1877 using GTraits = GraphTraits<GraphType>;
1878 using NodeRef = typename GTraits::NodeRef;
1879 using ChildIteratorType = typename GTraits::ChildIteratorType;
1880
1881 static std::string getNodeLabel(NodeRef Node, GraphType G) {
1882 std::string LabelString =
1883 (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
1884 Twine(Node->OrigStackOrAllocId))
1885 .str();
1886 LabelString += "\n";
1887 if (Node->hasCall()) {
1888 auto Func = G->NodeToCallingFunc.find(Node);
1889 assert(Func != G->NodeToCallingFunc.end());
1890 LabelString +=
1891 G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
1892 } else {
1893 LabelString += "null call";
1894 if (Node->Recursive)
1895 LabelString += " (recursive)";
1896 else
1897 LabelString += " (external)";
1898 }
1899 return LabelString;
1900 }
1901
1902 static std::string getNodeAttributes(NodeRef Node, GraphType) {
1903 std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
1904 getContextIds(Node->ContextIds) + "\"")
1905 .str();
1907 (Twine(",fillcolor=\"") + getColor(Node->AllocTypes) + "\"").str();
1908 AttributeString += ",style=\"filled\"";
1909 if (Node->CloneOf) {
1910 AttributeString += ",color=\"blue\"";
1911 AttributeString += ",style=\"filled,bold,dashed\"";
1912 } else
1913 AttributeString += ",style=\"filled\"";
1914 return AttributeString;
1915 }
1916
1917 static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
1918 GraphType) {
1919 auto &Edge = *(ChildIter.getCurrent());
1920 return (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" +
1921 Twine(",fillcolor=\"") + getColor(Edge->AllocTypes) + "\"")
1922 .str();
1923 }
1924
1925 // Since the NodeOwners list includes nodes that are no longer connected to
1926 // the graph, skip them here.
1927 static bool isNodeHidden(NodeRef Node, GraphType) {
1928 return Node->isRemoved();
1929 }
1930
1931private:
1932 static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
1933 std::string IdString = "ContextIds:";
1934 if (ContextIds.size() < 100) {
1935 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
1936 std::sort(SortedIds.begin(), SortedIds.end());
1937 for (auto Id : SortedIds)
1938 IdString += (" " + Twine(Id)).str();
1939 } else {
1940 IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
1941 }
1942 return IdString;
1943 }
1944
1945 static std::string getColor(uint8_t AllocTypes) {
1946 if (AllocTypes == (uint8_t)AllocationType::NotCold)
1947 // Color "brown1" actually looks like a lighter red.
1948 return "brown1";
1949 if (AllocTypes == (uint8_t)AllocationType::Cold)
1950 return "cyan";
1951 if (AllocTypes ==
1952 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
1953 // Lighter purple.
1954 return "mediumorchid1";
1955 return "gray";
1956 }
1957
1958 static std::string getNodeId(NodeRef Node) {
1959 std::stringstream SStream;
1960 SStream << std::hex << "N0x" << (unsigned long long)Node;
1961 std::string Result = SStream.str();
1962 return Result;
1963 }
1964};
1965
1966template <typename DerivedCCG, typename FuncTy, typename CallTy>
1967 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
1968 std::string Label) const {
1969 WriteGraph(this, "", false, Label,
1970 DotFilePathPrefix + "ccg." + Label + ".dot");
1971}
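// Editorial note: the dot files written here are gated by the
// -memprof-export-to-dot and -memprof-dot-file-path-prefix options declared
// at the top of this file. A sketch of an invocation (assuming the pass is
// exposed to opt under its "memprof-context-disambiguation" name):
//
//   opt -passes=memprof-context-disambiguation \
//       -memprof-export-to-dot -memprof-dot-file-path-prefix=/tmp/foo. \
//       input.bc -o out.bc
//
// which, given the naming scheme above, would produce files such as
// /tmp/foo.ccg.prestackupdate.dot.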
1972
1973template <typename DerivedCCG, typename FuncTy, typename CallTy>
1974 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1975 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
1976 const std::shared_ptr<ContextEdge> &Edge, EdgeIter *CallerEdgeI) {
1977 ContextNode *Node = Edge->Callee;
1978 NodeOwner.push_back(
1979 std::make_unique<ContextNode>(Node->IsAllocation, Node->Call));
1980 ContextNode *Clone = NodeOwner.back().get();
1981 Node->addClone(Clone);
1982 assert(NodeToCallingFunc.count(Node));
1983 NodeToCallingFunc[Clone] = NodeToCallingFunc[Node];
1984 moveEdgeToExistingCalleeClone(Edge, Clone, CallerEdgeI, /*NewClone=*/true);
1985 return Clone;
1986}
1987
1988template <typename DerivedCCG, typename FuncTy, typename CallTy>
1989 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1990 moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
1991 ContextNode *NewCallee, EdgeIter *CallerEdgeI,
1992 bool NewClone) {
1993 // NewCallee and Edge's current callee must be clones of the same original
1994 // node (Edge's current callee may be the original node too).
1995 assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
1996 auto &EdgeContextIds = Edge->getContextIds();
1997 ContextNode *OldCallee = Edge->Callee;
1998 if (CallerEdgeI)
1999 *CallerEdgeI = OldCallee->CallerEdges.erase(*CallerEdgeI);
2000 else
2001 OldCallee->eraseCallerEdge(Edge.get());
2002 Edge->Callee = NewCallee;
2003 NewCallee->CallerEdges.push_back(Edge);
2004 // Don't need to update Edge's context ids since we are simply reconnecting
2005 // it.
2006 set_subtract(OldCallee->ContextIds, EdgeContextIds);
2007 NewCallee->ContextIds.insert(EdgeContextIds.begin(), EdgeContextIds.end());
2008 NewCallee->AllocTypes |= Edge->AllocTypes;
2009 OldCallee->AllocTypes = computeAllocType(OldCallee->ContextIds);
2010 // OldCallee alloc type should be None iff its context id set is now empty.
2011 assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
2012 OldCallee->ContextIds.empty());
2013 // Now walk the old callee node's callee edges and move Edge's context ids
2014 // over to the corresponding edge into the clone (which is created here if
2015 // this is a newly created clone).
2016 for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
2017 // The context ids moving to the new callee are the subset of this edge's
2018 // context ids and the context ids on the caller edge being moved.
2019 DenseSet<uint32_t> EdgeContextIdsToMove =
2020 set_intersection(OldCalleeEdge->getContextIds(), EdgeContextIds);
2021 set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
2022 OldCalleeEdge->AllocTypes =
2023 computeAllocType(OldCalleeEdge->getContextIds());
2024 if (!NewClone) {
2025 // Update context ids / alloc type on corresponding edge to NewCallee.
2026 // There is a chance this may not exist if we are reusing an existing
2027 // clone, specifically during function assignment, where we would have
2028 // removed none type edges after creating the clone. If we can't find
2029 // a corresponding edge there, fall through to the cloning below.
2030 if (auto *NewCalleeEdge =
2031 NewCallee->findEdgeFromCallee(OldCalleeEdge->Callee)) {
2032 NewCalleeEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),
2033 EdgeContextIdsToMove.end());
2034 NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
2035 continue;
2036 }
2037 }
2038 auto NewEdge = std::make_shared<ContextEdge>(
2039 OldCalleeEdge->Callee, NewCallee,
2040 computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove);
2041 NewCallee->CalleeEdges.push_back(NewEdge);
2042 NewEdge->Callee->CallerEdges.push_back(NewEdge);
2043 }
2044 if (VerifyCCG) {
2045 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
2046 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
2047 for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
2048 checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
2049 /*CheckEdges=*/false);
2050 for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
2051 checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
2052 /*CheckEdges=*/false);
2053 }
2054}
2055
2056template <typename DerivedCCG, typename FuncTy, typename CallTy>
2057 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
2058 DenseSet<const ContextNode *> Visited;
2059 for (auto &Entry : AllocationCallToContextNodeMap)
2060 identifyClones(Entry.second, Visited);
2061}
2062
2063 // Helper function to check whether an AllocType is cold, notcold, or both.
2064 bool checkColdOrNotCold(uint8_t AllocType) {
2065 return (AllocType == (uint8_t)AllocationType::Cold) ||
2066 (AllocType == (uint8_t)AllocationType::NotCold) ||
2067 (AllocType ==
2068 ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
2069}
2070
2071template <typename DerivedCCG, typename FuncTy, typename CallTy>
2072 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
2073 ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
2074 if (VerifyNodes)
2075 checkNode<DerivedCCG, FuncTy, CallTy>(Node);
2076 assert(!Node->CloneOf);
2077
2078 // If Node has a null call, then either it wasn't found in the module (regular
2079 // LTO) or summary index (ThinLTO), or there were other conditions blocking
2080 // cloning (e.g. recursion, calls multiple targets, etc).
2081 // Do this here so that we don't try to recursively clone callers below, which
2082 // isn't useful at least for this node.
2083 if (!Node->hasCall())
2084 return;
2085
2086#ifndef NDEBUG
2087 auto Insert =
2088#endif
2089 Visited.insert(Node);
2090 // We should not have visited this node yet.
2091 assert(Insert.second);
2092 // The recursive call to identifyClones may delete the current edge from the
2093 // CallerEdges vector. Make a copy and iterate on that, simpler than passing
2094 // in an iterator and having recursive call erase from it. Other edges may
2095 // also get removed during the recursion, which will have null Callee and
2096 // Caller pointers (and are deleted later), so we skip those below.
2097 {
2098 auto CallerEdges = Node->CallerEdges;
2099 for (auto &Edge : CallerEdges) {
2100 // Skip any that have been removed by an earlier recursive call.
2101 if (Edge->Callee == nullptr && Edge->Caller == nullptr) {
2102 assert(!std::count(Node->CallerEdges.begin(), Node->CallerEdges.end(),
2103 Edge));
2104 continue;
2105 }
2106 // Ignore any caller we previously visited via another edge.
2107 if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
2108 identifyClones(Edge->Caller, Visited);
2109 }
2110 }
2111 }
2112
2113 // Check if we reached an unambiguous call or have only a single caller.
2114 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
2115 return;
2116
2117 // We need to clone.
2118
2119 // Try to keep the original version as alloc type NotCold. This will make
2120 // cases with indirect calls or any other situation with an unknown call to
2121 // the original function get the default behavior. We do this by sorting the
2122 // CallerEdges of the Node we will clone by alloc type.
2123 //
2124 // Give NotCold edge the lowest sort priority so those edges are at the end of
2125 // the caller edges vector, and stay on the original version (since the below
2126 // code clones greedily until it finds all remaining edges have the same type
2127 // and leaves the remaining ones on the original Node).
2128 //
2129 // We shouldn't actually have any None type edges, so the sorting priority for
2130 // that is arbitrary, and we assert in that case below.
2131 const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
2132 /*Cold*/ 1,
2133 /*NotColdCold*/ 2};
2134 std::stable_sort(Node->CallerEdges.begin(), Node->CallerEdges.end(),
2135 [&](const std::shared_ptr<ContextEdge> &A,
2136 const std::shared_ptr<ContextEdge> &B) {
2137 assert(checkColdOrNotCold(A->AllocTypes) &&
2138 checkColdOrNotCold(B->AllocTypes));
2139
2140 if (A->AllocTypes == B->AllocTypes)
2141 // Use the first context id for each edge as a
2142 // tie-breaker.
2143 return *A->ContextIds.begin() < *B->ContextIds.begin();
2144 return AllocTypeCloningPriority[A->AllocTypes] <
2145 AllocTypeCloningPriority[B->AllocTypes];
2146 });
2147
2148 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2149
2150 // Iterate until we find no more opportunities for disambiguating the alloc
2151 // types via cloning. In most cases this loop will terminate once the Node
2152 // has a single allocation type, in which case no more cloning is needed.
2153 // We need to be able to remove Edge from CallerEdges, so need to adjust
2154 // iterator inside the loop.
2155 for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
2156 auto CallerEdge = *EI;
2157
2158 // See if cloning the prior caller edge left this node with a single alloc
2159 // type or a single caller. In that case no more cloning of Node is needed.
2160 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
2161 break;
2162
2163 // Compute the node callee edge alloc types corresponding to the context ids
2164 // for this caller edge.
2165 std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
2166 CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
2167 for (auto &CalleeEdge : Node->CalleeEdges)
2168 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
2169 CalleeEdge->getContextIds(), CallerEdge->getContextIds()));
2170
2171 // Don't clone if doing so will not disambiguate any alloc types amongst
2172 // caller edges (including the callee edges that would be cloned).
2173 // Otherwise we will simply move all edges to the clone.
2174 //
2175 // First check if by cloning we will disambiguate the caller allocation
2176 // type from node's allocation type. Query allocTypeToUse so that we don't
2177 // bother cloning to distinguish NotCold+Cold from NotCold. Note that
2178 // neither of these should be None type.
2179 //
2180 // Then check if by cloning node at least one of the callee edges will be
2181 // disambiguated by splitting out different context ids.
2182 assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
2183 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2184 if (allocTypeToUse(CallerEdge->AllocTypes) ==
2185 allocTypeToUse(Node->AllocTypes) &&
2186 allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
2187 CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
2188 ++EI;
2189 continue;
2190 }
2191
2192 // First see if we can use an existing clone. Check each clone and its
2193 // callee edges for matching alloc types.
2194 ContextNode *Clone = nullptr;
2195 for (auto *CurClone : Node->Clones) {
2196 if (allocTypeToUse(CurClone->AllocTypes) !=
2197 allocTypeToUse(CallerEdge->AllocTypes))
2198 continue;
2199
2200 if (!allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
2201 CalleeEdgeAllocTypesForCallerEdge, CurClone->CalleeEdges))
2202 continue;
2203 Clone = CurClone;
2204 break;
2205 }
2206
2207 // The edge iterator is adjusted when we move the CallerEdge to the clone.
2208 if (Clone)
2209 moveEdgeToExistingCalleeClone(CallerEdge, Clone, &EI);
2210 else
2211 Clone = moveEdgeToNewCalleeClone(CallerEdge, &EI);
2212
2213 assert(EI == Node->CallerEdges.end() ||
2214 Node->AllocTypes != (uint8_t)AllocationType::None);
2215 // Sanity check that no alloc types on clone or its edges are None.
2216 assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
2217 assert(llvm::none_of(
2218 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
2219 return E->AllocTypes == (uint8_t)AllocationType::None;
2220 }));
2221 }
2222
2223 // Cloning may have resulted in some cloned callee edges with type None,
2224 // because they aren't carrying any contexts. Remove those edges.
2225 for (auto *Clone : Node->Clones) {
2226 removeNoneTypeCalleeEdges(Clone);
2227 if (VerifyNodes)
2228 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
2229 }
2230 // We should still have some context ids on the original Node.
2231 assert(!Node->ContextIds.empty());
2232
2233 // Remove any callee edges that ended up with alloc type None after creating
2234 // clones and updating callee edges.
2235 removeNoneTypeCalleeEdges(Node);
2236
2237 // Sanity check that no alloc types on node or edges are None.
2238 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2239 assert(llvm::none_of(Node->CalleeEdges,
2240 [&](const std::shared_ptr<ContextEdge> &E) {
2241 return E->AllocTypes == (uint8_t)AllocationType::None;
2242 }));
2243 assert(llvm::none_of(Node->CallerEdges,
2244 [&](const std::shared_ptr<ContextEdge> &E) {
2245 return E->AllocTypes == (uint8_t)AllocationType::None;
2246 }));
2247
2248 if (VerifyNodes)
2249 checkNode<DerivedCCG, FuncTy, CallTy>(Node);
2250}
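// Editorial sketch of the effect of identifyClones on a small graph: given an
// allocation node A with caller edges C1 (contexts {1}, Cold) and C2
// (contexts {2}, NotCold), the sort above places C1 first, the loop moves C1
// onto a new clone A' (which then carries only the Cold context), and C2
// stays on the original A, which becomes unambiguously NotCold. Each copy
// then has a single alloc type for the later attribute/summary update.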
2251
2252void ModuleCallsiteContextGraph::updateAllocationCall(
2253 CallInfo &Call, AllocationType AllocType) {
2254 std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
2255 auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
2256 "memprof", AllocTypeString);
2257 cast<CallBase>(Call.call())->addFnAttr(A);
2258 OREGetter(Call.call()->getFunction())
2259 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
2260 << ore::NV("AllocationCall", Call.call()) << " in clone "
2261 << ore::NV("Caller", Call.call()->getFunction())
2262 << " marked with memprof allocation attribute "
2263 << ore::NV("Attribute", AllocTypeString));
2264}
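// Editorial illustration of the resulting IR (made-up values): after the
// update above, a cold allocation call ends up carrying a "memprof" function
// attribute on the call site, e.g.
//
//   %call = call ptr @malloc(i64 8) #1
//   ...
//   attributes #1 = { "memprof"="cold" }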
2265
2266void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
2267 AllocationType AllocType) {
2268 auto *AI = Call.call().dyn_cast<AllocInfo *>();
2269 assert(AI);
2270 assert(AI->Versions.size() > Call.cloneNo());
2271 AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
2272}
2273
2274void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
2275 FuncInfo CalleeFunc) {
2276 if (CalleeFunc.cloneNo() > 0)
2277 cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
2278 OREGetter(CallerCall.call()->getFunction())
2279 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
2280 << ore::NV("Call", CallerCall.call()) << " in clone "
2281 << ore::NV("Caller", CallerCall.call()->getFunction())
2282 << " assigned to call function clone "
2283 << ore::NV("Callee", CalleeFunc.func()));
2284}
2285
2286void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
2287 FuncInfo CalleeFunc) {
2288 auto *CI = CallerCall.call().dyn_cast<CallsiteInfo *>();
2289 assert(CI &&
2290 "Caller cannot be an allocation which should not have profiled calls");
2291 assert(CI->Clones.size() > CallerCall.cloneNo());
2292 CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();
2293}
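// Editorial note: for ThinLTO the two update functions above record the
// decisions purely in the summary, to be applied later in applyImport. For
// example (illustrative), an AllocInfo whose Versions array ends up as
// {NotCold, Cold} keeps the default behavior in the original function and
// marks its first clone cold, while a CallsiteInfo whose Clones array is
// {0, 2} leaves the original call alone and redirects the copy of the call in
// function clone 1 to callee clone 2.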
2294
2295 CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
2296 Instruction *>::FuncInfo
2297 ModuleCallsiteContextGraph::cloneFunctionForCallsite(
2298 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
2299 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
2300 // Use existing LLVM facilities for cloning and obtaining Call in clone
2301 ValueToValueMapTy VMap;
2302 auto *NewFunc = CloneFunction(Func.func(), VMap);
2303 std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
2304 assert(!Func.func()->getParent()->getFunction(Name));
2305 NewFunc->setName(Name);
2306 for (auto &Inst : CallsWithMetadataInFunc) {
2307 // This map always has the initial version in it.
2308 assert(Inst.cloneNo() == 0);
2309 CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
2310 }
2311 OREGetter(Func.func())
2312 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
2313 << "created clone " << ore::NV("NewFunction", NewFunc));
2314 return {NewFunc, CloneNo};
2315}
2316
2317 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
2318 IndexCall>::FuncInfo
2319 IndexCallsiteContextGraph::cloneFunctionForCallsite(
2320 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
2321 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
2322 // Check how many clones we have of Call (and therefore function).
2323 // The next clone number is the current size of versions array.
2324 // Confirm this matches the CloneNo provided by the caller, which is based on
2325 // the number of function clones we have.
2326 assert(CloneNo ==
2327 (Call.call().is<AllocInfo *>()
2328 ? Call.call().dyn_cast<AllocInfo *>()->Versions.size()
2329 : Call.call().dyn_cast<CallsiteInfo *>()->Clones.size()));
2330 // Walk all the instructions in this function. Create a new version for
2331 // each (by adding an entry to the Versions/Clones summary array), and copy
2332 // over the version being called for the function clone being cloned here.
2333 // Additionally, add an entry to the CallMap for the new function clone,
2334 // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
2335 // to the new call clone.
2336 for (auto &Inst : CallsWithMetadataInFunc) {
2337 // This map always has the initial version in it.
2338 assert(Inst.cloneNo() == 0);
2339 if (auto *AI = Inst.call().dyn_cast<AllocInfo *>()) {
2340 assert(AI->Versions.size() == CloneNo);
2341 // We assign the allocation type later (in updateAllocationCall), just add
2342 // an entry for it here.
2343 AI->Versions.push_back(0);
2344 } else {
2345 auto *CI = Inst.call().dyn_cast<CallsiteInfo *>();
2346 assert(CI && CI->Clones.size() == CloneNo);
2347 // We assign the clone number later (in updateCall), just add an entry for
2348 // it here.
2349 CI->Clones.push_back(0);
2350 }
2351 CallMap[Inst] = {Inst.call(), CloneNo};
2352 }
2353 return {Func.func(), CloneNo};
2354}
2355
2356// This method assigns cloned callsites to functions, cloning the functions as
2357// needed. The assignment is greedy and proceeds roughly as follows:
2358//
2359// For each function Func:
2360// For each call with graph Node having clones:
2361// Initialize ClonesWorklist to Node and its clones
2362// Initialize NodeCloneCount to 0
2363// While ClonesWorklist is not empty:
2364// Clone = pop front ClonesWorklist
2365// NodeCloneCount++
2366// If Func has been cloned less than NodeCloneCount times:
2367// If NodeCloneCount is 1:
2368// Assign Clone to original Func
2369// Continue
2370// Create a new function clone
2371// If other callers not assigned to call a function clone yet:
2372// Assign them to call new function clone
2373// Continue
2374// Assign any other caller calling the cloned version to new clone
2375//
2376// For each caller of Clone:
2377// If caller is assigned to call a specific function clone:
2378// If we cannot assign Clone to that function clone:
2379// Create new callsite Clone NewClone
2380// Add NewClone to ClonesWorklist
2381// Continue
2382// Assign Clone to existing caller's called function clone
2383// Else:
2384// If Clone not already assigned to a function clone:
2385// Assign to first function clone without assignment
2386// Assign caller to selected function clone
2387template <typename DerivedCCG, typename FuncTy, typename CallTy>
2388 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
2389 bool Changed = false;
2390
2391 // Keep track of the assignment of nodes (callsites) to function clones they
2392 // call.
2393 DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
2394
2395 // Update caller node to call function version CalleeFunc, by recording the
2396 // assignment in CallsiteToCalleeFuncCloneMap.
2397 auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
2398 const FuncInfo &CalleeFunc) {
2399 assert(Caller->hasCall());
2400 CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
2401 };
2402
2403 // Walk all functions for which we saw calls with memprof metadata, and handle
2404 // cloning for each of its calls.
2405 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
2406 FuncInfo OrigFunc(Func);
2407 // Map from each clone of OrigFunc to a map of remappings of each call of
2408 // interest (from original uncloned call to the corresponding cloned call in
2409 // that function clone).
2410 std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap;
2411 for (auto &Call : CallsWithMetadata) {
2412 ContextNode *Node = getNodeForInst(Call);
2413 // Skip call if we do not have a node for it (all uses of its stack ids
2414 // were either on inlined chains or pruned from the MIBs), or if we did
2415 // not create any clones for it.
2416 if (!Node || Node->Clones.empty())
2417 continue;
2418 assert(Node->hasCall() &&
2419 "Not having a call should have prevented cloning");
2420
2421 // Track the assignment of function clones to clones of the current
2422 // callsite Node being handled.
2423 std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
2424
2425 // Assign callsite version CallsiteClone to function version FuncClone,
2426 // and also assign (possibly cloned) Call to CallsiteClone.
2427 auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
2428 CallInfo &Call,
2429 ContextNode *CallsiteClone,
2430 bool IsAlloc) {
2431 // Record the clone of callsite node assigned to this function clone.
2432 FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
2433
2434 assert(FuncClonesToCallMap.count(FuncClone));
2435 std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone];
2436 CallInfo CallClone(Call);
2437 if (CallMap.count(Call))
2438 CallClone = CallMap[Call];
2439 CallsiteClone->setCall(CallClone);
2440 };
2441
2442 // Keep track of the clones of callsite Node that need to be assigned to
2443 // function clones. This list may be expanded in the loop body below if we
2444 // find additional cloning is required.
2445 std::deque<ContextNode *> ClonesWorklist;
2446 // Ignore original Node if we moved all of its contexts to clones.
2447 if (!Node->ContextIds.empty())
2448 ClonesWorklist.push_back(Node);
2449 ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(),
2450 Node->Clones.end());
2451
2452 // Now walk through all of the clones of this callsite Node that we need,
2453 // and determine the assignment to a corresponding clone of the current
2454 // function (creating new function clones as needed).
2455 unsigned NodeCloneCount = 0;
2456 while (!ClonesWorklist.empty()) {
2457 ContextNode *Clone = ClonesWorklist.front();
2458 ClonesWorklist.pop_front();
2459 NodeCloneCount++;
2460 if (VerifyNodes)
2461 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
2462
2463 // Need to create a new function clone if we have more callsite clones
2464 // than existing function clones, which would have been assigned to an
2465 // earlier clone in the list (we assign callsite clones to function
2466 // clones greedily).
2467 if (FuncClonesToCallMap.size() < NodeCloneCount) {
2468 // If this is the first callsite copy, assign to original function.
2469 if (NodeCloneCount == 1) {
2470 // Since FuncClonesToCallMap is empty in this case, no clones have
2471 // been created for this function yet, and no callers should have
2472 // been assigned a function clone for this callee node yet.
2473 assert(llvm::none_of(
2474 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
2475 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
2476 }));
2477 // Initialize with empty call map, assign Clone to original function
2478 // and its callers, and skip to the next clone.
2479 FuncClonesToCallMap[OrigFunc] = {};
2480 AssignCallsiteCloneToFuncClone(
2481 OrigFunc, Call, Clone,
2482 AllocationCallToContextNodeMap.count(Call));
2483 for (auto &CE : Clone->CallerEdges) {
2484 // Ignore any caller that does not have a recorded callsite Call.
2485 if (!CE->Caller->hasCall())
2486 continue;
2487 RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
2488 }
2489 continue;
2490 }
2491
2492 // First locate which copy of OrigFunc to clone again. If a caller
2493 // of this callsite clone was already assigned to call a particular
2494 // function clone, we need to redirect all of those callers to the
2495 // new function clone, and update their other callees within this
2496 // function.
2497 FuncInfo PreviousAssignedFuncClone;
2498 auto EI = llvm::find_if(
2499 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
2500 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
2501 });
2502 bool CallerAssignedToCloneOfFunc = false;
2503 if (EI != Clone->CallerEdges.end()) {
2504 const std::shared_ptr<ContextEdge> &Edge = *EI;
2505 PreviousAssignedFuncClone =
2506 CallsiteToCalleeFuncCloneMap[Edge->Caller];
2507 CallerAssignedToCloneOfFunc = true;
2508 }
2509
2510 // Clone function and save it along with the CallInfo map created
2511 // during cloning in the FuncClonesToCallMap.
2512 std::map<CallInfo, CallInfo> NewCallMap;
2513 unsigned CloneNo = FuncClonesToCallMap.size();
2514 assert(CloneNo > 0 && "Clone 0 is the original function, which "
2515 "should already exist in the map");
2516 FuncInfo NewFuncClone = cloneFunctionForCallsite(
2517 OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
2518 FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap));
2519 FunctionClonesAnalysis++;
2520 Changed = true;
2521
2522 // If no caller callsites were already assigned to a clone of this
2523 // function, we can simply assign this clone to the new func clone
2524 // and update all callers to it, then skip to the next clone.
2525 if (!CallerAssignedToCloneOfFunc) {
2526 AssignCallsiteCloneToFuncClone(
2527 NewFuncClone, Call, Clone,
2528 AllocationCallToContextNodeMap.count(Call));
2529 for (auto &CE : Clone->CallerEdges) {
2530 // Ignore any caller that does not have a recorded callsite Call.
2531 if (!CE->Caller->hasCall())
2532 continue;
2533 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
2534 }
2535 continue;
2536 }
2537
2538 // We may need to do additional node cloning in this case.
2539 // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
2540 // that were previously assigned to call PreviousAssignedFuncClone,
2541 // to record that they now call NewFuncClone.
2542 for (auto CE : Clone->CallerEdges) {
2543 // Ignore any caller that does not have a recorded callsite Call.
2544 if (!CE->Caller->hasCall())
2545 continue;
2546
2547 if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
2548 // We subsequently fall through to later handling that
2549 // will perform any additional cloning required for
2550 // callers that were calling other function clones.
2551 CallsiteToCalleeFuncCloneMap[CE->Caller] !=
2552 PreviousAssignedFuncClone)
2553 continue;
2554
2555 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
2556
2557 // If we are cloning a function that was already assigned to some
2558 // callers, then essentially we are creating new callsite clones
2559 // of the other callsites in that function that are reached by those
2560 // callers. Clone the other callees of the current callsite's caller
2561 // that were already assigned to PreviousAssignedFuncClone
2562 // accordingly. This is important since we subsequently update the
2563 // calls from the nodes in the graph and their assignments to callee
2564 // functions recorded in CallsiteToCalleeFuncCloneMap.
2565 for (auto CalleeEdge : CE->Caller->CalleeEdges) {
2566 // Skip any that have been removed on an earlier iteration when
2567 // cleaning up newly None type callee edges.
2568 if (!CalleeEdge)
2569 continue;
2570 ContextNode *Callee = CalleeEdge->Callee;
2571 // Skip the current callsite; we are looking for other
2572 // callsites Caller calls, as well as any that do not have a
2573 // recorded callsite Call.
2574 if (Callee == Clone || !Callee->hasCall())
2575 continue;
2576 ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge);
2577 removeNoneTypeCalleeEdges(NewClone);
2578 // Moving the edge may have resulted in some none type
2579 // callee edges on the original Callee.
2580 removeNoneTypeCalleeEdges(Callee);
2581 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
2582 // If the Callee node was already assigned to call a specific
2583 // function version, make sure its new clone is assigned to call
2584 // that same function clone.
2585 if (CallsiteToCalleeFuncCloneMap.count(Callee))
2586 RecordCalleeFuncOfCallsite(
2587 NewClone, CallsiteToCalleeFuncCloneMap[Callee]);
2588 // Update NewClone with the new Call clone of this callsite's Call
2589 // created for the new function clone created earlier.
2590 // Recall that we have already ensured when building the graph
2591 // that each caller can only call callsites within the same
2592 // function, so we are guaranteed that Callee Call is in the
2593 // current OrigFunc.
2594 // CallMap is set up as indexed by original Call at clone 0.
2595 CallInfo OrigCall(Callee->getOrigNode()->Call);
2596 OrigCall.setCloneNo(0);
2597 std::map<CallInfo, CallInfo> &CallMap =
2598 FuncClonesToCallMap[NewFuncClone];
2599 assert(CallMap.count(OrigCall));
2600 CallInfo NewCall(CallMap[OrigCall]);
2601 assert(NewCall);
2602 NewClone->setCall(NewCall);
2603 }
2604 }
2605 // Fall through to handling below to perform the recording of the
2606 // function for this callsite clone. This enables handling of cases
2607 // where the callers were assigned to different clones of a function.
2608 }
2609
2610 // See if we can use an existing function clone. Walk through
2611 // all caller edges to see if any have already been assigned to
2612 // a clone of this callsite's function. If we can use it, do so. If not,
2613 // because that function clone is already assigned to a different clone
2614 // of this callsite, then we need to clone again.
2615 // Basically, this checking is needed to handle the case where different
2616 // caller functions/callsites may need versions of this function
2617 // containing different mixes of callsite clones across the different
2618 // callsites within the function. If that happens, we need to create
2619 // additional function clones to handle the various combinations.
2620 //
2621 // Keep track of any new clones of this callsite created by the
2622 // following loop, as well as any existing clone that we decided to
2623 // assign this clone to.
2624 std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
2625 FuncInfo FuncCloneAssignedToCurCallsiteClone;
2626 // We need to be able to remove Edge from CallerEdges, so need to adjust
2627 // iterator in the loop.
2628 for (auto EI = Clone->CallerEdges.begin();
2629 EI != Clone->CallerEdges.end();) {
2630 auto Edge = *EI;
2631 // Ignore any caller that does not have a recorded callsite Call.
2632 if (!Edge->Caller->hasCall()) {
2633 EI++;
2634 continue;
2635 }
2636 // If this caller already assigned to call a version of OrigFunc, need
2637 // to ensure we can assign this callsite clone to that function clone.
2638 if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
2639 FuncInfo FuncCloneCalledByCaller =
2640 CallsiteToCalleeFuncCloneMap[Edge->Caller];
2641 // First we need to confirm that this function clone is available
2642 // for use by this callsite node clone.
2643 //
2644 // While FuncCloneToCurNodeCloneMap is built only for this Node and
2645 // its callsite clones, one of those callsite clones X could have
2646 // been assigned to the same function clone called by Edge's caller
2647 // - if Edge's caller calls another callsite within Node's original
2648 // function, and that callsite has another caller reaching clone X.
2649 // We need to clone Node again in this case.
2650 if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
2651 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
2652 Clone) ||
2653 // Detect when we have multiple callers of this callsite that
2654 // have already been assigned to specific, and different, clones
2655 // of OrigFunc (due to other unrelated callsites in Func they
2656 // reach via call contexts). Is this Clone of callsite Node
2657 // assigned to a different clone of OrigFunc? If so, clone Node
2658 // again.
2659 (FuncCloneAssignedToCurCallsiteClone &&
2660 FuncCloneAssignedToCurCallsiteClone !=
2661 FuncCloneCalledByCaller)) {
2662 // We need to use a different newly created callsite clone, in
2663 // order to assign it to another new function clone on a
2664 // subsequent iteration over the Clones array (adjusted below).
2665 // Note we specifically do not reset the
2666 // CallsiteToCalleeFuncCloneMap entry for this caller, so that
2667 // when this new clone is processed later we know which version of
2668 // the function to copy (so that other callsite clones we have
2669 // assigned to that function clone are properly cloned over). See
2670 // comments in the function cloning handling earlier.
2671
2672 // Check if we already have cloned this callsite again while
2673 // walking through caller edges, for a caller calling the same
2674 // function clone. If so, we can move this edge to that new clone
2675 // rather than creating yet another new clone.
2676 if (FuncCloneToNewCallsiteCloneMap.count(
2677 FuncCloneCalledByCaller)) {
2678 ContextNode *NewClone =
2679 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
2680 moveEdgeToExistingCalleeClone(Edge, NewClone, &EI);
2681 // Cleanup any none type edges cloned over.
2682 removeNoneTypeCalleeEdges(NewClone);
2683 } else {
2684 // Create a new callsite clone.
2685 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge, &EI);
2686 removeNoneTypeCalleeEdges(NewClone);
2687 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
2688 NewClone;
2689 // Add to list of clones and process later.
2690 ClonesWorklist.push_back(NewClone);
2691 assert(EI == Clone->CallerEdges.end() ||
2692 Clone->AllocTypes != (uint8_t)AllocationType::None);
2693 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
2694 }
2695 // Moving the caller edge may have resulted in some none type
2696 // callee edges.
2697 removeNoneTypeCalleeEdges(Clone);
2698 // We will handle the newly created callsite clone in a subsequent
2699 // iteration over this Node's Clones. Continue here since we
2700 // already adjusted iterator EI while moving the edge.
2701 continue;
2702 }
2703
2704 // Otherwise, we can use the function clone already assigned to this
2705 // caller.
2706 if (!FuncCloneAssignedToCurCallsiteClone) {
2707 FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
2708 // Assign Clone to FuncCloneCalledByCaller
2709 AssignCallsiteCloneToFuncClone(
2710 FuncCloneCalledByCaller, Call, Clone,
2711 AllocationCallToContextNodeMap.count(Call));
2712 } else
2713 // Don't need to do anything - callsite is already calling this
2714 // function clone.
2715 assert(FuncCloneAssignedToCurCallsiteClone ==
2716 FuncCloneCalledByCaller);
2717
2718 } else {
2719 // We have not already assigned this caller to a version of
2720 // OrigFunc. Do the assignment now.
2721
2722 // First check if we have already assigned this callsite clone to a
2723 // clone of OrigFunc for another caller during this iteration over
2724 // its caller edges.
2725 if (!FuncCloneAssignedToCurCallsiteClone) {
2726 // Find first function in FuncClonesToCallMap without an assigned
2727 // clone of this callsite Node. We should always have one
2728 // available at this point due to the earlier cloning when the
2729 // FuncClonesToCallMap size was smaller than the clone number.
2730 for (auto &CF : FuncClonesToCallMap) {
2731 if (!FuncCloneToCurNodeCloneMap.count(CF.first)) {
2732 FuncCloneAssignedToCurCallsiteClone = CF.first;
2733 break;
2734 }
2735 }
2736 assert(FuncCloneAssignedToCurCallsiteClone);
2737 // Assign Clone to FuncCloneAssignedToCurCallsiteClone
2738 AssignCallsiteCloneToFuncClone(
2739 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
2740 AllocationCallToContextNodeMap.count(Call));
2741 } else
2742 assert(FuncCloneToCurNodeCloneMap
2743 [FuncCloneAssignedToCurCallsiteClone] == Clone);
2744 // Update callers to record function version called.
2745 RecordCalleeFuncOfCallsite(Edge->Caller,
2746 FuncCloneAssignedToCurCallsiteClone);
2747 }
2748
2749 EI++;
2750 }
2751 }
2752 if (VerifyCCG) {
2753 checkNode<DerivedCCG, FuncTy, CallTy>(Node);
2754 for (const auto &PE : Node->CalleeEdges)
2755 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
2756 for (const auto &CE : Node->CallerEdges)
2757 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
2758 for (auto *Clone : Node->Clones) {
2759 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
2760 for (const auto &PE : Clone->CalleeEdges)
2761 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
2762 for (const auto &CE : Clone->CallerEdges)
2763 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
2764 }
2765 }
2766 }
2767 }
2768
2769 auto UpdateCalls = [&](ContextNode *Node,
2770 DenseSet<const ContextNode *> &Visited,
2771 auto &&UpdateCalls) {
2772 auto Inserted = Visited.insert(Node);
2773 if (!Inserted.second)
2774 return;
2775
2776 for (auto *Clone : Node->Clones)
2777 UpdateCalls(Clone, Visited, UpdateCalls);
2778
2779 for (auto &Edge : Node->CallerEdges)
2780 UpdateCalls(Edge->Caller, Visited, UpdateCalls);
2781
2782 // Skip if either no call to update, or if we ended up with no context ids
2783 // (we moved all edges onto other clones).
2784 if (!Node->hasCall() || Node->ContextIds.empty())
2785 return;
2786
2787 if (Node->IsAllocation) {
2788 updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes));
2789 return;
2790 }
2791
2792 if (!CallsiteToCalleeFuncCloneMap.count(Node))
2793 return;
2794
2795 auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
2796 updateCall(Node->Call, CalleeFunc);
2797 };
2798
2799 // Performs DFS traversal starting from allocation nodes to update calls to
2800 // reflect cloning decisions recorded earlier. For regular LTO this will
2801 // update the actual calls in the IR to call the appropriate function clone
2802 // (and add attributes to allocation calls), whereas for ThinLTO the decisions
2803 // are recorded in the summary entries.
2804 DenseSet<const ContextNode *> Visited;
2805 for (auto &Entry : AllocationCallToContextNodeMap)
2806 UpdateCalls(Entry.second, Visited, UpdateCalls);
2807
2808 return Changed;
2809}
2810
2811 static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
2812 Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
2813 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
2814 &FuncToAliasMap) {
2815 // The first "clone" is the original copy, we should only call this if we
2816 // needed to create new clones.
2817 assert(NumClones > 1);
2818 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
2819 VMaps.reserve(NumClones - 1);
2820 FunctionsClonedThinBackend++;
2821 for (unsigned I = 1; I < NumClones; I++) {
2822 VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
2823 auto *NewF = CloneFunction(&F, *VMaps.back());
2824 FunctionClonesThinBackend++;
2825 // Strip memprof and callsite metadata from clone as they are no longer
2826 // needed.
2827 for (auto &BB : *NewF) {
2828 for (auto &Inst : BB) {
2829 Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
2830 Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
2831 }
2832 }
2833 std::string Name = getMemProfFuncName(F.getName(), I);
2834 auto *PrevF = M.getFunction(Name);
2835 if (PrevF) {
2836 // We might have created this when adjusting a callsite in another
2837 // function. It should be a declaration.
2838 assert(PrevF->isDeclaration());
2839 NewF->takeName(PrevF);
2840 PrevF->replaceAllUsesWith(NewF);
2841 PrevF->eraseFromParent();
2842 } else
2843 NewF->setName(Name);
2844 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
2845 << "created clone " << ore::NV("NewFunction", NewF));
2846
2847 // Now handle aliases to this function, and clone those as well.
2848 if (!FuncToAliasMap.count(&F))
2849 continue;
2850 for (auto *A : FuncToAliasMap[&F]) {
2851 std::string Name = getMemProfFuncName(A->getName(), I);
2852 auto *PrevA = M.getNamedAlias(Name);
2853 auto *NewA = GlobalAlias::create(A->getValueType(),
2854 A->getType()->getPointerAddressSpace(),
2855 A->getLinkage(), Name, NewF);
2856 NewA->copyAttributesFrom(A);
2857 if (PrevA) {
2858 // We might have created this when adjusting a callsite in another
2859 // function. It should be a declaration.
2860 assert(PrevA->isDeclaration());
2861 NewA->takeName(PrevA);
2862 PrevA->replaceAllUsesWith(NewA);
2863 PrevA->eraseFromParent();
2864 }
2865 }
2866 }
2867 return VMaps;
2868}
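// Editorial note: getMemProfFuncName (used above) derives each clone's name
// from the original symbol plus the clone number; assuming the ".memprof"
// suffix that IsMemProfClone checks for below, the clones of a function foo
// would be named along the lines of foo.memprof.1, foo.memprof.2, and the
// corresponding aliases are cloned with the same scheme.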
2869
2870// Locate the summary for F. This is complicated by the fact that it might
2871// have been internalized or promoted.
2872 static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
2873 const ModuleSummaryIndex *ImportSummary) {
2874 // FIXME: Ideally we would retain the original GUID in some fashion on the
2875 // function (e.g. as metadata), but for now do our best to locate the
2876 // summary without that information.
2877 ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
2878 if (!TheFnVI)
2879 // See if theFn was internalized, by checking index directly with
2880 // original name (this avoids the name adjustment done by getGUID() for
2881 // internal symbols).
2882 TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(F.getName()));
2883 if (TheFnVI)
2884 return TheFnVI;
2885 // Now query with the original name before any promotion was performed.
2886 StringRef OrigName =
2887 ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
2888 std::string OrigId = GlobalValue::getGlobalIdentifier(
2889 OrigName, GlobalValue::InternalLinkage, M.getSourceFileName());
2890 TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(OrigId));
2891 if (TheFnVI)
2892 return TheFnVI;
2893 // Could be a promoted local imported from another module. We need to pass
2894 // down more info here to find the original module id. For now, try with
2895 // the OrigName which might have been stored in the OidGuidMap in the
2896 // index. This would not work if there were same-named locals in multiple
2897 // modules, however.
2898 auto OrigGUID =
2899 ImportSummary->getGUIDFromOriginalID(GlobalValue::getGUID(OrigName));
2900 if (OrigGUID)
2901 TheFnVI = ImportSummary->getValueInfo(OrigGUID);
2902 return TheFnVI;
2903}
2904
2905bool MemProfContextDisambiguation::applyImport(Module &M) {
2906 assert(ImportSummary);
2907 bool Changed = false;
2908
2909 auto IsMemProfClone = [](const Function &F) {
2910 return F.getName().contains(MemProfCloneSuffix);
2911 };
2912
2913 // We also need to clone any aliases that reference cloned functions, because
2914 // the modified callsites may invoke via the alias. Keep track of the aliases
2915 // for each function.
2916 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
2917 FuncToAliasMap;
2918 for (auto &A : M.aliases()) {
2919 auto *Aliasee = A.getAliaseeObject();
2920 if (auto *F = dyn_cast<Function>(Aliasee))
2921 FuncToAliasMap[F].insert(&A);
2922 }
2923
2924 for (auto &F : M) {
2925 if (F.isDeclaration() || IsMemProfClone(F))
2926 continue;
2927
2928 OptimizationRemarkEmitter ORE(&F);
2929
2930 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
2931 bool ClonesCreated = false;
2932 unsigned NumClonesCreated = 0;
2933 auto CloneFuncIfNeeded = [&](unsigned NumClones) {
2934 // We should at least have version 0 which is the original copy.
2935 assert(NumClones > 0);
2936 // If only one copy needed use original.
2937 if (NumClones == 1)
2938 return;
2939 // If we already performed cloning of this function, confirm that the
2940 // requested number of clones matches (the thin link should ensure the
2941 // number of clones for each constituent callsite is consistent within
2942 // each function), before returning.
2943 if (ClonesCreated) {
2944 assert(NumClonesCreated == NumClones);
2945 return;
2946 }
2947 VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap);
2948 // The first "clone" is the original copy, which doesn't have a VMap.
2949 assert(VMaps.size() == NumClones - 1);
2950 Changed = true;
2951 ClonesCreated = true;
2952 NumClonesCreated = NumClones;
2953 };
2954
2955 // Locate the summary for F.
2956 ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
2957 // If not found, this could be an imported local (see comment in
2958 // findValueInfoForFunc). Skip for now as it will be cloned in its original
2959 // module (where it would have been promoted to global scope so should
2960 // satisfy any reference in this module).
2961 if (!TheFnVI)
2962 continue;
2963
2964 auto *GVSummary =
2965 ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
2966 if (!GVSummary)
2967 // Must have been imported, use the first summary (might be multiple if
2968 // this was a linkonce_odr).
2969 GVSummary = TheFnVI.getSummaryList().front().get();
2970
2971 // If this was an imported alias skip it as we won't have the function
2972 // summary, and it should be cloned in the original module.
2973 if (isa<AliasSummary>(GVSummary))
2974 continue;
2975
2976 auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
2977
2978 if (FS->allocs().empty() && FS->callsites().empty())
2979 continue;
2980
2981 auto SI = FS->callsites().begin();
2982 auto AI = FS->allocs().begin();
2983
2984 // Assume for now that the instructions are in the exact same order
2985 // as when the summary was created, but confirm this is correct by
2986 // matching the stack ids.
2987 for (auto &BB : F) {
2988 for (auto &I : BB) {
2989 auto *CB = dyn_cast<CallBase>(&I);
2990 // Same handling as when creating module summary.
2991 if (!mayHaveMemprofSummary(CB))
2992 continue;
2993
2994 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
2995 I.getMetadata(LLVMContext::MD_callsite));
2996 auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
2997
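// For illustration only (an assumed IR snippet, not taken from this file),
// the metadata consulted here has roughly this shape:
//   %call = call ptr @_Znam(i64 10), !memprof !1, !callsite !6
//   !1 = !{!2, !4}            ; list of MIB nodes
//   !2 = !{!3, !"notcold"}    ; each MIB: call stack node + allocation type
//   !3 = !{i64 123, i64 456}  ; stack ids for this allocation context
//   !6 = !{i64 123}           ; stack id(s) of the call instruction itself
// The checks below verify these stack ids against the summary's stack id
// indices before any cloning decisions are applied.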
2998 // Include allocs that were already assigned a memprof function
2999 // attribute in the statistics.
3000 if (CB->getAttributes().hasFnAttr("memprof")) {
3001 assert(!MemProfMD);
3002 CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
3003 ? AllocTypeColdThinBackend++
3004 : AllocTypeNotColdThinBackend++;
3005 OrigAllocsThinBackend++;
3006 AllocVersionsThinBackend++;
3007 if (!MaxAllocVersionsThinBackend)
3008 MaxAllocVersionsThinBackend = 1;
3009 // Remove any remaining callsite metadata and skip the rest of the
3010 // handling for this instruction, since no cloning is needed.
3011 I.setMetadata(LLVMContext::MD_callsite, nullptr);
3012 continue;
3013 }
3014
3015 if (MemProfMD) {
3016 // Consult the next alloc node.
3017 assert(AI != FS->allocs().end());
3018 auto &AllocNode = *(AI++);
3019
3020 // Sanity check that the MIB stack ids match between the summary and
3021 // instruction metadata.
3022 auto MIBIter = AllocNode.MIBs.begin();
3023 for (auto &MDOp : MemProfMD->operands()) {
3024 assert(MIBIter != AllocNode.MIBs.end());
3025 LLVM_ATTRIBUTE_UNUSED auto StackIdIndexIter =
3026 MIBIter->StackIdIndices.begin();
3027 auto *MIBMD = cast<const MDNode>(MDOp);
3028 MDNode *StackMDNode = getMIBStackNode(MIBMD);
3029 assert(StackMDNode);
3030 SmallVector<unsigned> StackIdsFromMetadata;
3031 CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
3032 for (auto ContextIter =
3033 StackContext.beginAfterSharedPrefix(CallsiteContext);
3034 ContextIter != StackContext.end(); ++ContextIter) {
3035 // If this is a direct recursion, simply skip the duplicate
3036 // entries, to be consistent with how the summary ids were
3037 // generated during ModuleSummaryAnalysis.
3038 if (!StackIdsFromMetadata.empty() &&
3039 StackIdsFromMetadata.back() == *ContextIter)
3040 continue;
3041 assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
3042 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
3043 *ContextIter);
3044 StackIdIndexIter++;
3045 }
3046 MIBIter++;
3047 }
3048
3049 // Perform cloning if not yet done.
3050 CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size());
3051
3052 OrigAllocsThinBackend++;
3053 AllocVersionsThinBackend += AllocNode.Versions.size();
3054 if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
3055 MaxAllocVersionsThinBackend = AllocNode.Versions.size();
3056
3057 // If there is only one version, that means we didn't end up
3058 // considering this function for cloning, in which case the alloc
3059 // will still be None type or should have gotten the default NotCold.
3060 // Skip it, but only after calling the clone helper above, since that
3061 // performs sanity checks confirming we haven't yet decided to clone.
3062 if (AllocNode.Versions.size() == 1) {
3063 assert((AllocationType)AllocNode.Versions[0] ==
3064 AllocationType::NotCold ||
3065 (AllocationType)AllocNode.Versions[0] ==
3066 AllocationType::None);
3067 UnclonableAllocsThinBackend++;
3068 continue;
3069 }
3070
3071 // All versions should have a singular allocation type.
3072 assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
3073 return Type == ((uint8_t)AllocationType::NotCold |
3074 (uint8_t)AllocationType::Cold);
3075 }));
3076
3077 // Update the allocation types per the summary info.
3078 for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
3079 // Ignore any that didn't get an assigned allocation type.
3080 if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
3081 continue;
3082 AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
3083 AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
3084 : AllocTypeNotColdThinBackend++;
3085 std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
3086 auto A = llvm::Attribute::get(F.getContext(), "memprof",
3087 AllocTypeString);
3088 CallBase *CBClone;
3089 // Copy 0 is the original function.
3090 if (!J)
3091 CBClone = CB;
3092 else
3093 // Since VMaps are only created for new clones, we index with
3094 // clone J-1 (J==0 is the original clone and does not have a VMaps
3095 // entry).
3096 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
3097 CBClone->addFnAttr(A);
3098 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
3099 << ore::NV("AllocationCall", CBClone) << " in clone "
3100 << ore::NV("Caller", CBClone->getFunction())
3101 << " marked with memprof allocation attribute "
3102 << ore::NV("Attribute", AllocTypeString));
3103 }
3104 } else if (!CallsiteContext.empty()) {
3105 // Consult the next callsite node.
3106 assert(SI != FS->callsites().end());
3107 auto &StackNode = *(SI++);
3108
3109#ifndef NDEBUG
3110 // Sanity check that the stack ids match between the summary and
3111 // instruction metadata.
3112 auto StackIdIndexIter = StackNode.StackIdIndices.begin();
3113 for (auto StackId : CallsiteContext) {
3114 assert(StackIdIndexIter != StackNode.StackIdIndices.end());
3115 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
3116 StackId);
3117 StackIdIndexIter++;
3118 }
3119#endif
3120
3121 // Perform cloning if not yet done.
3122 CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());
3123
3124 // Should have skipped indirect calls via mayHaveMemprofSummary.
3125 assert(CB->getCalledFunction());
3126 assert(!IsMemProfClone(*CB->getCalledFunction()));
3127
3128 // Update the calls per the summary info.
3129 // Save orig name since it gets updated in the first iteration
3130 // below.
3131 auto CalleeOrigName = CB->getCalledFunction()->getName();
3132 for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
3133 // Do nothing if this version calls the original version of its
3134 // callee.
3135 if (!StackNode.Clones[J])
3136 continue;
3137 auto NewF = M.getOrInsertFunction(
3138 getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
3139 CB->getCalledFunction()->getFunctionType());
3140 CallBase *CBClone;
3141 // Copy 0 is the original function.
3142 if (!J)
3143 CBClone = CB;
3144 else
3145 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
3146 CBClone->setCalledFunction(NewF);
3147 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
3148 << ore::NV("Call", CBClone) << " in clone "
3149 << ore::NV("Caller", CBClone->getFunction())
3150 << " assigned to call function clone "
3151 << ore::NV("Callee", NewF.getCallee()));
3152 }
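// Continuing the illustration above (assumed names): if StackNode.Clones is
// {0, 1, 2}, the original copy keeps calling "foo", while clone 1 of this
// caller is rewritten to call "foo.memprof.1" and clone 2 to call
// "foo.memprof.2", with declarations inserted via getOrInsertFunction as
// needed.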
3153 }
3154 // Memprof and callsite metadata on memory allocations no longer needed.
3155 I.setMetadata(LLVMContext::MD_memprof, nullptr);
3156 I.setMetadata(LLVMContext::MD_callsite, nullptr);
3157 }
3158 }
3159 }
3160
3161 return Changed;
3162}
3163
3164template <typename DerivedCCG, typename FuncTy, typename CallTy>
3165bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
3166 if (DumpCCG) {
3167 dbgs() << "CCG before cloning:\n";
3168 dbgs() << *this;
3169 }
3170 if (ExportToDot)
3171 exportToDot("postbuild");
3172
3173 if (VerifyCCG) {
3174 check();
3175 }
3176
3177 identifyClones();
3178
3179 if (VerifyCCG) {
3180 check();
3181 }
3182
3183 if (DumpCCG) {
3184 dbgs() << "CCG after cloning:\n";
3185 dbgs() << *this;
3186 }
3187 if (ExportToDot)
3188 exportToDot("cloned");
3189
3190 bool Changed = assignFunctions();
3191
3192 if (DumpCCG) {
3193 dbgs() << "CCG after assigning function clones:\n";
3194 dbgs() << *this;
3195 }
3196 if (ExportToDot)
3197 exportToDot("clonefuncassign");
3198
3199 return Changed;
3200}
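// The stages above can be inspected when debugging: -memprof-dump-ccg prints
// the graph after each stage, and -memprof-export-to-dot (optionally combined
// with -memprof-dot-file-path-prefix=<prefix>) writes dot files labeled
// "postbuild", "cloned" and "clonefuncassign", matching the exportToDot calls
// above.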
3201
3202bool MemProfContextDisambiguation::processModule(
3203 Module &M,
3204 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
3205
3206 // If we have an import summary, then the cloning decisions were made during
3207 // the thin link on the index. Apply them and return.
3208 if (ImportSummary)
3209 return applyImport(M);
3210
3211 // TODO: If/when other types of memprof cloning are enabled beyond just for
3212 // hot and cold, we will need to change this to individually control the
3213 // AllocationType passed to addStackNodesForMIB during CCG construction.
3214 // Note that we specifically check this after applying imports above, so that
3215 // the option doesn't need to be passed to distributed ThinLTO backend
3216 // clang processes, which won't necessarily have visibility into the linker
3217 // dependences. Instead the information is communicated from the LTO link to
3218 // the backends via the combined summary index.
3219 if (!SupportsHotColdNew)
3220 return false;
3221
3222 ModuleCallsiteContextGraph CCG(M, OREGetter);
3223 return CCG.process();
3224}
3225
3226MemProfContextDisambiguation::MemProfContextDisambiguation(
3227 const ModuleSummaryIndex *Summary)
3228 : ImportSummary(Summary) {
3229 if (ImportSummary) {
3230 // The MemProfImportSummary should only be used for testing ThinLTO
3231 // distributed backend handling via opt, in which case we don't have a
3232 // summary from the pass pipeline.
3233 assert(MemProfImportSummary.empty());
3234 return;
3235 }
3236 if (MemProfImportSummary.empty())
3237 return;
3238
3239 auto ReadSummaryFile =
3240 errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
3241 if (!ReadSummaryFile) {
3242 logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
3243 "Error loading file '" + MemProfImportSummary +
3244 "': ");
3245 return;
3246 }
3247 auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
3248 if (!ImportSummaryForTestingOrErr) {
3249 logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
3250 "Error parsing file '" + MemProfImportSummary +
3251 "': ");
3252 return;
3253 }
3254 ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
3255 ImportSummary = ImportSummaryForTesting.get();
3256}
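// A sketch of the testing flow this constructor supports (pass name and file
// names are assumed for illustration): after a thin link produces a combined
// or distributed summary, the backend handling can be exercised in isolation
// with something like
//   opt -passes=memprof-context-disambiguation \
//       -memprof-import-summary=<summary bitcode file> module.bc -S
// where -memprof-import-summary supplies the index that is read above via
// MemoryBuffer::getFile() and getModuleSummaryIndex().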
3257
3258PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
3259 ModuleAnalysisManager &AM) {
3260 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
3261 auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
3262 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
3263 };
3264 if (!processModule(M, OREGetter))
3265 return PreservedAnalyses::all();
3266 return PreservedAnalyses::none();
3267}
3268
3269void MemProfContextDisambiguation::run(
3270 ModuleSummaryIndex &Index,
3271 function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
3272 isPrevailing) {
3273 // TODO: If/when other types of memprof cloning are enabled beyond just for
3274 // hot and cold, we will need to change this to individually control the
3275 // AllocationType passed to addStackNodesForMIB during CCG construction.
3276 // The index was set from the option, so these should be in sync.
3277 assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
3278 if (!SupportsHotColdNew)
3279 return;
3280
3281 IndexCallsiteContextGraph CCG(Index, isPrevailing);
3282 CCG.process();
3283}