1//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts ----------==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements support for context disambiguation of allocation
10// calls for profile guided heap optimization. Specifically, it uses Memprof
11// profiles which indicate context specific allocation behavior (currently
12// distinguishing cold vs hot memory allocations). Cloning is performed to
13// expose the cold allocation call contexts, and the allocation calls are
14// subsequently annotated with an attribute for later transformation.
15//
16// The transformations can be performed either directly on IR (regular LTO), or
17// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
18// Both types of LTO operate on the same base graph representation, which
19// uses CRTP to support either IR or Index formats.
20//
21//===----------------------------------------------------------------------===//
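// For reference, a matched allocation call carries metadata along these lines
// (illustrative IR sketch; the i64 values stand in for arbitrary stack id
// hashes). Each operand of !memprof is one MIB: a context call stack plus the
// profiled allocation type, while !callsite holds the allocation call's own
// inlined stack ids:
//
//   %call = call ptr @_Znam(i64 10), !memprof !0, !callsite !5
//   !0 = !{!1, !3}
//   !1 = !{!2, !"notcold"}
//   !2 = !{i64 123, i64 456, i64 789}
//   !3 = !{!4, !"cold"}
//   !4 = !{i64 123, i64 456, i64 987}
//   !5 = !{i64 123}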
22
24#include "llvm/ADT/DenseMap.h"
25#include "llvm/ADT/DenseSet.h"
26#include "llvm/ADT/MapVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
36#include "llvm/IR/Constants.h"
38#include "llvm/IR/Module.h"
40#include "llvm/Pass.h"
45#include "llvm/Transforms/IPO.h"
47#include <sstream>
48#include <vector>
49using namespace llvm;
50using namespace llvm::memprof;
51
52#define DEBUG_TYPE "memprof-context-disambiguation"
53
54STATISTIC(FunctionClonesAnalysis,
55 "Number of function clones created during whole program analysis");
56STATISTIC(FunctionClonesThinBackend,
57 "Number of function clones created during ThinLTO backend");
58STATISTIC(FunctionsClonedThinBackend,
59 "Number of functions that had clones created during ThinLTO backend");
60STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
61 "cloned) during whole program analysis");
62STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
63 "during whole program analysis");
64STATISTIC(AllocTypeNotColdThinBackend,
65 "Number of not cold static allocations (possibly cloned) during "
66 "ThinLTO backend");
67STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
68 "(possibly cloned) during ThinLTO backend");
69STATISTIC(OrigAllocsThinBackend,
70 "Number of original (not cloned) allocations with memprof profiles "
71 "during ThinLTO backend");
72STATISTIC(
73 AllocVersionsThinBackend,
74 "Number of allocation versions (including clones) during ThinLTO backend");
75STATISTIC(MaxAllocVersionsThinBackend,
76 "Maximum number of allocation versions created for an original "
77 "allocation during ThinLTO backend");
78STATISTIC(UnclonableAllocsThinBackend,
79 "Number of unclonable ambigous allocations during ThinLTO backend");
80
82 "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
83 cl::value_desc("filename"),
84 cl::desc("Specify the path prefix of the MemProf dot files."));
85
86static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
88 cl::desc("Export graph to dot files."));
89
90static cl::opt<bool>
91 DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
92 cl::desc("Dump CallingContextGraph to stdout after each stage."));
93
94static cl::opt<bool>
95 VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
96 cl::desc("Perform verification checks on CallingContextGraph."));
97
98static cl::opt<bool>
99 VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
100 cl::desc("Perform frequent verification checks on nodes."));
101
103 "memprof-import-summary",
104 cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
105 cl::Hidden);
106
107namespace llvm {
108// Indicate we are linking with an allocator that supports hot/cold operator
109// new interfaces.
111 "supports-hot-cold-new", cl::init(false), cl::Hidden,
112 cl::desc("Linking with hot/cold operator new interfaces"));
113} // namespace llvm
114
115namespace {
116/// CRTP base for graphs built from either IR or ThinLTO summary index.
117///
118/// The graph represents the call contexts in all memprof metadata on allocation
119/// calls, with nodes for the allocations themselves, as well as for the calls
120/// in each context. The graph is initially built from the allocation memprof
121/// metadata (or summary) MIBs. It is then updated to match calls with callsite
122/// metadata onto the nodes, updating it to reflect any inlining performed on
123/// those calls.
124///
125/// Each MIB (representing an allocation's call context with allocation
126/// behavior) is assigned a unique context id during the graph build. The edges
127/// and nodes in the graph are decorated with the context ids they carry. This
128/// is used to correctly update the graph when cloning is performed so that we
129/// can uniquify the context for a single (possibly cloned) allocation.
130template <typename DerivedCCG, typename FuncTy, typename CallTy>
131class CallsiteContextGraph {
132public:
133 CallsiteContextGraph() = default;
134 CallsiteContextGraph(const CallsiteContextGraph &) = default;
135 CallsiteContextGraph(CallsiteContextGraph &&) = default;
136
137 /// Main entry point to perform analysis and transformations on graph.
138 bool process();
139
140 /// Perform cloning on the graph necessary to uniquely identify the allocation
141 /// behavior of an allocation based on its context.
142 void identifyClones();
143
144 /// Assign callsite clones to functions, cloning functions as needed to
145 /// accommodate the combinations of their callsite clones reached by callers.
146 /// For regular LTO this clones functions and callsites in the IR, but for
147 /// ThinLTO the cloning decisions are noted in the summaries and later applied
148 /// in applyImport.
149 bool assignFunctions();
150
151 void dump() const;
152 void print(raw_ostream &OS) const;
153
154 friend raw_ostream &operator<<(raw_ostream &OS,
155 const CallsiteContextGraph &CCG) {
156 CCG.print(OS);
157 return OS;
158 }
159
160 friend struct GraphTraits<
161 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
162 friend struct DOTGraphTraits<
163 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
164
165 void exportToDot(std::string Label) const;
166
167 /// Represents a function clone via FuncTy pointer and clone number pair.
168 struct FuncInfo final
169 : public std::pair<FuncTy *, unsigned /*Clone number*/> {
170 using Base = std::pair<FuncTy *, unsigned>;
171 FuncInfo(const Base &B) : Base(B) {}
172 FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
173 explicit operator bool() const { return this->first != nullptr; }
174 FuncTy *func() const { return this->first; }
175 unsigned cloneNo() const { return this->second; }
176 };
177
178 /// Represents a callsite clone via CallTy and clone number pair.
179 struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
180 using Base = std::pair<CallTy, unsigned>;
181 CallInfo(const Base &B) : Base(B) {}
182 CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
183 : Base(Call, CloneNo) {}
184 explicit operator bool() const { return (bool)this->first; }
185 CallTy call() const { return this->first; }
186 unsigned cloneNo() const { return this->second; }
187 void setCloneNo(unsigned N) { this->second = N; }
188 void print(raw_ostream &OS) const {
189 if (!operator bool()) {
190 assert(!cloneNo());
191 OS << "null Call";
192 return;
193 }
194 call()->print(OS);
195 OS << "\t(clone " << cloneNo() << ")";
196 }
197 void dump() const {
198 print(dbgs());
199 dbgs() << "\n";
200 }
201 friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
202 Call.print(OS);
203 return OS;
204 }
205 };
206
207 struct ContextEdge;
208
209 /// Node in the Callsite Context Graph
210 struct ContextNode {
211 // Keep this for now since in the IR case, where we have an Instruction*,
212 // whether the node is an allocation is not as immediately discoverable.
213 // Used for printing richer information when dumping the graph.
214 bool IsAllocation;
215
216 // Keeps track of when the Call was reset to null because there was
217 // recursion.
218 bool Recursive = false;
219
220 // The corresponding allocation or interior call.
221 CallInfo Call;
222
223 // For alloc nodes this is a unique id assigned when constructed, and for
224 // callsite stack nodes it is the original stack id when the node is
225 // constructed from the memprof MIB metadata on the alloc nodes. Note that
226 // this is only used when matching callsite metadata onto the stack nodes
227 // created when processing the allocation memprof MIBs, and for labeling
228 // nodes in the dot graph. Therefore we don't bother to assign a value for
229 // clones.
230 uint64_t OrigStackOrAllocId = 0;
231
232 // This will be formed by ORing together the AllocationType enum values
233 // for contexts including this node.
234 uint8_t AllocTypes = 0;
235
236 // Edges to all callees in the profiled call stacks.
237 // TODO: Should this be a map (from Callee node) for more efficient lookup?
238 std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;
239
240 // Edges to all callers in the profiled call stacks.
241 // TODO: Should this be a map (from Caller node) for more efficient lookup?
242 std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
243
244 // The set of IDs for contexts including this node.
245 DenseSet<uint32_t> ContextIds;
246
247 // List of clones of this ContextNode, initially empty.
248 std::vector<ContextNode *> Clones;
249
250 // If a clone, points to the original uncloned node.
251 ContextNode *CloneOf = nullptr;
252
253 ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}
254
255 ContextNode(bool IsAllocation, CallInfo C)
256 : IsAllocation(IsAllocation), Call(C) {}
257
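  // Record Clone as a clone of this node. All clones are kept on the
  // original (uncloned) node's Clones list, and each clone's CloneOf points
  // back at that original, so clone chains stay one level deep.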
258 void addClone(ContextNode *Clone) {
259 if (CloneOf) {
260 CloneOf->Clones.push_back(Clone);
261 Clone->CloneOf = CloneOf;
262 } else {
263 Clones.push_back(Clone);
264 assert(!Clone->CloneOf);
265 Clone->CloneOf = this;
266 }
267 }
268
269 ContextNode *getOrigNode() {
270 if (!CloneOf)
271 return this;
272 return CloneOf;
273 }
274
275 void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
276 unsigned int ContextId);
277
278 ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
279 ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
280 void eraseCalleeEdge(const ContextEdge *Edge);
281 void eraseCallerEdge(const ContextEdge *Edge);
282
283 void setCall(CallInfo C) { Call = C; }
284
285 bool hasCall() const { return (bool)Call.call(); }
286
287 void printCall(raw_ostream &OS) const { Call.print(OS); }
288
289 // True if this node was effectively removed from the graph, in which case
290 // its context id set, caller edges, and callee edges should all be empty.
291 bool isRemoved() const {
292 assert(ContextIds.empty() ==
293 (CalleeEdges.empty() && CallerEdges.empty()));
294 return ContextIds.empty();
295 }
296
297 void dump() const;
298 void print(raw_ostream &OS) const;
299
300 friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
301 Node.print(OS);
302 return OS;
303 }
304 };
305
306 /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
307 /// callee.
308 struct ContextEdge {
309 ContextNode *Callee;
310 ContextNode *Caller;
311
312 // This will be formed by ORing together the AllocationType enum values
313 // for contexts including this edge.
314 uint8_t AllocTypes = 0;
315
316 // The set of IDs for contexts including this edge.
317 DenseSet<uint32_t> ContextIds;
318
319 ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
320 DenseSet<uint32_t> ContextIds)
321 : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
322 ContextIds(ContextIds) {}
323
324 DenseSet<uint32_t> &getContextIds() { return ContextIds; }
325
326 void dump() const;
327 void print(raw_ostream &OS) const;
328
329 friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
330 Edge.print(OS);
331 return OS;
332 }
333 };
334
335 /// Helper to remove callee edges that have allocation type None (due to not
336 /// carrying any context ids) after transformations.
337 void removeNoneTypeCalleeEdges(ContextNode *Node);
338
339protected:
340 /// Get a list of nodes corresponding to the stack ids in the given callsite
341 /// context.
342 template <class NodeT, class IteratorT>
343 std::vector<uint64_t>
344 getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);
345
346 /// Adds nodes for the given allocation and any stack ids on its memprof MIB
347 /// metadata (or summary).
348 ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);
349
350 /// Adds nodes for the given MIB stack ids.
351 template <class NodeT, class IteratorT>
352 void addStackNodesForMIB(ContextNode *AllocNode,
353 CallStack<NodeT, IteratorT> &StackContext,
354 CallStack<NodeT, IteratorT> &CallsiteContext,
355 AllocationType AllocType);
356
357 /// Matches all callsite metadata (or summary) to the nodes created for
358 /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
359 /// inlining performed on those callsite instructions.
360 void updateStackNodes();
361
362 /// Update graph to conservatively handle any callsite stack nodes that target
363 /// multiple different callee target functions.
364 void handleCallsitesWithMultipleTargets();
365
366 /// Save lists of calls with MemProf metadata in each function, for faster
367 /// iteration.
368 std::vector<std::pair<FuncTy *, std::vector<CallInfo>>>
369 FuncToCallsWithMetadata;
370
371 /// Map from callsite node to the enclosing caller function.
372 std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
373
374private:
375 using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;
376
377 using CallContextInfo = std::tuple<CallTy, std::vector<uint64_t>,
378 const FuncTy *, DenseSet<uint32_t>>;
379
380 /// Assigns the given Node to calls at or inlined into the location with
381 /// the Node's stack id, after post order traversing and processing its
382 /// caller nodes. Uses the call information recorded in the given
383 /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
384 /// as needed. Called by updateStackNodes which sets up the given
385 /// StackIdToMatchingCalls map.
386 void assignStackNodesPostOrder(
387 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
388 DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls);
389
390 /// Duplicates the given set of context ids, updating the provided
391 /// map from each original id with the newly generated context ids,
392 /// and returning the new duplicated id set.
393 DenseSet<uint32_t> duplicateContextIds(
394 const DenseSet<uint32_t> &StackSequenceContextIds,
395 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
396
397 /// Propagates all duplicated context ids across the graph.
398 void propagateDuplicateContextIds(
399 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
400
401 /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
402 /// else to its callers. Also updates OrigNode's edges to remove any context
403 /// ids moved to the newly created edge.
404 void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
405 bool TowardsCallee);
406
407 /// Get the stack id corresponding to the given Id or Index (for IR this will
408 /// return itself, for a summary index this will return the id recorded in the
409 /// index for that stack id index value).
410 uint64_t getStackId(uint64_t IdOrIndex) const {
411 return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
412 }
413
414 /// Returns true if the given call targets the given function.
415 bool calleeMatchesFunc(CallTy Call, const FuncTy *Func) {
416 return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(Call, Func);
417 }
418
419 /// Get a list of nodes corresponding to the stack ids in the given
420 /// callsite's context.
421 std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
422 return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
423 Call);
424 }
425
426 /// Get the last stack id in the context for callsite.
427 uint64_t getLastStackId(CallTy Call) {
428 return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
429 }
430
431 /// Update the allocation call to record type of allocated memory.
432 void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
433 AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
434 static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
435 }
436
437 /// Update non-allocation call to invoke (possibly cloned) function
438 /// CalleeFunc.
439 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
440 static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
441 }
442
443 /// Clone the given function for the given callsite, recording mapping of all
444 /// of the function's tracked calls to their new versions in the CallMap.
445 /// Assigns new clones to clone number CloneNo.
446 FuncInfo cloneFunctionForCallsite(
447 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
448 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
449 return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
450 Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
451 }
452
453 /// Gets a label to use in the dot graph for the given call clone in the given
454 /// function.
455 std::string getLabel(const FuncTy *Func, const CallTy Call,
456 unsigned CloneNo) const {
457 return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
458 }
459
460 /// Helpers to find the node corresponding to the given call or stackid.
461 ContextNode *getNodeForInst(const CallInfo &C);
462 ContextNode *getNodeForAlloc(const CallInfo &C);
463 ContextNode *getNodeForStackId(uint64_t StackId);
464
465 /// Removes the node information recorded for the given call.
466 void unsetNodeForInst(const CallInfo &C);
467
468 /// Computes the alloc type corresponding to the given context ids, by
469 /// unioning their recorded alloc types.
470 uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds);
471
472 /// Returns the allocation type of the intersection of the contexts of two
473 /// nodes (based on their provided context id sets), optimized for the case
474 /// when Node1Ids is smaller than Node2Ids.
475 uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
476 const DenseSet<uint32_t> &Node2Ids);
477
478 /// Returns the allocation type of the intersection of the contexts of two
479 /// nodes (based on their provided context id sets).
480 uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
481 const DenseSet<uint32_t> &Node2Ids);
482
483 /// Create a clone of Edge's callee and move Edge to that new callee node,
484 /// performing the necessary context id and allocation type updates.
485 /// If callee's caller edge iterator is supplied, it is updated when removing
486 /// the edge from that list.
487 ContextNode *
488 moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
489 EdgeIter *CallerEdgeI = nullptr);
490
491 /// Change the callee of Edge to existing callee clone NewCallee, performing
492 /// the necessary context id and allocation type updates.
493 /// If callee's caller edge iterator is supplied, it is updated when removing
494 /// the edge from that list.
495 void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
496 ContextNode *NewCallee,
497 EdgeIter *CallerEdgeI = nullptr,
498 bool NewClone = false);
499
500 /// Recursively perform cloning on the graph for the given Node and its
501 /// callers, in order to uniquely identify the allocation behavior of an
502 /// allocation given its context.
503 void identifyClones(ContextNode *Node,
504 DenseSet<const ContextNode *> &Visited);
505
506 /// Map from each context ID to the AllocationType assigned to that context.
507 std::map<uint32_t, AllocationType> ContextIdToAllocationType;
508
509 /// Identifies the context node created for a stack id when adding the MIB
510 /// contexts to the graph. This is used to locate the context nodes when
511 /// trying to assign the corresponding callsites with those stack ids to these
512 /// nodes.
513 std::map<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;
514
515 /// Maps to track the calls to their corresponding nodes in the graph.
516 MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
517 MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;
518
519 /// Owner of all ContextNode unique_ptrs.
520 std::vector<std::unique_ptr<ContextNode>> NodeOwner;
521
522 /// Perform sanity checks on graph when requested.
523 void check() const;
524
525 /// Keeps track of the last unique context id assigned.
526 unsigned int LastContextId = 0;
527};
528
529template <typename DerivedCCG, typename FuncTy, typename CallTy>
530using ContextNode =
531 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
532template <typename DerivedCCG, typename FuncTy, typename CallTy>
533using ContextEdge =
534 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
535template <typename DerivedCCG, typename FuncTy, typename CallTy>
536using FuncInfo =
537 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
538template <typename DerivedCCG, typename FuncTy, typename CallTy>
539using CallInfo =
540 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;
541
542/// CRTP derived class for graphs built from IR (regular LTO).
543class ModuleCallsiteContextGraph
544 : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
545 Instruction *> {
546public:
547 ModuleCallsiteContextGraph(
548 Module &M,
549 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
550
551private:
552 friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
553 Instruction *>;
554
555 uint64_t getStackId(uint64_t IdOrIndex) const;
556 bool calleeMatchesFunc(Instruction *Call, const Function *Func);
557 uint64_t getLastStackId(Instruction *Call);
558 std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
559 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
560 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
561 CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
562 Instruction *>::FuncInfo
563 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
564 std::map<CallInfo, CallInfo> &CallMap,
565 std::vector<CallInfo> &CallsWithMetadataInFunc,
566 unsigned CloneNo);
567 std::string getLabel(const Function *Func, const Instruction *Call,
568 unsigned CloneNo) const;
569
570 const Module &Mod;
571 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
572};
573
574/// Represents a call in the summary index graph, which can either be an
575/// allocation or an interior callsite node in an allocation's context.
576/// Holds a pointer to the corresponding data structure in the index.
577struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
578 IndexCall() : PointerUnion() {}
579 IndexCall(std::nullptr_t) : IndexCall() {}
580 IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
581 IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
582 IndexCall(PointerUnion PT) : PointerUnion(PT) {}
583
584 IndexCall *operator->() { return this; }
585
586 PointerUnion<CallsiteInfo *, AllocInfo *> getBase() const { return *this; }
587
588 void print(raw_ostream &OS) const {
589 if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(getBase())) {
590 OS << *AI;
591 } else {
592 auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(getBase());
593 assert(CI);
594 OS << *CI;
595 }
596 }
597};
598
599/// CRTP derived class for graphs built from summary index (ThinLTO).
600class IndexCallsiteContextGraph
601 : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
602 IndexCall> {
603public:
604 IndexCallsiteContextGraph(
605 ModuleSummaryIndex &Index,
606 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
607 isPrevailing);
608
609private:
610 friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
611 IndexCall>;
612
613 uint64_t getStackId(uint64_t IdOrIndex) const;
614 bool calleeMatchesFunc(IndexCall &Call, const FunctionSummary *Func);
615 uint64_t getLastStackId(IndexCall &Call);
616 std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
617 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
618 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
619 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
620 IndexCall>::FuncInfo
621 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
622 std::map<CallInfo, CallInfo> &CallMap,
623 std::vector<CallInfo> &CallsWithMetadataInFunc,
624 unsigned CloneNo);
625 std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
626 unsigned CloneNo) const;
627
628 // Saves mapping from function summaries containing memprof records back to
629 // its VI, for use in checking and debugging.
630 std::map<const FunctionSummary *, ValueInfo> FSToVIMap;
631
632 const ModuleSummaryIndex &Index;
633};
634} // namespace
635
636namespace llvm {
637template <>
638struct DenseMapInfo<typename CallsiteContextGraph<
639 ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
640 : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
641template <>
642struct DenseMapInfo<typename CallsiteContextGraph<
643 IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
644 : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
645template <>
646struct DenseMapInfo<IndexCall>
647 : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
648} // end namespace llvm
649
650namespace {
651
652struct FieldSeparator {
653 bool Skip = true;
654 const char *Sep;
655
656 FieldSeparator(const char *Sep = ", ") : Sep(Sep) {}
657};
658
659raw_ostream &operator<<(raw_ostream &OS, FieldSeparator &FS) {
660 if (FS.Skip) {
661 FS.Skip = false;
662 return OS;
663 }
664 return OS << FS.Sep;
665}
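// Typical use of FieldSeparator (illustrative): stream it before each item to
// get "a, b, c" style output without a leading separator, e.g.
//   FieldSeparator FS;
//   for (auto &Item : Items)
//     OS << FS << Item;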
666
667// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
668// type we should actually use on the corresponding allocation.
669// If we can't clone a node that has NotCold+Cold alloc type, we will fall
670// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
671// from NotCold.
672AllocationType allocTypeToUse(uint8_t AllocTypes) {
673 assert(AllocTypes != (uint8_t)AllocationType::None);
674 if (AllocTypes ==
675 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
676 return AllocationType::NotCold;
677 else
678 return (AllocationType)AllocTypes;
679}
680
681// Helper to check if the alloc types for all edges recorded in the
682// InAllocTypes vector match the alloc types for all edges in the Edges
683// vector.
684template <typename DerivedCCG, typename FuncTy, typename CallTy>
685bool allocTypesMatch(
686 const std::vector<uint8_t> &InAllocTypes,
687 const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
688 &Edges) {
689 return std::equal(
690 InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(),
691 [](const uint8_t &l,
692 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
693 // Can share if one of the edges is None type - don't
694 // care about the type along that edge as it doesn't
695 // exist for those context ids.
696 if (l == (uint8_t)AllocationType::None ||
697 r->AllocTypes == (uint8_t)AllocationType::None)
698 return true;
699 return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
700 });
701}
702
703} // end anonymous namespace
704
705template <typename DerivedCCG, typename FuncTy, typename CallTy>
706typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
707CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
708 const CallInfo &C) {
709 ContextNode *Node = getNodeForAlloc(C);
710 if (Node)
711 return Node;
712
713 return NonAllocationCallToContextNodeMap.lookup(C);
714}
715
716template <typename DerivedCCG, typename FuncTy, typename CallTy>
717typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
718CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
719 const CallInfo &C) {
720 return AllocationCallToContextNodeMap.lookup(C);
721}
722
723template <typename DerivedCCG, typename FuncTy, typename CallTy>
724typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
725CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
726 uint64_t StackId) {
727 auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
728 if (StackEntryNode != StackEntryIdToContextNodeMap.end())
729 return StackEntryNode->second;
730 return nullptr;
731}
732
733template <typename DerivedCCG, typename FuncTy, typename CallTy>
734void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::unsetNodeForInst(
735 const CallInfo &C) {
736 AllocationCallToContextNodeMap.erase(C) ||
737 NonAllocationCallToContextNodeMap.erase(C);
738 assert(!AllocationCallToContextNodeMap.count(C) &&
739 !NonAllocationCallToContextNodeMap.count(C));
740}
741
742template <typename DerivedCCG, typename FuncTy, typename CallTy>
743void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
744 addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
745 unsigned int ContextId) {
746 for (auto &Edge : CallerEdges) {
747 if (Edge->Caller == Caller) {
748 Edge->AllocTypes |= (uint8_t)AllocType;
749 Edge->getContextIds().insert(ContextId);
750 return;
751 }
752 }
753 std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
754 this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
755 CallerEdges.push_back(Edge);
756 Caller->CalleeEdges.push_back(Edge);
757}
758
759template <typename DerivedCCG, typename FuncTy, typename CallTy>
760void CallsiteContextGraph<
761 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
762 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
763 auto Edge = *EI;
764 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
765 assert(Edge->ContextIds.empty());
766 Edge->Callee->eraseCallerEdge(Edge.get());
767 EI = Node->CalleeEdges.erase(EI);
768 } else
769 ++EI;
770 }
771}
772
773template <typename DerivedCCG, typename FuncTy, typename CallTy>
774typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
775CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
776 findEdgeFromCallee(const ContextNode *Callee) {
777 for (const auto &Edge : CalleeEdges)
778 if (Edge->Callee == Callee)
779 return Edge.get();
780 return nullptr;
781}
782
783template <typename DerivedCCG, typename FuncTy, typename CallTy>
784typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
785CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
786 findEdgeFromCaller(const ContextNode *Caller) {
787 for (const auto &Edge : CallerEdges)
788 if (Edge->Caller == Caller)
789 return Edge.get();
790 return nullptr;
791}
792
793template <typename DerivedCCG, typename FuncTy, typename CallTy>
794void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
795 eraseCalleeEdge(const ContextEdge *Edge) {
796 auto EI = llvm::find_if(
797 CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
798 return CalleeEdge.get() == Edge;
799 });
800 assert(EI != CalleeEdges.end());
801 CalleeEdges.erase(EI);
802}
803
804template <typename DerivedCCG, typename FuncTy, typename CallTy>
805void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
806 eraseCallerEdge(const ContextEdge *Edge) {
807 auto EI = llvm::find_if(
808 CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
809 return CallerEdge.get() == Edge;
810 });
811 assert(EI != CallerEdges.end());
812 CallerEdges.erase(EI);
813}
814
815template <typename DerivedCCG, typename FuncTy, typename CallTy>
816uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
817 DenseSet<uint32_t> &ContextIds) {
818 uint8_t BothTypes =
819 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
820 uint8_t AllocType = (uint8_t)AllocationType::None;
821 for (auto Id : ContextIds) {
822 AllocType |= (uint8_t)ContextIdToAllocationType[Id];
823 // Bail early if alloc type reached both, no further refinement.
824 if (AllocType == BothTypes)
825 return AllocType;
826 }
827 return AllocType;
828}
829
830template <typename DerivedCCG, typename FuncTy, typename CallTy>
831uint8_t
832CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
833 const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
834 uint8_t BothTypes =
835 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
836 uint8_t AllocType = (uint8_t)AllocationType::None;
837 for (auto Id : Node1Ids) {
838 if (!Node2Ids.count(Id))
839 continue;
840 AllocType |= (uint8_t)ContextIdToAllocationType[Id];
841 // Bail early if alloc type reached both, no further refinement.
842 if (AllocType == BothTypes)
843 return AllocType;
844 }
845 return AllocType;
846}
847
848template <typename DerivedCCG, typename FuncTy, typename CallTy>
849uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
850 const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
851 if (Node1Ids.size() < Node2Ids.size())
852 return intersectAllocTypesImpl(Node1Ids, Node2Ids);
853 else
854 return intersectAllocTypesImpl(Node2Ids, Node1Ids);
855}
856
857template <typename DerivedCCG, typename FuncTy, typename CallTy>
858typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
859CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
860 CallInfo Call, const FuncTy *F) {
861 assert(!getNodeForAlloc(Call));
862 NodeOwner.push_back(
863 std::make_unique<ContextNode>(/*IsAllocation=*/true, Call));
864 ContextNode *AllocNode = NodeOwner.back().get();
865 AllocationCallToContextNodeMap[Call] = AllocNode;
866 NodeToCallingFunc[AllocNode] = F;
867 // Use LastContextId as a unique id for MIB allocation nodes.
868 AllocNode->OrigStackOrAllocId = LastContextId;
869 // Alloc type should be updated as we add in the MIBs. We should assert
870 // afterwards that it is not still None.
871 AllocNode->AllocTypes = (uint8_t)AllocationType::None;
872
873 return AllocNode;
874}
875
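// Illustrative example: a MIB whose context stack ids are [S1, S2, S3] (S1
// closest to the allocation) allocates one new context id and produces the
// caller-edge chain Alloc <- S1 <- S2 <- S3, with that id recorded on every
// node and edge along the chain.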
876template <typename DerivedCCG, typename FuncTy, typename CallTy>
877template <class NodeT, class IteratorT>
878void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
879 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
880 CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType) {
881 // Treating the hot alloc type as NotCold before the disambiguation for "hot"
882 // is done.
883 if (AllocType == AllocationType::Hot)
884 AllocType = AllocationType::NotCold;
885
886 ContextIdToAllocationType[++LastContextId] = AllocType;
887
888 // Update alloc type and context ids for this MIB.
889 AllocNode->AllocTypes |= (uint8_t)AllocType;
890 AllocNode->ContextIds.insert(LastContextId);
891
892 // Now add or update nodes for each stack id in alloc's context.
893 // Later when processing the stack ids on non-alloc callsites we will adjust
894 // for any inlining in the context.
895 ContextNode *PrevNode = AllocNode;
896 // Look for recursion (direct recursion should have been collapsed by
897 // module summary analysis, here we should just be detecting mutual
898 // recursion). Mark these nodes so we don't try to clone.
899 SmallSet<uint64_t, 8> StackIdSet;
900 // Skip any on the allocation call (inlining).
901 for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
902 ContextIter != StackContext.end(); ++ContextIter) {
903 auto StackId = getStackId(*ContextIter);
904 ContextNode *StackNode = getNodeForStackId(StackId);
905 if (!StackNode) {
906 NodeOwner.push_back(
907 std::make_unique<ContextNode>(/*IsAllocation=*/false));
908 StackNode = NodeOwner.back().get();
909 StackEntryIdToContextNodeMap[StackId] = StackNode;
910 StackNode->OrigStackOrAllocId = StackId;
911 }
912 auto Ins = StackIdSet.insert(StackId);
913 if (!Ins.second)
914 StackNode->Recursive = true;
915 StackNode->ContextIds.insert(LastContextId);
916 StackNode->AllocTypes |= (uint8_t)AllocType;
917 PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
918 PrevNode = StackNode;
919 }
920}
921
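// Illustrative example: duplicating the set {3, 7} when LastContextId is 10
// returns the fresh ids {11, 12}, records each old id's new duplicate(s) in
// OldToNewContextIds, and gives every new id the same allocation type as the
// old id it was copied from.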
922template <typename DerivedCCG, typename FuncTy, typename CallTy>
923DenseSet<uint32_t>
924CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
925 const DenseSet<uint32_t> &StackSequenceContextIds,
926 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
927 DenseSet<uint32_t> NewContextIds;
928 for (auto OldId : StackSequenceContextIds) {
929 NewContextIds.insert(++LastContextId);
930 OldToNewContextIds[OldId].insert(LastContextId);
931 assert(ContextIdToAllocationType.count(OldId));
932 // The new context has the same allocation type as original.
933 ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
934 }
935 return NewContextIds;
936}
937
938template <typename DerivedCCG, typename FuncTy, typename CallTy>
939void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
940 propagateDuplicateContextIds(
941 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
942 // Build a set of duplicated context ids corresponding to the input id set.
943 auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
944 DenseSet<uint32_t> NewIds;
945 for (auto Id : ContextIds)
946 if (auto NewId = OldToNewContextIds.find(Id);
947 NewId != OldToNewContextIds.end())
948 NewIds.insert(NewId->second.begin(), NewId->second.end());
949 return NewIds;
950 };
951
952 // Recursively update context ids sets along caller edges.
953 auto UpdateCallers = [&](ContextNode *Node,
954 DenseSet<const ContextEdge *> &Visited,
955 auto &&UpdateCallers) -> void {
956 for (const auto &Edge : Node->CallerEdges) {
957 auto Inserted = Visited.insert(Edge.get());
958 if (!Inserted.second)
959 continue;
960 ContextNode *NextNode = Edge->Caller;
961 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
962 // Only need to recursively iterate to NextNode via this caller edge if
963 // it resulted in any added ids to NextNode.
964 if (!NewIdsToAdd.empty()) {
965 Edge->getContextIds().insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
966 NextNode->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
967 UpdateCallers(NextNode, Visited, UpdateCallers);
968 }
969 }
970 };
971
972 DenseSet<const ContextEdge *> Visited;
973 for (auto &Entry : AllocationCallToContextNodeMap) {
974 auto *Node = Entry.second;
975 // Update ids on the allocation nodes before calling the recursive
976 // update along caller edges, since this simplifies the logic during
977 // that traversal.
978 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Node->ContextIds);
979 Node->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
980 UpdateCallers(Node, Visited, UpdateCallers);
981 }
982}
983
984template <typename DerivedCCG, typename FuncTy, typename CallTy>
985void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
986 ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee) {
987 // Make a copy of the context ids, since this will be adjusted below as they
988 // are moved.
989 DenseSet<uint32_t> RemainingContextIds = NewNode->ContextIds;
990 auto &OrigEdges =
991 TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
992 // Increment iterator in loop so that we can remove edges as needed.
993 for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
994 auto Edge = *EI;
995 // Remove any matching context ids from Edge; the set of ids found and
996 // removed becomes the new edge's context ids. Also update the remaining
997 // (not found) ids.
998 DenseSet<uint32_t> NewEdgeContextIds, NotFoundContextIds;
999 set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
1000 NotFoundContextIds);
1001 RemainingContextIds.swap(NotFoundContextIds);
1002 // If no matching context ids for this edge, skip it.
1003 if (NewEdgeContextIds.empty()) {
1004 ++EI;
1005 continue;
1006 }
1007 if (TowardsCallee) {
1008 auto NewEdge = std::make_shared<ContextEdge>(
1009 Edge->Callee, NewNode, computeAllocType(NewEdgeContextIds),
1010 NewEdgeContextIds);
1011 NewNode->CalleeEdges.push_back(NewEdge);
1012 NewEdge->Callee->CallerEdges.push_back(NewEdge);
1013 } else {
1014 auto NewEdge = std::make_shared<ContextEdge>(
1015 NewNode, Edge->Caller, computeAllocType(NewEdgeContextIds),
1016 NewEdgeContextIds);
1017 NewNode->CallerEdges.push_back(NewEdge);
1018 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
1019 }
1020 // Remove old edge if context ids empty.
1021 if (Edge->getContextIds().empty()) {
1022 if (TowardsCallee) {
1023 Edge->Callee->eraseCallerEdge(Edge.get());
1024 EI = OrigNode->CalleeEdges.erase(EI);
1025 } else {
1026 Edge->Caller->eraseCalleeEdge(Edge.get());
1027 EI = OrigNode->CallerEdges.erase(EI);
1028 }
1029 continue;
1030 }
1031 ++EI;
1032 }
1033}
1034
1035template <typename DerivedCCG, typename FuncTy, typename CallTy>
1036void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1037 assignStackNodesPostOrder(ContextNode *Node,
1038 DenseSet<const ContextNode *> &Visited,
1039 DenseMap<uint64_t, std::vector<CallContextInfo>>
1040 &StackIdToMatchingCalls) {
1041 auto Inserted = Visited.insert(Node);
1042 if (!Inserted.second)
1043 return;
1044 // Post order traversal. Iterate over a copy since we may add nodes and
1045 // therefore new callers during the recursive call, invalidating any
1046 // iterator over the original edge vector. We don't need to process these
1047 // new nodes as they were already processed on creation.
1048 auto CallerEdges = Node->CallerEdges;
1049 for (auto &Edge : CallerEdges) {
1050 // Skip any that have been removed during the recursion.
1051 if (!Edge)
1052 continue;
1053 assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls);
1054 }
1055
1056 // If this node's stack id is in the map, update the graph to contain new
1057 // nodes representing any inlining at interior callsites. Note we move the
1058 // associated context ids over to the new nodes.
1059
1060 // Ignore this node if it is for an allocation or we didn't record any
1061 // stack id lists ending at it.
1062 if (Node->IsAllocation ||
1063 !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
1064 return;
1065
1066 auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
1067 // Handle the simple case first. A single call with a single stack id.
1068 // In this case there is no need to create any new context nodes, simply
1069 // assign the context node for stack id to this Call.
1070 if (Calls.size() == 1) {
1071 auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
1072 if (Ids.size() == 1) {
1073 assert(SavedContextIds.empty());
1074 // It should be this Node
1075 assert(Node == getNodeForStackId(Ids[0]));
1076 if (Node->Recursive)
1077 return;
1078 Node->setCall(Call);
1079 NonAllocationCallToContextNodeMap[Call] = Node;
1080 NodeToCallingFunc[Node] = Func;
1081 return;
1082 }
1083 }
1084
1085 // Find the node for the last stack id, which should be the same
1086 // across all calls recorded for this id, and is this node's id.
1087 uint64_t LastId = Node->OrigStackOrAllocId;
1088 ContextNode *LastNode = getNodeForStackId(LastId);
1089 // We should only have kept stack ids that had nodes.
1090 assert(LastNode);
1091
1092 for (unsigned I = 0; I < Calls.size(); I++) {
1093 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1094 // Skip any for which we didn't assign any ids, these don't get a node in
1095 // the graph.
1096 if (SavedContextIds.empty())
1097 continue;
1098
1099 assert(LastId == Ids.back());
1100
1101 ContextNode *FirstNode = getNodeForStackId(Ids[0]);
1102 assert(FirstNode);
1103
1104 // Recompute the context ids for this stack id sequence (the
1105 // intersection of the context ids of the corresponding nodes).
1106 // Start with the ids we saved in the map for this call, which could be
1107 // duplicated context ids. We have to recompute as we might have overlap
1108 // between the saved context ids for different last nodes, and may have
1109 // already removed some of them during the post order traversal.
1110 set_intersect(SavedContextIds, FirstNode->ContextIds);
1111 ContextNode *PrevNode = nullptr;
1112 for (auto Id : Ids) {
1113 ContextNode *CurNode = getNodeForStackId(Id);
1114 // We should only have kept stack ids that had nodes and weren't
1115 // recursive.
1116 assert(CurNode);
1117 assert(!CurNode->Recursive);
1118 if (!PrevNode) {
1119 PrevNode = CurNode;
1120 continue;
1121 }
1122 auto *Edge = CurNode->findEdgeFromCallee(PrevNode);
1123 if (!Edge) {
1124 SavedContextIds.clear();
1125 break;
1126 }
1127 PrevNode = CurNode;
1128 set_intersect(SavedContextIds, Edge->getContextIds());
1129
1130 // If we now have no context ids for clone, skip this call.
1131 if (SavedContextIds.empty())
1132 break;
1133 }
1134 if (SavedContextIds.empty())
1135 continue;
1136
1137 // Create new context node.
1138 NodeOwner.push_back(
1139 std::make_unique<ContextNode>(/*IsAllocation=*/false, Call));
1140 ContextNode *NewNode = NodeOwner.back().get();
1141 NodeToCallingFunc[NewNode] = Func;
1142 NonAllocationCallToContextNodeMap[Call] = NewNode;
1143 NewNode->ContextIds = SavedContextIds;
1144 NewNode->AllocTypes = computeAllocType(NewNode->ContextIds);
1145
1146 // Connect to callees of innermost stack frame in inlined call chain.
1147 // This updates context ids for FirstNode's callees to reflect those
1148 // moved to NewNode.
1149 connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true);
1150
1151 // Connect to callers of outermost stack frame in inlined call chain.
1152 // This updates context ids for LastNode's callers to reflect those
1153 // moved to NewNode.
1154 connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false);
1155
1156 // Now we need to remove context ids from edges/nodes between First and
1157 // Last Node.
1158 PrevNode = nullptr;
1159 for (auto Id : Ids) {
1160 ContextNode *CurNode = getNodeForStackId(Id);
1161 // We should only have kept stack ids that had nodes.
1162 assert(CurNode);
1163
1164 // Remove the context ids moved to NewNode from CurNode, and the
1165 // edge from the prior node.
1166 set_subtract(CurNode->ContextIds, NewNode->ContextIds);
1167 if (PrevNode) {
1168 auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
1169 assert(PrevEdge);
1170 set_subtract(PrevEdge->getContextIds(), NewNode->ContextIds);
1171 if (PrevEdge->getContextIds().empty()) {
1172 PrevNode->eraseCallerEdge(PrevEdge);
1173 CurNode->eraseCalleeEdge(PrevEdge);
1174 }
1175 }
1176 PrevNode = CurNode;
1177 }
1178 }
1179}
1180
1181template <typename DerivedCCG, typename FuncTy, typename CallTy>
1182void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
1183 // Map of stack id to all calls with that as the last (outermost caller)
1184 // callsite id that has a context node (some might not due to pruning
1185 // performed during matching of the allocation profile contexts).
1186 // The CallContextInfo contains the Call and a list of its stack ids with
1187 // ContextNodes, the function containing Call, and the set of context ids
1188 // the analysis will eventually identify for use in any new node created
1189 // for that callsite.
1190 DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
1191 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
1192 for (auto &Call : CallsWithMetadata) {
1193 // Ignore allocations, already handled.
1194 if (AllocationCallToContextNodeMap.count(Call))
1195 continue;
1196 auto StackIdsWithContextNodes =
1197 getStackIdsWithContextNodesForCall(Call.call());
1198 // If there were no nodes created for MIBs on allocs (maybe this was in
1199 // the unambiguous part of the MIB stack that was pruned), ignore.
1200 if (StackIdsWithContextNodes.empty())
1201 continue;
1202 // Otherwise, record this Call along with the list of ids for the last
1203 // (outermost caller) stack id with a node.
1204 StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
1205 {Call.call(), StackIdsWithContextNodes, Func, {}});
1206 }
1207 }
1208
1209 // First make a pass through all stack ids that correspond to a call,
1210 // as identified in the above loop. Compute the context ids corresponding to
1211 // each of these calls when they correspond to multiple stack ids due to
1212 // inlining. Perform any duplication of context ids required when
1213 // there is more than one call with the same stack ids. Their (possibly newly
1214 // duplicated) context ids are saved in the StackIdToMatchingCalls map.
1215 DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
1216 for (auto &It : StackIdToMatchingCalls) {
1217 auto &Calls = It.getSecond();
1218 // Skip single calls with a single stack id. These don't need a new node.
1219 if (Calls.size() == 1) {
1220 auto &Ids = std::get<1>(Calls[0]);
1221 if (Ids.size() == 1)
1222 continue;
1223 }
1224 // In order to do the best and maximal matching of inlined calls to context
1225 // node sequences we will sort the vectors of stack ids in descending order
1226 // of length, and within each length, lexicographically by stack id. The
1227 // latter is so that we can specially handle calls that have identical stack
1228 // id sequences (either due to cloning or artificially because of the MIB
1229 // context pruning).
1230 std::stable_sort(Calls.begin(), Calls.end(),
1231 [](const CallContextInfo &A, const CallContextInfo &B) {
1232 auto &IdsA = std::get<1>(A);
1233 auto &IdsB = std::get<1>(B);
1234 return IdsA.size() > IdsB.size() ||
1235 (IdsA.size() == IdsB.size() && IdsA < IdsB);
1236 });
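    // For example (illustrative), the id sequences {1,2,3}, {1,3} and {1,2}
    // sort as {1,2,3}, {1,2}, {1,3}: longest first, ties broken
    // lexicographically, so identical sequences end up adjacent.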
1237
1238 // Find the node for the last stack id, which should be the same
1239 // across all calls recorded for this id, and is the id for this
1240 // entry in the StackIdToMatchingCalls map.
1241 uint64_t LastId = It.getFirst();
1242 ContextNode *LastNode = getNodeForStackId(LastId);
1243 // We should only have kept stack ids that had nodes.
1244 assert(LastNode);
1245
1246 if (LastNode->Recursive)
1247 continue;
1248
1249 // Initialize the context ids with the last node's. We will subsequently
1250 // refine the context ids by computing the intersection along all edges.
1251 DenseSet<uint32_t> LastNodeContextIds = LastNode->ContextIds;
1252 assert(!LastNodeContextIds.empty());
1253
1254 for (unsigned I = 0; I < Calls.size(); I++) {
1255 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1256 assert(SavedContextIds.empty());
1257 assert(LastId == Ids.back());
1258
1259 // First compute the context ids for this stack id sequence (the
1260 // intersection of the context ids of the corresponding nodes).
1261 // Start with the remaining saved ids for the last node.
1262 assert(!LastNodeContextIds.empty());
1263 DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
1264
1265 ContextNode *PrevNode = LastNode;
1266 ContextNode *CurNode = LastNode;
1267 bool Skip = false;
1268
1269 // Iterate backwards through the stack Ids, starting after the last Id
1270 // in the list, which was handled once outside for all Calls.
1271 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1272 auto Id = *IdIter;
1273 CurNode = getNodeForStackId(Id);
1274 // We should only have kept stack ids that had nodes.
1275 assert(CurNode);
1276
1277 if (CurNode->Recursive) {
1278 Skip = true;
1279 break;
1280 }
1281
1282 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1283 // If there is no edge then the nodes belong to different MIB contexts,
1284 // and we should skip this inlined context sequence. For example, this
1285 // particular inlined context may include stack ids A->B, and we may
1286 // indeed have nodes for both A and B, but it is possible that they were
1287 // never profiled in sequence in a single MIB for any allocation (i.e.
1288 // we might have profiled an allocation that involves the callsite A,
1289 // but through a different one of its callee callsites, and we might
1290 // have profiled an allocation that involves callsite B, but reached
1291 // from a different caller callsite).
1292 if (!Edge) {
1293 Skip = true;
1294 break;
1295 }
1296 PrevNode = CurNode;
1297
1298 // Update the context ids, which is the intersection of the ids along
1299 // all edges in the sequence.
1300 set_intersect(StackSequenceContextIds, Edge->getContextIds());
1301
1302 // If we now have no context ids for clone, skip this call.
1303 if (StackSequenceContextIds.empty()) {
1304 Skip = true;
1305 break;
1306 }
1307 }
1308 if (Skip)
1309 continue;
1310
1311 // If some of this call's stack ids did not have corresponding nodes (due
1312 // to pruning), don't include any context ids for contexts that extend
1313 // beyond these nodes. Otherwise we would be matching part of unrelated /
1314 // not fully matching stack contexts. To do this, subtract any context ids
1315 // found in caller nodes of the last node found above.
1316 if (Ids.back() != getLastStackId(Call)) {
1317 for (const auto &PE : CurNode->CallerEdges) {
1318 set_subtract(StackSequenceContextIds, PE->getContextIds());
1319 if (StackSequenceContextIds.empty())
1320 break;
1321 }
1322 // If we now have no context ids for clone, skip this call.
1323 if (StackSequenceContextIds.empty())
1324 continue;
1325 }
1326
1327 // Check if the next set of stack ids is the same (since the Calls vector
1328 // of tuples is sorted by the stack ids we can just look at the next one).
1329 bool DuplicateContextIds = false;
1330 if (I + 1 < Calls.size()) {
1331 auto NextIds = std::get<1>(Calls[I + 1]);
1332 DuplicateContextIds = Ids == NextIds;
1333 }
1334
1335 // If we don't have duplicate context ids, then we can assign all the
1336 // context ids computed for the original node sequence to this call.
1337 // If there are duplicate calls with the same stack ids then we synthesize
1338 // new context ids that are duplicates of the originals. These are
1339 // assigned to SavedContextIds, which is a reference into the map entry
1340 // for this call, allowing us to access these ids later on.
1341 OldToNewContextIds.reserve(OldToNewContextIds.size() +
1342 StackSequenceContextIds.size());
1343 SavedContextIds =
1344 DuplicateContextIds
1345 ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
1346 : StackSequenceContextIds;
1347 assert(!SavedContextIds.empty());
1348
1349 if (!DuplicateContextIds) {
1350 // Update saved last node's context ids to remove those that are
1351 // assigned to other calls, so that it is ready for the next call at
1352 // this stack id.
1353 set_subtract(LastNodeContextIds, StackSequenceContextIds);
1354 if (LastNodeContextIds.empty())
1355 break;
1356 }
1357 }
1358 }
1359
1360 // Propagate the duplicate context ids over the graph.
1361 propagateDuplicateContextIds(OldToNewContextIds);
1362
1363 if (VerifyCCG)
1364 check();
1365
1366 // Now perform a post-order traversal over the graph, starting with the
1367 // allocation nodes, essentially processing nodes from callers to callees.
1368 // For any that contains an id in the map, update the graph to contain new
1369 // nodes representing any inlining at interior callsites. Note we move the
1370 // associated context ids over to the new nodes.
1371 DenseSet<const ContextNode *> Visited;
1372 for (auto &Entry : AllocationCallToContextNodeMap)
1373 assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls);
1374}
1375
1376uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
1377 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
1378 Call->getMetadata(LLVMContext::MD_callsite));
1379 return CallsiteContext.back();
1380}
1381
1382uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
1383 assert(isa<CallsiteInfo *>(Call.getBase()));
1384 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
1385 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
1386 // Need to convert index into stack id.
1387 return Index.getStackIdAtIndex(CallsiteContext.back());
1388}
1389
1390static const std::string MemProfCloneSuffix = ".memprof.";
1391
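// For example, getMemProfFuncName("foo", 0) returns "foo", while
// getMemProfFuncName("foo", 2) returns "foo.memprof.2".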
1392static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
1393 // We use CloneNo == 0 to refer to the original version, which doesn't get
1394 // renamed with a suffix.
1395 if (!CloneNo)
1396 return Base.str();
1397 return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
1398}
1399
1400std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
1401 const Instruction *Call,
1402 unsigned CloneNo) const {
1403 return (Twine(Call->getFunction()->getName()) + " -> " +
1404 cast<CallBase>(Call)->getCalledFunction()->getName())
1405 .str();
1406}
1407
1408std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
1409 const IndexCall &Call,
1410 unsigned CloneNo) const {
1411 auto VI = FSToVIMap.find(Func);
1412 assert(VI != FSToVIMap.end());
1413 if (isa<AllocInfo *>(Call.getBase()))
1414 return (VI->second.name() + " -> alloc").str();
1415 else {
1416 auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call.getBase());
1417 return (VI->second.name() + " -> " +
1418 getMemProfFuncName(Callsite->Callee.name(),
1419 Callsite->Clones[CloneNo]))
1420 .str();
1421 }
1422}
1423
1424std::vector<uint64_t>
1425ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
1426 Instruction *Call) {
1427 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
1428 Call->getMetadata(LLVMContext::MD_callsite));
1429 return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
1430 CallsiteContext);
1431}
1432
1433std::vector<uint64_t>
1434IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
1435 assert(isa<CallsiteInfo *>(Call.getBase()));
1436 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
1437 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
1438 return getStackIdsWithContextNodes<CallsiteInfo,
1439 SmallVector<unsigned>::const_iterator>(
1440 CallsiteContext);
1441}
1442
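// Returns the prefix of the context's stack ids that have corresponding
// context nodes in the graph, stopping at the first id without a node (e.g.
// one that was pruned or only appeared on inlined frames).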
1443template <typename DerivedCCG, typename FuncTy, typename CallTy>
1444template <class NodeT, class IteratorT>
1445std::vector<uint64_t>
1446CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
1447 CallStack<NodeT, IteratorT> &CallsiteContext) {
1448 std::vector<uint64_t> StackIds;
1449 for (auto IdOrIndex : CallsiteContext) {
1450 auto StackId = getStackId(IdOrIndex);
1451 ContextNode *Node = getNodeForStackId(StackId);
1452 if (!Node)
1453 break;
1454 StackIds.push_back(StackId);
1455 }
1456 return StackIds;
1457}
1458
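// Sketch of the metadata this constructor consumes (simplified; the stack ids
// and exact operand layout depend on the matcher): an allocation call carries
// !memprof with one MIB node per profiled context and !callsite with the ids
// of the enclosing (possibly inlined) call frames, e.g.:
//   %call = call ptr @malloc(i64 8), !memprof !0, !callsite !5
//   !0 = !{!1, !3}
//   !1 = !{!2, !"notcold"}
//   !2 = !{i64 123, i64 456}   ; leaf-to-root stack ids for this context
//   !3 = !{!4, !"cold"}
//   !4 = !{i64 123, i64 789}
//   !5 = !{i64 123}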
1459ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
1460 Module &M, function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
1461 : Mod(M), OREGetter(OREGetter) {
1462 for (auto &F : M) {
1463 std::vector<CallInfo> CallsWithMetadata;
1464 for (auto &BB : F) {
1465 for (auto &I : BB) {
1466 if (!isa<CallBase>(I))
1467 continue;
1468 if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
1469 CallsWithMetadata.push_back(&I);
1470 auto *AllocNode = addAllocNode(&I, &F);
1471 auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
1472 assert(CallsiteMD);
1473 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
1474 // Add all of the MIBs and their stack nodes.
1475 for (auto &MDOp : MemProfMD->operands()) {
1476 auto *MIBMD = cast<const MDNode>(MDOp);
1477 MDNode *StackNode = getMIBStackNode(MIBMD);
1478 assert(StackNode);
1479 CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
1480 addStackNodesForMIB<MDNode, MDNode::op_iterator>(
1481 AllocNode, StackContext, CallsiteContext,
1482 getMIBAllocType(MIBMD));
1483 }
1484 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
1485 // Memprof and callsite metadata on memory allocations no longer
1486 // needed.
1487 I.setMetadata(LLVMContext::MD_memprof, nullptr);
1488 I.setMetadata(LLVMContext::MD_callsite, nullptr);
1489 }
1490 // For callsite metadata, add to list for this function for later use.
1491 else if (I.getMetadata(LLVMContext::MD_callsite))
1492 CallsWithMetadata.push_back(&I);
1493 }
1494 }
1495 if (!CallsWithMetadata.empty())
1496 FuncToCallsWithMetadata.push_back({&F, CallsWithMetadata});
1497 }
1498
1499 if (DumpCCG) {
1500 dbgs() << "CCG before updating call stack chains:\n";
1501 dbgs() << *this;
1502 }
1503
1504 if (ExportToDot)
1505 exportToDot("prestackupdate");
1506
1507 updateStackNodes();
1508
1509 handleCallsitesWithMultipleTargets();
1510
1511 // Strip off remaining callsite metadata, no longer needed.
1512 for (auto &FuncEntry : FuncToCallsWithMetadata)
1513 for (auto &Call : FuncEntry.second)
1514 Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
1515}
1516
1517IndexCallsiteContextGraph::IndexCallsiteContextGraph(
1518 ModuleSummaryIndex &Index,
1519 function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
1520 isPrevailing)
1521 : Index(Index) {
1522 for (auto &I : Index) {
1523 auto VI = Index.getValueInfo(I);
1524 for (auto &S : VI.getSummaryList()) {
1525 // We should only add the prevailing nodes. Otherwise we may try to clone
1526 // in a weak copy that won't be linked (and may be different from the
1527 // prevailing version).
1528 // We only keep the memprof summary on the prevailing copy now when
1529 // building the combined index, as a space optimization; however, don't
1530 // rely on this optimization. The linker doesn't resolve local linkage
1531 // values, so don't check whether those are prevailing.
1532 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
1533 !isPrevailing(VI.getGUID(), S.get()))
1534 continue;
1535 auto *FS = dyn_cast<FunctionSummary>(S.get());
1536 if (!FS)
1537 continue;
1538 std::vector<CallInfo> CallsWithMetadata;
1539 if (!FS->allocs().empty()) {
1540 for (auto &AN : FS->mutableAllocs()) {
1541 // This can happen because of recursion elimination handling that
1542 // currently exists in ModuleSummaryAnalysis. Skip these for now.
1543 // We still added them to the summary because we need to be able to
1544 // correlate properly in applyImport in the backends.
1545 if (AN.MIBs.empty())
1546 continue;
1547 CallsWithMetadata.push_back({&AN});
1548 auto *AllocNode = addAllocNode({&AN}, FS);
1549 // Pass an empty CallStack to the CallsiteContext (second)
1550 // parameter, since for ThinLTO we already collapsed out the inlined
1551 // stack ids on the allocation call during ModuleSummaryAnalysis.
1552 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
1553 EmptyContext;
1554 // Now add all of the MIBs and their stack nodes.
1555 for (auto &MIB : AN.MIBs) {
1556 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
1557 StackContext(&MIB);
1558 addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
1559 AllocNode, StackContext, EmptyContext, MIB.AllocType);
1560 }
1561 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
1562 // Initialize version 0 on the summary alloc node to the current alloc
1563 // type, unless it has both types, in which case make it default, so
1564 // that if we aren't able to clone, the original version always ends
1565 // up with the default allocation behavior.
1566 AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
1567 }
1568 }
1569 // For callsite metadata, add to list for this function for later use.
1570 if (!FS->callsites().empty())
1571 for (auto &SN : FS->mutableCallsites())
1572 CallsWithMetadata.push_back({&SN});
1573
1574 if (!CallsWithMetadata.empty())
1575 FuncToCallsWithMetadata.push_back({FS, CallsWithMetadata});
1576
1577 if (!FS->allocs().empty() || !FS->callsites().empty())
1578 FSToVIMap[FS] = VI;
1579 }
1580 }
1581
1582 if (DumpCCG) {
1583 dbgs() << "CCG before updating call stack chains:\n";
1584 dbgs() << *this;
1585 }
1586
1587 if (ExportToDot)
1588 exportToDot("prestackupdate");
1589
1590 updateStackNodes();
1591
1592 handleCallsitesWithMultipleTargets();
1593}
1594
1595template <typename DerivedCCG, typename FuncTy, typename CallTy>
1596void CallsiteContextGraph<DerivedCCG, FuncTy,
1597 CallTy>::handleCallsitesWithMultipleTargets() {
1598 // Look for and work around callsites that call multiple functions.
1599 // This can happen for indirect calls, which need better handling, and in
1600 // rarer cases (e.g. macro expansion).
1601 // TODO: To fix this for indirect calls we will want to perform speculative
1602 // devirtualization using either the normal PGO info with ICP, or using the
1603 // information in the profiled MemProf contexts. We can do this prior to
1604 // this transformation for regular LTO, and for ThinLTO we can simulate that
1605 // effect in the summary and perform the actual speculative devirtualization
1606 // while cloning in the ThinLTO backend.
1607 for (auto Entry = NonAllocationCallToContextNodeMap.begin();
1608 Entry != NonAllocationCallToContextNodeMap.end();) {
1609 auto *Node = Entry->second;
1610 assert(Node->Clones.empty());
1611 // Check all node callees and see if in the same function.
1612 bool Removed = false;
1613 auto Call = Node->Call.call();
1614 for (auto &Edge : Node->CalleeEdges) {
1615 if (!Edge->Callee->hasCall())
1616 continue;
1617 assert(NodeToCallingFunc.count(Edge->Callee));
1618 // Check if the called function matches that of the callee node.
1619 if (calleeMatchesFunc(Call, NodeToCallingFunc[Edge->Callee]))
1620 continue;
1621 // Work around by setting Node to have a null call, so it gets
1622 // skipped during cloning. Otherwise assignFunctions will assert
1623 // because its data structures are not designed to handle this case.
1624 Entry = NonAllocationCallToContextNodeMap.erase(Entry);
1625 Node->setCall(CallInfo());
1626 Removed = true;
1627 break;
1628 }
1629 if (!Removed)
1630 Entry++;
1631 }
1632}
1633
1634uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
1635 // In the Module (IR) case this is already the Id.
1636 return IdOrIndex;
1637}
1638
1639uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
1640 // In the Index case this is an index into the stack id list in the summary
1641 // index, convert it to an Id.
1642 return Index.getStackIdAtIndex(IdOrIndex);
1643}
1644
1645bool ModuleCallsiteContextGraph::calleeMatchesFunc(Instruction *Call,
1646 const Function *Func) {
1647 auto *CB = dyn_cast<CallBase>(Call);
1648 if (!CB->getCalledOperand())
1649 return false;
1650 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
1651 auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
1652 if (CalleeFunc == Func)
1653 return true;
1654 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
1655 return Alias && Alias->getAliasee() == Func;
1656}
1657
1658bool IndexCallsiteContextGraph::calleeMatchesFunc(IndexCall &Call,
1659 const FunctionSummary *Func) {
1660 ValueInfo Callee =
1661 dyn_cast_if_present<CallsiteInfo *>(Call.getBase())->Callee;
1662 // If there is no summary list then this is a call to an externally defined
1663 // symbol.
1664 AliasSummary *Alias =
1665 Callee.getSummaryList().empty()
1666 ? nullptr
1667 : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
1668 assert(FSToVIMap.count(Func));
1669 return Callee == FSToVIMap[Func] ||
1670 // If callee is an alias, check the aliasee, since only function
1671 // summary base objects will contain the stack node summaries and thus
1672 // get a context node.
1673 (Alias && Alias->getAliaseeVI() == FSToVIMap[Func]);
1674}
1675
1676static std::string getAllocTypeString(uint8_t AllocTypes) {
1677 if (!AllocTypes)
1678 return "None";
1679 std::string Str;
1680 if (AllocTypes & (uint8_t)AllocationType::NotCold)
1681 Str += "NotCold";
1682 if (AllocTypes & (uint8_t)AllocationType::Cold)
1683 Str += "Cold";
1684 return Str;
1685}
1686
1687template <typename DerivedCCG, typename FuncTy, typename CallTy>
1688void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
1689 const {
1690 print(dbgs());
1691 dbgs() << "\n";
1692}
1693
1694template <typename DerivedCCG, typename FuncTy, typename CallTy>
1695void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
1696 raw_ostream &OS) const {
1697 OS << "Node " << this << "\n";
1698 OS << "\t";
1699 printCall(OS);
1700 if (Recursive)
1701 OS << " (recursive)";
1702 OS << "\n";
1703 OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
1704 OS << "\tContextIds:";
1705 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
1706 std::sort(SortedIds.begin(), SortedIds.end());
1707 for (auto Id : SortedIds)
1708 OS << " " << Id;
1709 OS << "\n";
1710 OS << "\tCalleeEdges:\n";
1711 for (auto &Edge : CalleeEdges)
1712 OS << "\t\t" << *Edge << "\n";
1713 OS << "\tCallerEdges:\n";
1714 for (auto &Edge : CallerEdges)
1715 OS << "\t\t" << *Edge << "\n";
1716 if (!Clones.empty()) {
1717 OS << "\tClones: ";
1718 FieldSeparator FS;
1719 for (auto *Clone : Clones)
1720 OS << FS << Clone;
1721 OS << "\n";
1722 } else if (CloneOf) {
1723 OS << "\tClone of " << CloneOf << "\n";
1724 }
1725}
1726
1727template <typename DerivedCCG, typename FuncTy, typename CallTy>
1728void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
1729 const {
1730 print(dbgs());
1731 dbgs() << "\n";
1732}
1733
1734template <typename DerivedCCG, typename FuncTy, typename CallTy>
1735void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
1736 raw_ostream &OS) const {
1737 OS << "Edge from Callee " << Callee << " to Caller: " << Caller
1738 << " AllocTypes: " << getAllocTypeString(AllocTypes);
1739 OS << " ContextIds:";
1740 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
1741 std::sort(SortedIds.begin(), SortedIds.end());
1742 for (auto Id : SortedIds)
1743 OS << " " << Id;
1744}
1745
1746template <typename DerivedCCG, typename FuncTy, typename CallTy>
1747void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
1748 print(dbgs());
1749}
1750
1751template <typename DerivedCCG, typename FuncTy, typename CallTy>
1752void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
1753 raw_ostream &OS) const {
1754 OS << "Callsite Context Graph:\n";
1755 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
1756 for (const auto Node : nodes<GraphType>(this)) {
1757 if (Node->isRemoved())
1758 continue;
1759 Node->print(OS);
1760 OS << "\n";
1761 }
1762}
1763
1764template <typename DerivedCCG, typename FuncTy, typename CallTy>
1765static void checkEdge(
1766 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
1767 // Confirm that alloc type is not None and that we have at least one context
1768 // id.
1769 assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
1770 assert(!Edge->ContextIds.empty());
1771}
1772
1773template <typename DerivedCCG, typename FuncTy, typename CallTy>
1774static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
1775 bool CheckEdges = true) {
1776 if (Node->isRemoved())
1777 return;
1778 // Node's context ids should be the union of both its callee and caller edge
1779 // context ids.
1780 if (Node->CallerEdges.size()) {
1781 auto EI = Node->CallerEdges.begin();
1782 auto &FirstEdge = *EI;
1783 EI++;
1784 DenseSet<uint32_t> CallerEdgeContextIds(FirstEdge->ContextIds);
1785 for (; EI != Node->CallerEdges.end(); EI++) {
1786 const auto &Edge = *EI;
1787 if (CheckEdges)
1788 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1789 set_union(CallerEdgeContextIds, Edge->ContextIds);
1790 }
1791 // Node can have more context ids than callers if some contexts terminate at
1792 // node and some are longer.
1793 assert(Node->ContextIds == CallerEdgeContextIds ||
1794 set_is_subset(CallerEdgeContextIds, Node->ContextIds));
1795 }
1796 if (Node->CalleeEdges.size()) {
1797 auto EI = Node->CalleeEdges.begin();
1798 auto &FirstEdge = *EI;
1799 EI++;
1800 DenseSet<uint32_t> CalleeEdgeContextIds(FirstEdge->ContextIds);
1801 for (; EI != Node->CalleeEdges.end(); EI++) {
1802 const auto &Edge = *EI;
1803 if (CheckEdges)
1804 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1805 set_union(CalleeEdgeContextIds, Edge->ContextIds);
1806 }
1807 assert(Node->ContextIds == CalleeEdgeContextIds);
1808 }
1809}
1810
1811template <typename DerivedCCG, typename FuncTy, typename CallTy>
1812void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
1813 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
1814 for (const auto Node : nodes<GraphType>(this)) {
1815 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
1816 for (auto &Edge : Node->CallerEdges)
1817 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1818 }
1819}
1820
1821template <typename DerivedCCG, typename FuncTy, typename CallTy>
1822struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
1823 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
1824 using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;
1825
1826 using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
1827 static NodeRef getNode(const NodePtrTy &P) { return P.get(); }
1828
1829 using nodes_iterator =
1830 mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,
1831 decltype(&getNode)>;
1832
1833 static nodes_iterator nodes_begin(GraphType G) {
1834 return nodes_iterator(G->NodeOwner.begin(), &getNode);
1835 }
1836
1837 static nodes_iterator nodes_end(GraphType G) {
1838 return nodes_iterator(G->NodeOwner.end(), &getNode);
1839 }
1840
1841 static NodeRef getEntryNode(GraphType G) {
1842 return G->NodeOwner.begin()->get();
1843 }
1844
1845 using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
1846 static const ContextNode<DerivedCCG, FuncTy, CallTy> *
1847 GetCallee(const EdgePtrTy &P) {
1848 return P->Callee;
1849 }
1850
1851 using ChildIteratorType =
1852 mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
1853 DerivedCCG, FuncTy, CallTy>>>::const_iterator,
1854 decltype(&GetCallee)>;
1855
1856 static ChildIteratorType child_begin(NodeRef N) {
1857 return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
1858 }
1859
1860 static ChildIteratorType child_end(NodeRef N) {
1861 return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
1862 }
1863};
1864
1865template <typename DerivedCCG, typename FuncTy, typename CallTy>
1866struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
1867 : public DefaultDOTGraphTraits {
1868 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
1869
1870 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
1871 using GTraits = GraphTraits<GraphType>;
1872 using NodeRef = typename GTraits::NodeRef;
1873 using ChildIteratorType = typename GTraits::ChildIteratorType;
1874
1875 static std::string getNodeLabel(NodeRef Node, GraphType G) {
1876 std::string LabelString =
1877 (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
1878 Twine(Node->OrigStackOrAllocId))
1879 .str();
1880 LabelString += "\n";
1881 if (Node->hasCall()) {
1882 auto Func = G->NodeToCallingFunc.find(Node);
1883 assert(Func != G->NodeToCallingFunc.end());
1884 LabelString +=
1885 G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
1886 } else {
1887 LabelString += "null call";
1888 if (Node->Recursive)
1889 LabelString += " (recursive)";
1890 else
1891 LabelString += " (external)";
1892 }
1893 return LabelString;
1894 }
1895
1896 static std::string getNodeAttributes(NodeRef Node, GraphType) {
1897 std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
1898 getContextIds(Node->ContextIds) + "\"")
1899 .str();
1901 (Twine(",fillcolor=\"") + getColor(Node->AllocTypes) + "\"").str();
1902 AttributeString += ",style=\"filled\"";
1903 if (Node->CloneOf) {
1904 AttributeString += ",color=\"blue\"";
1905 AttributeString += ",style=\"filled,bold,dashed\"";
1906 } else
1907 AttributeString += ",style=\"filled\"";
1908 return AttributeString;
1909 }
1910
1911 static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
1912 GraphType) {
1913 auto &Edge = *(ChildIter.getCurrent());
1914 return (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" +
1915 Twine(",fillcolor=\"") + getColor(Edge->AllocTypes) + "\"")
1916 .str();
1917 }
1918
1919 // Since the NodeOwner list includes nodes that are no longer connected to
1920 // the graph, skip them here.
1921 static bool isNodeHidden(NodeRef Node, GraphType) {
1922 return Node->isRemoved();
1923 }
1924
1925private:
1926 static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
1927 std::string IdString = "ContextIds:";
1928 if (ContextIds.size() < 100) {
1929 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
1930 std::sort(SortedIds.begin(), SortedIds.end());
1931 for (auto Id : SortedIds)
1932 IdString += (" " + Twine(Id)).str();
1933 } else {
1934 IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
1935 }
1936 return IdString;
1937 }
1938
1939 static std::string getColor(uint8_t AllocTypes) {
1940 if (AllocTypes == (uint8_t)AllocationType::NotCold)
1941 // Color "brown1" actually looks like a lighter red.
1942 return "brown1";
1943 if (AllocTypes == (uint8_t)AllocationType::Cold)
1944 return "cyan";
1945 if (AllocTypes ==
1946 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
1947 // Lighter purple.
1948 return "mediumorchid1";
1949 return "gray";
1950 }
1951
1952 static std::string getNodeId(NodeRef Node) {
1953 std::stringstream SStream;
1954 SStream << std::hex << "N0x" << (unsigned long long)Node;
1955 std::string Result = SStream.str();
1956 return Result;
1957 }
1958};
1959
1960template <typename DerivedCCG, typename FuncTy, typename CallTy>
1961void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
1962 std::string Label) const {
1963 WriteGraph(this, "", false, Label,
1964 DotFilePathPrefix + "ccg." + Label + ".dot");
1965}
1966
1967template <typename DerivedCCG, typename FuncTy, typename CallTy>
1968typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1969CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
1970 const std::shared_ptr<ContextEdge> &Edge, EdgeIter *CallerEdgeI) {
1971 ContextNode *Node = Edge->Callee;
1972 NodeOwner.push_back(
1973 std::make_unique<ContextNode>(Node->IsAllocation, Node->Call));
1974 ContextNode *Clone = NodeOwner.back().get();
1975 Node->addClone(Clone);
1976 assert(NodeToCallingFunc.count(Node));
1977 NodeToCallingFunc[Clone] = NodeToCallingFunc[Node];
1978 moveEdgeToExistingCalleeClone(Edge, Clone, CallerEdgeI, /*NewClone=*/true);
1979 return Clone;
1980}
1981
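// Moves the given caller edge from its current callee onto NewCallee (either a
// freshly created clone or an existing one), transferring the edge's context
// ids and splitting the old callee's outgoing edges accordingly.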
1982template <typename DerivedCCG, typename FuncTy, typename CallTy>
1983void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1984 moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
1985 ContextNode *NewCallee, EdgeIter *CallerEdgeI,
1986 bool NewClone) {
1987 // NewCallee and Edge's current callee must be clones of the same original
1988 // node (Edge's current callee may be the original node too).
1989 assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
1990 auto &EdgeContextIds = Edge->getContextIds();
1991 ContextNode *OldCallee = Edge->Callee;
1992 if (CallerEdgeI)
1993 *CallerEdgeI = OldCallee->CallerEdges.erase(*CallerEdgeI);
1994 else
1995 OldCallee->eraseCallerEdge(Edge.get());
1996 Edge->Callee = NewCallee;
1997 NewCallee->CallerEdges.push_back(Edge);
1998 // Don't need to update Edge's context ids since we are simply reconnecting
1999 // it.
2000 set_subtract(OldCallee->ContextIds, EdgeContextIds);
2001 NewCallee->ContextIds.insert(EdgeContextIds.begin(), EdgeContextIds.end());
2002 NewCallee->AllocTypes |= Edge->AllocTypes;
2003 OldCallee->AllocTypes = computeAllocType(OldCallee->ContextIds);
2004 // OldCallee alloc type should be None iff its context id set is now empty.
2005 assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
2006 OldCallee->ContextIds.empty());
2007 // Now walk the old callee node's callee edges and move Edge's context ids
2008 // over to the corresponding edge into the clone (which is created here if
2009 // this is a newly created clone).
2010 for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
2011 // The context ids moving to the new callee are the subset of this edge's
2012 // context ids and the context ids on the caller edge being moved.
2013 DenseSet<uint32_t> EdgeContextIdsToMove =
2014 set_intersection(OldCalleeEdge->getContextIds(), EdgeContextIds);
2015 set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
2016 OldCalleeEdge->AllocTypes =
2017 computeAllocType(OldCalleeEdge->getContextIds());
2018 if (!NewClone) {
2019 // Update context ids / alloc type on corresponding edge to NewCallee.
2020 // There is a chance this may not exist if we are reusing an existing
2021 // clone, specifically during function assignment, where we would have
2022 // removed none type edges after creating the clone. If we can't find
2023 // a corresponding edge there, fall through to the cloning below.
2024 if (auto *NewCalleeEdge =
2025 NewCallee->findEdgeFromCallee(OldCalleeEdge->Callee)) {
2026 NewCalleeEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),
2027 EdgeContextIdsToMove.end());
2028 NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
2029 continue;
2030 }
2031 }
2032 auto NewEdge = std::make_shared<ContextEdge>(
2033 OldCalleeEdge->Callee, NewCallee,
2034 computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove);
2035 NewCallee->CalleeEdges.push_back(NewEdge);
2036 NewEdge->Callee->CallerEdges.push_back(NewEdge);
2037 }
2038 if (VerifyCCG) {
2039 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
2040 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
2041 for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
2042 checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
2043 /*CheckEdges=*/false);
2044 for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
2045 checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
2046 /*CheckEdges=*/false);
2047 }
2048}
2049
2050template <typename DerivedCCG, typename FuncTy, typename CallTy>
2051void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
2052 DenseSet<const ContextNode *> Visited;
2053 for (auto &Entry : AllocationCallToContextNodeMap)
2054 identifyClones(Entry.second, Visited);
2055}
2056
2057 // Helper function to check whether an AllocType is cold, notcold, or both.
2058static bool checkColdOrNotCold(uint8_t AllocType) {
2059 return (AllocType == (uint8_t)AllocationType::Cold) ||
2060 (AllocType == (uint8_t)AllocationType::NotCold) ||
2061 (AllocType ==
2062 ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
2063}
2064
2065template <typename DerivedCCG, typename FuncTy, typename CallTy>
2066void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
2067 ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
2068 if (VerifyNodes)
2069 checkNode<DerivedCCG, FuncTy, CallTy>(Node);
2070 assert(!Node->CloneOf);
2071
2072 // If Node has a null call, then either it wasn't found in the module (regular
2073 // LTO) or summary index (ThinLTO), or there were other conditions blocking
2074 // cloning (e.g. recursion, calls multiple targets, etc).
2075 // Do this here so that we don't try to recursively clone callers below, which
2076 // isn't useful at least for this node.
2077 if (!Node->hasCall())
2078 return;
2079
2080#ifndef NDEBUG
2081 auto Insert =
2082#endif
2083 Visited.insert(Node);
2084 // We should not have visited this node yet.
2085 assert(Insert.second);
2086 // The recursive call to identifyClones may delete the current edge from the
2087 // CallerEdges vector. Make a copy and iterate on that, which is simpler than
2088 // passing in an iterator and having the recursive call erase from it. Other edges may
2089 // also get removed during the recursion, which will have null Callee and
2090 // Caller pointers (and are deleted later), so we skip those below.
2091 {
2092 auto CallerEdges = Node->CallerEdges;
2093 for (auto &Edge : CallerEdges) {
2094 // Skip any that have been removed by an earlier recursive call.
2095 if (Edge->Callee == nullptr && Edge->Caller == nullptr) {
2096 assert(!llvm::count(Node->CallerEdges, Edge));
2097 continue;
2098 }
2099 // Ignore any caller we previously visited via another edge.
2100 if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
2101 identifyClones(Edge->Caller, Visited);
2102 }
2103 }
2104 }
2105
2106 // Check if we reached an unambiguous call or have only a single caller.
2107 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
2108 return;
2109
2110 // We need to clone.
2111
2112 // Try to keep the original version as alloc type NotCold. This will make
2113 // cases with indirect calls or any other situation with an unknown call to
2114 // the original function get the default behavior. We do this by sorting the
2115 // CallerEdges of the Node we will clone by alloc type.
2116 //
2117 // Give NotCold edge the lowest sort priority so those edges are at the end of
2118 // the caller edges vector, and stay on the original version (since the below
2119 // code clones greedily until it finds all remaining edges have the same type
2120 // and leaves the remaining ones on the original Node).
2121 //
2122 // We shouldn't actually have any None type edges, so the sorting priority for
2123 // that is arbitrary, and we assert in that case below.
2124 const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
2125 /*Cold*/ 1,
2126 /*NotColdCold*/ 2};
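// For example, caller edges with alloc types {NotCold, Cold, NotColdCold}
// sort to [Cold, NotColdCold, NotCold], so the cold edges are peeled off into
// clones first and the NotCold edge stays on the original Node.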
2127 std::stable_sort(Node->CallerEdges.begin(), Node->CallerEdges.end(),
2128 [&](const std::shared_ptr<ContextEdge> &A,
2129 const std::shared_ptr<ContextEdge> &B) {
2130 assert(checkColdOrNotCold(A->AllocTypes) &&
2131 checkColdOrNotCold(B->AllocTypes));
2132
2133 if (A->AllocTypes == B->AllocTypes)
2134 // Use the first context id for each edge as a
2135 // tie-breaker.
2136 return *A->ContextIds.begin() < *B->ContextIds.begin();
2137 return AllocTypeCloningPriority[A->AllocTypes] <
2138 AllocTypeCloningPriority[B->AllocTypes];
2139 });
2140
2141 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2142
2143 // Iterate until we find no more opportunities for disambiguating the alloc
2144 // types via cloning. In most cases this loop will terminate once the Node
2145 // has a single allocation type, in which case no more cloning is needed.
2146 // We need to be able to remove Edge from CallerEdges, so need to adjust
2147 // iterator inside the loop.
2148 for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
2149 auto CallerEdge = *EI;
2150
2151 // See if cloning the prior caller edge left this node with a single alloc
2152 // type or a single caller. In that case no more cloning of Node is needed.
2153 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
2154 break;
2155
2156 // Compute the node callee edge alloc types corresponding to the context ids
2157 // for this caller edge.
2158 std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
2159 CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
2160 for (auto &CalleeEdge : Node->CalleeEdges)
2161 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
2162 CalleeEdge->getContextIds(), CallerEdge->getContextIds()));
2163
2164 // Don't clone if doing so will not disambiguate any alloc types amongst
2165 // caller edges (including the callee edges that would be cloned).
2166 // Otherwise we will simply move all edges to the clone.
2167 //
2168 // First check if by cloning we will disambiguate the caller allocation
2169 // type from node's allocation type. Query allocTypeToUse so that we don't
2170 // bother cloning to distinguish NotCold+Cold from NotCold. Note that
2171 // neither of these should be None type.
2172 //
2173 // Then check if by cloning node at least one of the callee edges will be
2174 // disambiguated by splitting out different context ids.
2175 assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
2176 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2177 if (allocTypeToUse(CallerEdge->AllocTypes) ==
2178 allocTypeToUse(Node->AllocTypes) &&
2179 allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
2180 CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
2181 ++EI;
2182 continue;
2183 }
2184
2185 // First see if we can use an existing clone. Check each clone and its
2186 // callee edges for matching alloc types.
2187 ContextNode *Clone = nullptr;
2188 for (auto *CurClone : Node->Clones) {
2189 if (allocTypeToUse(CurClone->AllocTypes) !=
2190 allocTypeToUse(CallerEdge->AllocTypes))
2191 continue;
2192
2193 if (!allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
2194 CalleeEdgeAllocTypesForCallerEdge, CurClone->CalleeEdges))
2195 continue;
2196 Clone = CurClone;
2197 break;
2198 }
2199
2200 // The edge iterator is adjusted when we move the CallerEdge to the clone.
2201 if (Clone)
2202 moveEdgeToExistingCalleeClone(CallerEdge, Clone, &EI);
2203 else
2204 Clone = moveEdgeToNewCalleeClone(CallerEdge, &EI);
2205
2206 assert(EI == Node->CallerEdges.end() ||
2207 Node->AllocTypes != (uint8_t)AllocationType::None);
2208 // Sanity check that no alloc types on clone or its edges are None.
2209 assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
2210 assert(llvm::none_of(
2211 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
2212 return E->AllocTypes == (uint8_t)AllocationType::None;
2213 }));
2214 }
2215
2216 // Cloning may have resulted in some cloned callee edges with type None,
2217 // because they aren't carrying any contexts. Remove those edges.
2218 for (auto *Clone : Node->Clones) {
2219 removeNoneTypeCalleeEdges(Clone);
2220 if (VerifyNodes)
2221 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
2222 }
2223 // We should still have some context ids on the original Node.
2224 assert(!Node->ContextIds.empty());
2225
2226 // Remove any callee edges that ended up with alloc type None after creating
2227 // clones and updating callee edges.
2228 removeNoneTypeCalleeEdges(Node);
2229
2230 // Sanity check that no alloc types on node or edges are None.
2231 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2232 assert(llvm::none_of(Node->CalleeEdges,
2233 [&](const std::shared_ptr<ContextEdge> &E) {
2234 return E->AllocTypes == (uint8_t)AllocationType::None;
2235 }));
2236 assert(llvm::none_of(Node->CallerEdges,
2237 [&](const std::shared_ptr<ContextEdge> &E) {
2238 return E->AllocTypes == (uint8_t)AllocationType::None;
2239 }));
2240
2241 if (VerifyNodes)
2242 checkNode<DerivedCCG, FuncTy, CallTy>(Node);
2243}
2244
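// Sketch of the resulting IR for a cold allocation (the attribute group
// number is illustrative): the call gets a "memprof" string attribute
// recording the chosen allocation type, e.g.
//   %call = call ptr @malloc(i64 8) #2
//   attributes #2 = { "memprof"="cold" }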
2245void ModuleCallsiteContextGraph::updateAllocationCall(
2246 CallInfo &Call, AllocationType AllocType) {
2247 std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
2248 auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
2249 "memprof", AllocTypeString);
2250 cast<CallBase>(Call.call())->addFnAttr(A);
2251 OREGetter(Call.call()->getFunction())
2252 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
2253 << ore::NV("AllocationCall", Call.call()) << " in clone "
2254 << ore::NV("Caller", Call.call()->getFunction())
2255 << " marked with memprof allocation attribute "
2256 << ore::NV("Attribute", AllocTypeString));
2257}
2258
2259void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
2260 AllocationType AllocType) {
2261 auto *AI = Call.call().dyn_cast<AllocInfo *>();
2262 assert(AI);
2263 assert(AI->Versions.size() > Call.cloneNo());
2264 AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
2265}
2266
2267void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
2268 FuncInfo CalleeFunc) {
2269 if (CalleeFunc.cloneNo() > 0)
2270 cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
2271 OREGetter(CallerCall.call()->getFunction())
2272 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
2273 << ore::NV("Call", CallerCall.call()) << " in clone "
2274 << ore::NV("Caller", CallerCall.call()->getFunction())
2275 << " assigned to call function clone "
2276 << ore::NV("Callee", CalleeFunc.func()));
2277}
2278
2279void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
2280 FuncInfo CalleeFunc) {
2281 auto *CI = CallerCall.call().dyn_cast<CallsiteInfo *>();
2282 assert(CI &&
2283 "Caller cannot be an allocation which should not have profiled calls");
2284 assert(CI->Clones.size() > CallerCall.cloneNo());
2285 CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();
2286}
2287
2288CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
2289 Instruction *>::FuncInfo
2290ModuleCallsiteContextGraph::cloneFunctionForCallsite(
2291 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
2292 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
2293 // Use existing LLVM facilities for cloning and obtaining Call in clone
2294 ValueToValueMapTy VMap;
2295 auto *NewFunc = CloneFunction(Func.func(), VMap);
2296 std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
2297 assert(!Func.func()->getParent()->getFunction(Name));
2298 NewFunc->setName(Name);
2299 for (auto &Inst : CallsWithMetadataInFunc) {
2300 // This map always has the initial version in it.
2301 assert(Inst.cloneNo() == 0);
2302 CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
2303 }
2304 OREGetter(Func.func())
2305 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
2306 << "created clone " << ore::NV("NewFunction", NewFunc));
2307 return {NewFunc, CloneNo};
2308}
2309
2310CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
2311 IndexCall>::FuncInfo
2312IndexCallsiteContextGraph::cloneFunctionForCallsite(
2313 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
2314 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
2315 // Check how many clones we have of Call (and therefore function).
2316 // The next clone number is the current size of versions array.
2317 // Confirm this matches the CloneNo provided by the caller, which is based on
2318 // the number of function clones we have.
2319 assert(CloneNo ==
2320 (Call.call().is<AllocInfo *>()
2321 ? Call.call().dyn_cast<AllocInfo *>()->Versions.size()
2322 : Call.call().dyn_cast<CallsiteInfo *>()->Clones.size()));
2323 // Walk all the calls with metadata in this function. Create a new version
2324 // for each (by adding an entry to the Versions/Clones summary array), and
2325 // copy over the version being called for the function clone being cloned here.
2326 // Additionally, add an entry to the CallMap for the new function clone,
2327 // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
2328 // to the new call clone.
2329 for (auto &Inst : CallsWithMetadataInFunc) {
2330 // This map always has the initial version in it.
2331 assert(Inst.cloneNo() == 0);
2332 if (auto *AI = Inst.call().dyn_cast<AllocInfo *>()) {
2333 assert(AI->Versions.size() == CloneNo);
2334 // We assign the allocation type later (in updateAllocationCall), just add
2335 // an entry for it here.
2336 AI->Versions.push_back(0);
2337 } else {
2338 auto *CI = Inst.call().dyn_cast<CallsiteInfo *>();
2339 assert(CI && CI->Clones.size() == CloneNo);
2340 // We assign the clone number later (in updateCall), just add an entry for
2341 // it here.
2342 CI->Clones.push_back(0);
2343 }
2344 CallMap[Inst] = {Inst.call(), CloneNo};
2345 }
2346 return {Func.func(), CloneNo};
2347}
2348
2349// This method assigns cloned callsites to functions, cloning the functions as
2350// needed. The assignment is greedy and proceeds roughly as follows:
2351//
2352// For each function Func:
2353// For each call with graph Node having clones:
2354// Initialize ClonesWorklist to Node and its clones
2355// Initialize NodeCloneCount to 0
2356// While ClonesWorklist is not empty:
2357// Clone = pop front ClonesWorklist
2358// NodeCloneCount++
2359// If Func has been cloned less than NodeCloneCount times:
2360// If NodeCloneCount is 1:
2361// Assign Clone to original Func
2362// Continue
2363// Create a new function clone
2364// If other callers not assigned to call a function clone yet:
2365// Assign them to call new function clone
2366// Continue
2367// Assign any other caller calling the cloned version to new clone
2368//
2369// For each caller of Clone:
2370// If caller is assigned to call a specific function clone:
2371// If we cannot assign Clone to that function clone:
2372// Create new callsite Clone NewClone
2373// Add NewClone to ClonesWorklist
2374// Continue
2375// Assign Clone to existing caller's called function clone
2376// Else:
2377// If Clone not already assigned to a function clone:
2378// Assign to first function clone without assignment
2379// Assign caller to selected function clone
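// As a small worked example (hypothetical function names): if function Foo
// contains a callsite whose graph Node has one clone, the worklist is
// [Node, Clone1]. Node is assigned to the original Foo; Clone1 then forces a
// new function clone Foo.memprof.1, and all callers reaching Clone1 are
// redirected to call Foo.memprof.1.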
2380template <typename DerivedCCG, typename FuncTy, typename CallTy>
2381bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
2382 bool Changed = false;
2383
2384 // Keep track of the assignment of nodes (callsites) to function clones they
2385 // call.
2386 DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
2387
2388 // Update caller node to call function version CalleeFunc, by recording the
2389 // assignment in CallsiteToCalleeFuncCloneMap.
2390 auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
2391 const FuncInfo &CalleeFunc) {
2392 assert(Caller->hasCall());
2393 CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
2394 };
2395
2396 // Walk all functions for which we saw calls with memprof metadata, and handle
2397 // cloning for each of their calls.
2398 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
2399 FuncInfo OrigFunc(Func);
2400 // Map from each clone of OrigFunc to a map of remappings of each call of
2401 // interest (from original uncloned call to the corresponding cloned call in
2402 // that function clone).
2403 std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap;
2404 for (auto &Call : CallsWithMetadata) {
2405 ContextNode *Node = getNodeForInst(Call);
2406 // Skip call if we do not have a node for it (all uses of its stack ids
2407 // were either on inlined chains or pruned from the MIBs), or if we did
2408 // not create any clones for it.
2409 if (!Node || Node->Clones.empty())
2410 continue;
2411 assert(Node->hasCall() &&
2412 "Not having a call should have prevented cloning");
2413
2414 // Track the assignment of function clones to clones of the current
2415 // callsite Node being handled.
2416 std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
2417
2418 // Assign callsite version CallsiteClone to function version FuncClone,
2419 // and also assign (possibly cloned) Call to CallsiteClone.
2420 auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
2421 CallInfo &Call,
2422 ContextNode *CallsiteClone,
2423 bool IsAlloc) {
2424 // Record the clone of callsite node assigned to this function clone.
2425 FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
2426
2427 assert(FuncClonesToCallMap.count(FuncClone));
2428 std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone];
2429 CallInfo CallClone(Call);
2430 if (CallMap.count(Call))
2431 CallClone = CallMap[Call];
2432 CallsiteClone->setCall(CallClone);
2433 };
2434
2435 // Keep track of the clones of callsite Node that need to be assigned to
2436 // function clones. This list may be expanded in the loop body below if we
2437 // find additional cloning is required.
2438 std::deque<ContextNode *> ClonesWorklist;
2439 // Ignore original Node if we moved all of its contexts to clones.
2440 if (!Node->ContextIds.empty())
2441 ClonesWorklist.push_back(Node);
2442 ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(),
2443 Node->Clones.end());
2444
2445 // Now walk through all of the clones of this callsite Node that we need,
2446 // and determine the assignment to a corresponding clone of the current
2447 // function (creating new function clones as needed).
2448 unsigned NodeCloneCount = 0;
2449 while (!ClonesWorklist.empty()) {
2450 ContextNode *Clone = ClonesWorklist.front();
2451 ClonesWorklist.pop_front();
2452 NodeCloneCount++;
2453 if (VerifyNodes)
2454 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
2455
2456 // Need to create a new function clone if we have more callsite clones
2457 // than existing function clones, which would have been assigned to an
2458 // earlier clone in the list (we assign callsite clones to function
2459 // clones greedily).
2460 if (FuncClonesToCallMap.size() < NodeCloneCount) {
2461 // If this is the first callsite copy, assign to original function.
2462 if (NodeCloneCount == 1) {
2463 // Since FuncClonesToCallMap is empty in this case, no clones have
2464 // been created for this function yet, and no callers should have
2465 // been assigned a function clone for this callee node yet.
2466 assert(llvm::none_of(
2467 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
2468 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
2469 }));
2470 // Initialize with empty call map, assign Clone to original function
2471 // and its callers, and skip to the next clone.
2472 FuncClonesToCallMap[OrigFunc] = {};
2473 AssignCallsiteCloneToFuncClone(
2474 OrigFunc, Call, Clone,
2475 AllocationCallToContextNodeMap.count(Call));
2476 for (auto &CE : Clone->CallerEdges) {
2477 // Ignore any caller that does not have a recorded callsite Call.
2478 if (!CE->Caller->hasCall())
2479 continue;
2480 RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
2481 }
2482 continue;
2483 }
2484
2485 // First locate which copy of OrigFunc to clone again. If a caller
2486 // of this callsite clone was already assigned to call a particular
2487 // function clone, we need to redirect all of those callers to the
2488 // new function clone, and update their other callees within this
2489 // function.
2490 FuncInfo PreviousAssignedFuncClone;
2491 auto EI = llvm::find_if(
2492 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
2493 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
2494 });
2495 bool CallerAssignedToCloneOfFunc = false;
2496 if (EI != Clone->CallerEdges.end()) {
2497 const std::shared_ptr<ContextEdge> &Edge = *EI;
2498 PreviousAssignedFuncClone =
2499 CallsiteToCalleeFuncCloneMap[Edge->Caller];
2500 CallerAssignedToCloneOfFunc = true;
2501 }
2502
2503 // Clone function and save it along with the CallInfo map created
2504 // during cloning in the FuncClonesToCallMap.
2505 std::map<CallInfo, CallInfo> NewCallMap;
2506 unsigned CloneNo = FuncClonesToCallMap.size();
2507 assert(CloneNo > 0 && "Clone 0 is the original function, which "
2508 "should already exist in the map");
2509 FuncInfo NewFuncClone = cloneFunctionForCallsite(
2510 OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
2511 FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap));
2512 FunctionClonesAnalysis++;
2513 Changed = true;
2514
2515 // If no caller callsites were already assigned to a clone of this
2516 // function, we can simply assign this clone to the new func clone
2517 // and update all callers to it, then skip to the next clone.
2518 if (!CallerAssignedToCloneOfFunc) {
2519 AssignCallsiteCloneToFuncClone(
2520 NewFuncClone, Call, Clone,
2521 AllocationCallToContextNodeMap.count(Call));
2522 for (auto &CE : Clone->CallerEdges) {
2523 // Ignore any caller that does not have a recorded callsite Call.
2524 if (!CE->Caller->hasCall())
2525 continue;
2526 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
2527 }
2528 continue;
2529 }
2530
2531 // We may need to do additional node cloning in this case.
2532 // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
2533 // that were previously assigned to call PreviousAssignedFuncClone,
2534 // to record that they now call NewFuncClone.
2535 for (auto CE : Clone->CallerEdges) {
2536 // Ignore any caller that does not have a recorded callsite Call.
2537 if (!CE->Caller->hasCall())
2538 continue;
2539
2540 if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
2541 // We subsequently fall through to later handling that
2542 // will perform any additional cloning required for
2543 // callers that were calling other function clones.
2544 CallsiteToCalleeFuncCloneMap[CE->Caller] !=
2545 PreviousAssignedFuncClone)
2546 continue;
2547
2548 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
2549
2550 // If we are cloning a function that was already assigned to some
2551 // callers, then essentially we are creating new callsite clones
2552 // of the other callsites in that function that are reached by those
2553 // callers. Clone the other callees of the current callsite's caller
2554 // that were already assigned to PreviousAssignedFuncClone
2555 // accordingly. This is important since we subsequently update the
2556 // calls from the nodes in the graph and their assignments to callee
2557 // functions recorded in CallsiteToCalleeFuncCloneMap.
2558 for (auto CalleeEdge : CE->Caller->CalleeEdges) {
2559 // Skip any that have been removed on an earlier iteration when
2560 // cleaning up newly None type callee edges.
2561 if (!CalleeEdge)
2562 continue;
2563 ContextNode *Callee = CalleeEdge->Callee;
2564 // Skip the current callsite; we are looking for other
2565 // callsites Caller calls, as well as any that does not have a
2566 // recorded callsite Call.
2567 if (Callee == Clone || !Callee->hasCall())
2568 continue;
2569 ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge);
2570 removeNoneTypeCalleeEdges(NewClone);
2571 // Moving the edge may have resulted in some none type
2572 // callee edges on the original Callee.
2573 removeNoneTypeCalleeEdges(Callee);
2574 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
2575 // If the Callee node was already assigned to call a specific
2576 // function version, make sure its new clone is assigned to call
2577 // that same function clone.
2578 if (CallsiteToCalleeFuncCloneMap.count(Callee))
2579 RecordCalleeFuncOfCallsite(
2580 NewClone, CallsiteToCalleeFuncCloneMap[Callee]);
2581 // Update NewClone with the new Call clone of this callsite's Call
2582 // created for the new function clone created earlier.
2583 // Recall that we have already ensured when building the graph
2584 // that each caller can only call callsites within the same
2585 // function, so we are guaranteed that Callee Call is in the
2586 // current OrigFunc.
2587 // CallMap is set up as indexed by original Call at clone 0.
2588 CallInfo OrigCall(Callee->getOrigNode()->Call);
2589 OrigCall.setCloneNo(0);
2590 std::map<CallInfo, CallInfo> &CallMap =
2591 FuncClonesToCallMap[NewFuncClone];
2592 assert(CallMap.count(OrigCall));
2593 CallInfo NewCall(CallMap[OrigCall]);
2594 assert(NewCall);
2595 NewClone->setCall(NewCall);
2596 }
2597 }
2598 // Fall through to handling below to perform the recording of the
2599 // function for this callsite clone. This enables handling of cases
2600 // where the callers were assigned to different clones of a function.
2601 }
2602
2603 // See if we can use existing function clone. Walk through
2604 // all caller edges to see if any have already been assigned to
2605 // a clone of this callsite's function. If we can use it, do so. If not,
2606 // because that function clone is already assigned to a different clone
2607 // of this callsite, then we need to clone again.
2608 // Basically, this checking is needed to handle the case where different
2609 // caller functions/callsites may need versions of this function
2610 // containing different mixes of callsite clones across the different
2611 // callsites within the function. If that happens, we need to create
2612 // additional function clones to handle the various combinations.
2613 //
2614 // Keep track of any new clones of this callsite created by the
2615 // following loop, as well as any existing clone that we decided to
2616 // assign this clone to.
2617 std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
2618 FuncInfo FuncCloneAssignedToCurCallsiteClone;
2619 // We need to be able to remove Edge from CallerEdges, so need to adjust
2620 // iterator in the loop.
2621 for (auto EI = Clone->CallerEdges.begin();
2622 EI != Clone->CallerEdges.end();) {
2623 auto Edge = *EI;
2624 // Ignore any caller that does not have a recorded callsite Call.
2625 if (!Edge->Caller->hasCall()) {
2626 EI++;
2627 continue;
2628 }
2629 // If this caller already assigned to call a version of OrigFunc, need
2630 // to ensure we can assign this callsite clone to that function clone.
2631 if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
2632 FuncInfo FuncCloneCalledByCaller =
2633 CallsiteToCalleeFuncCloneMap[Edge->Caller];
2634 // First we need to confirm that this function clone is available
2635 // for use by this callsite node clone.
2636 //
2637 // While FuncCloneToCurNodeCloneMap is built only for this Node and
2638 // its callsite clones, one of those callsite clones X could have
2639 // been assigned to the same function clone called by Edge's caller
2640 // - if Edge's caller calls another callsite within Node's original
2641 // function, and that callsite has another caller reaching clone X.
2642 // We need to clone Node again in this case.
2643 if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
2644 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
2645 Clone) ||
2646 // Detect when we have multiple callers of this callsite that
2647 // have already been assigned to specific, and different, clones
2648 // of OrigFunc (due to other unrelated callsites in Func they
2649 // reach via call contexts). Is this Clone of callsite Node
2650 // assigned to a different clone of OrigFunc? If so, clone Node
2651 // again.
2652 (FuncCloneAssignedToCurCallsiteClone &&
2653 FuncCloneAssignedToCurCallsiteClone !=
2654 FuncCloneCalledByCaller)) {
2655 // We need to use a different newly created callsite clone, in
2656 // order to assign it to another new function clone on a
2657 // subsequent iteration over the Clones array (adjusted below).
2658 // Note we specifically do not reset the
2659 // CallsiteToCalleeFuncCloneMap entry for this caller, so that
2660 // when this new clone is processed later we know which version of
2661 // the function to copy (so that other callsite clones we have
2662 // assigned to that function clone are properly cloned over). See
2663 // comments in the function cloning handling earlier.
2664
2665 // Check if we already have cloned this callsite again while
2666 // walking through caller edges, for a caller calling the same
2667 // function clone. If so, we can move this edge to that new clone
2668 // rather than creating yet another new clone.
2669 if (FuncCloneToNewCallsiteCloneMap.count(
2670 FuncCloneCalledByCaller)) {
2671 ContextNode *NewClone =
2672 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
2673 moveEdgeToExistingCalleeClone(Edge, NewClone, &EI);
2674 // Cleanup any none type edges cloned over.
2675 removeNoneTypeCalleeEdges(NewClone);
2676 } else {
2677 // Create a new callsite clone.
2678 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge, &EI);
2679 removeNoneTypeCalleeEdges(NewClone);
2680 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
2681 NewClone;
2682 // Add to list of clones and process later.
2683 ClonesWorklist.push_back(NewClone);
2684 assert(EI == Clone->CallerEdges.end() ||
2685 Clone->AllocTypes != (uint8_t)AllocationType::None);
2686 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
2687 }
2688 // Moving the caller edge may have resulted in some none type
2689 // callee edges.
2690 removeNoneTypeCalleeEdges(Clone);
2691 // We will handle the newly created callsite clone in a subsequent
2692 // iteration over this Node's Clones. Continue here since we
2693 // already adjusted iterator EI while moving the edge.
2694 continue;
2695 }
2696
2697 // Otherwise, we can use the function clone already assigned to this
2698 // caller.
2699 if (!FuncCloneAssignedToCurCallsiteClone) {
2700 FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
2701 // Assign Clone to FuncCloneCalledByCaller
2702 AssignCallsiteCloneToFuncClone(
2703 FuncCloneCalledByCaller, Call, Clone,
2704 AllocationCallToContextNodeMap.count(Call));
2705 } else
2706 // Don't need to do anything - callsite is already calling this
2707 // function clone.
2708 assert(FuncCloneAssignedToCurCallsiteClone ==
2709 FuncCloneCalledByCaller);
2710
2711 } else {
2712 // We have not already assigned this caller to a version of
2713 // OrigFunc. Do the assignment now.
2714
2715 // First check if we have already assigned this callsite clone to a
2716 // clone of OrigFunc for another caller during this iteration over
2717 // its caller edges.
2718 if (!FuncCloneAssignedToCurCallsiteClone) {
2719 // Find first function in FuncClonesToCallMap without an assigned
2720 // clone of this callsite Node. We should always have one
2721 // available at this point due to the earlier cloning when the
2722 // FuncClonesToCallMap size was smaller than the clone number.
2723 for (auto &CF : FuncClonesToCallMap) {
2724 if (!FuncCloneToCurNodeCloneMap.count(CF.first)) {
2725 FuncCloneAssignedToCurCallsiteClone = CF.first;
2726 break;
2727 }
2728 }
2729 assert(FuncCloneAssignedToCurCallsiteClone);
2730 // Assign Clone to FuncCloneAssignedToCurCallsiteClone
2731 AssignCallsiteCloneToFuncClone(
2732 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
2733 AllocationCallToContextNodeMap.count(Call));
2734 } else
2735 assert(FuncCloneToCurNodeCloneMap
2736 [FuncCloneAssignedToCurCallsiteClone] == Clone);
2737 // Update callers to record function version called.
2738 RecordCalleeFuncOfCallsite(Edge->Caller,
2739 FuncCloneAssignedToCurCallsiteClone);
2740 }
2741
2742 EI++;
2743 }
2744 }
2745 if (VerifyCCG) {
2746 checkNode<DerivedCCG, FuncTy, CallTy>(Node);
2747 for (const auto &PE : Node->CalleeEdges)
2748 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
2749 for (const auto &CE : Node->CallerEdges)
2750 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
2751 for (auto *Clone : Node->Clones) {
2752 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
2753 for (const auto &PE : Clone->CalleeEdges)
2754 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
2755 for (const auto &CE : Clone->CallerEdges)
2756 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
2757 }
2758 }
2759 }
2760 }
2761
2762 auto UpdateCalls = [&](ContextNode *Node,
2763 DenseSet<const ContextNode *> &Visited,
2764 auto &&UpdateCalls) {
2765 auto Inserted = Visited.insert(Node);
2766 if (!Inserted.second)
2767 return;
2768
2769 for (auto *Clone : Node->Clones)
2770 UpdateCalls(Clone, Visited, UpdateCalls);
2771
2772 for (auto &Edge : Node->CallerEdges)
2773 UpdateCalls(Edge->Caller, Visited, UpdateCalls);
2774
2775 // Skip if either no call to update, or if we ended up with no context ids
2776 // (we moved all edges onto other clones).
2777 if (!Node->hasCall() || Node->ContextIds.empty())
2778 return;
2779
2780 if (Node->IsAllocation) {
2781 updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes));
2782 return;
2783 }
2784
2785 if (!CallsiteToCalleeFuncCloneMap.count(Node))
2786 return;
2787
2788 auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
2789 updateCall(Node->Call, CalleeFunc);
2790 };
2791
2792 // Performs DFS traversal starting from allocation nodes to update calls to
2793 // reflect cloning decisions recorded earlier. For regular LTO this will
2794 // update the actual calls in the IR to call the appropriate function clone
2795 // (and add attributes to allocation calls), whereas for ThinLTO the decisions
2796 // are recorded in the summary entries.
2797  DenseSet<const ContextNode *> Visited;
2798  for (auto &Entry : AllocationCallToContextNodeMap)
2799 UpdateCalls(Entry.second, Visited, UpdateCalls);
2800
2801 return Changed;
2802}
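The UpdateCalls helper above is a generic lambda that receives a reference to itself as its final argument, since a lambda cannot otherwise name itself for recursion. A minimal standalone sketch of that idiom, using a hypothetical Node type rather than this file's ContextNode:

#include <set>
#include <vector>

struct Node {
  std::vector<Node *> Children; // stand-in for a node's Clones/CallerEdges
  bool Updated = false;
};

void updateAll(Node *Root) {
  std::set<const Node *> Visited;
  // The lambda takes a reference to itself as its last parameter so it can
  // recurse, mirroring the UpdateCalls(..., UpdateCalls) pattern above.
  auto Update = [](Node *N, std::set<const Node *> &Visited,
                   auto &&Update) -> void {
    if (!Visited.insert(N).second)
      return; // already visited
    for (Node *C : N->Children)
      Update(C, Visited, Update);
    N->Updated = true; // stand-in for updating the node's call
  };
  Update(Root, Visited, Update);
}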
2803
2804static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
2805    Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
2806    std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
2807        &FuncToAliasMap) {
2808  // The first "clone" is the original copy; we should only call this if we
2809  // needed to create new clones.
2810 assert(NumClones > 1);
2811  SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
2812  VMaps.reserve(NumClones - 1);
2813 FunctionsClonedThinBackend++;
2814 for (unsigned I = 1; I < NumClones; I++) {
2815 VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
2816 auto *NewF = CloneFunction(&F, *VMaps.back());
2817 FunctionClonesThinBackend++;
2818 // Strip memprof and callsite metadata from clone as they are no longer
2819 // needed.
2820 for (auto &BB : *NewF) {
2821 for (auto &Inst : BB) {
2822 Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
2823 Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
2824 }
2825 }
2826 std::string Name = getMemProfFuncName(F.getName(), I);
2827 auto *PrevF = M.getFunction(Name);
2828 if (PrevF) {
2829      // We might have created this when adjusting a callsite in another
2830      // function. It should be a declaration.
2831 assert(PrevF->isDeclaration());
2832 NewF->takeName(PrevF);
2833 PrevF->replaceAllUsesWith(NewF);
2834 PrevF->eraseFromParent();
2835 } else
2836 NewF->setName(Name);
2837 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
2838 << "created clone " << ore::NV("NewFunction", NewF));
2839
2840 // Now handle aliases to this function, and clone those as well.
2841 if (!FuncToAliasMap.count(&F))
2842 continue;
2843 for (auto *A : FuncToAliasMap[&F]) {
2844 std::string Name = getMemProfFuncName(A->getName(), I);
2845 auto *PrevA = M.getNamedAlias(Name);
2846 auto *NewA = GlobalAlias::create(A->getValueType(),
2847 A->getType()->getPointerAddressSpace(),
2848 A->getLinkage(), Name, NewF);
2849 NewA->copyAttributesFrom(A);
2850 if (PrevA) {
2851        // We might have created this when adjusting a callsite in another
2852        // function. It should be a declaration.
2853 assert(PrevA->isDeclaration());
2854 NewA->takeName(PrevA);
2855 PrevA->replaceAllUsesWith(NewA);
2856 PrevA->eraseFromParent();
2857 }
2858 }
2859 }
2860 return VMaps;
2861}
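Each entry in the returned VMaps maps values of the original function to their counterparts in one clone; applyImport below relies on exactly this to find the cloned call it needs to annotate. A hedged sketch of that lookup (it assumes this file's existing includes and using-directives; the helper name is made up):

// Clone number 0 is the original function; clone J >= 1 has its mapping in
// VMaps[J - 1], so the original call CB maps to its copy in clone J via the
// ValueToValueMapTy produced by CloneFunction.
static CallBase *getCallInClone(CallBase *CB,
                                ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
                                unsigned J) {
  if (J == 0)
    return CB;
  return cast<CallBase>((*VMaps[J - 1])[CB]);
}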
2862
2863// Locate the summary for F. This is complicated by the fact that it might
2864// have been internalized or promoted.
2865static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
2866                                      const ModuleSummaryIndex *ImportSummary) {
2867 // FIXME: Ideally we would retain the original GUID in some fashion on the
2868 // function (e.g. as metadata), but for now do our best to locate the
2869 // summary without that information.
2870 ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
2871 if (!TheFnVI)
2872    // See if the function was internalized, by checking the index directly
2873    // with the original name (this avoids the name adjustment done by
2874    // getGUID() for internal symbols).
2875 TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(F.getName()));
2876 if (TheFnVI)
2877 return TheFnVI;
2878 // Now query with the original name before any promotion was performed.
2879 StringRef OrigName =
2880      ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
2881  std::string OrigId = GlobalValue::getGlobalIdentifier(
2882 OrigName, GlobalValue::InternalLinkage, M.getSourceFileName());
2883 TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(OrigId));
2884 if (TheFnVI)
2885 return TheFnVI;
2886 // Could be a promoted local imported from another module. We need to pass
2887 // down more info here to find the original module id. For now, try with
2888 // the OrigName which might have been stored in the OidGuidMap in the
2889 // index. This would not work if there were same-named locals in multiple
2890 // modules, however.
2891 auto OrigGUID =
2892 ImportSummary->getGUIDFromOriginalID(GlobalValue::getGUID(OrigName));
2893 if (OrigGUID)
2894 TheFnVI = ImportSummary->getValueInfo(OrigGUID);
2895 return TheFnVI;
2896}
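The chain of fallbacks above exists because an internal symbol has more than one plausible GUID. A small illustrative sketch of the first two candidates (it assumes this file's existing includes; the helper is hypothetical):

// For a local-linkage function, getGUID() hashes the module-qualified
// identifier, while hashing the bare name gives the GUID the symbol had
// before internalization; findValueInfoForFunc queries the index with both.
static std::pair<GlobalValue::GUID, GlobalValue::GUID>
guidCandidates(const Function &F) {
  GlobalValue::GUID Qualified = F.getGUID();
  GlobalValue::GUID BareName = GlobalValue::getGUID(F.getName());
  return {Qualified, BareName};
}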
2897
2898bool MemProfContextDisambiguation::applyImport(Module &M) {
2899 assert(ImportSummary);
2900 bool Changed = false;
2901
2902 auto IsMemProfClone = [](const Function &F) {
2903 return F.getName().contains(MemProfCloneSuffix);
2904 };
2905
2906 // We also need to clone any aliases that reference cloned functions, because
2907 // the modified callsites may invoke via the alias. Keep track of the aliases
2908 // for each function.
2909 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
2910 FuncToAliasMap;
2911 for (auto &A : M.aliases()) {
2912 auto *Aliasee = A.getAliaseeObject();
2913 if (auto *F = dyn_cast<Function>(Aliasee))
2914 FuncToAliasMap[F].insert(&A);
2915 }
2916
2917 for (auto &F : M) {
2918 if (F.isDeclaration() || IsMemProfClone(F))
2919 continue;
2920
2921    OptimizationRemarkEmitter ORE(&F);
2922
2923    SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
2924 bool ClonesCreated = false;
2925 unsigned NumClonesCreated = 0;
2926 auto CloneFuncIfNeeded = [&](unsigned NumClones) {
2927 // We should at least have version 0 which is the original copy.
2928 assert(NumClones > 0);
2929 // If only one copy needed use original.
2930 if (NumClones == 1)
2931 return;
2932 // If we already performed cloning of this function, confirm that the
2933 // requested number of clones matches (the thin link should ensure the
2934 // number of clones for each constituent callsite is consistent within
2935 // each function), before returning.
2936 if (ClonesCreated) {
2937 assert(NumClonesCreated == NumClones);
2938 return;
2939 }
2940 VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap);
2941 // The first "clone" is the original copy, which doesn't have a VMap.
2942 assert(VMaps.size() == NumClones - 1);
2943 Changed = true;
2944 ClonesCreated = true;
2945 NumClonesCreated = NumClones;
2946 };
2947
2948 // Locate the summary for F.
2949 ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
2950 // If not found, this could be an imported local (see comment in
2951 // findValueInfoForFunc). Skip for now as it will be cloned in its original
2952 // module (where it would have been promoted to global scope so should
2953 // satisfy any reference in this module).
2954 if (!TheFnVI)
2955 continue;
2956
2957 auto *GVSummary =
2958 ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
2959 if (!GVSummary)
2960      // Must have been imported; use the first summary (there might be
2961      // multiple if this was a linkonce_odr).
2962 GVSummary = TheFnVI.getSummaryList().front().get();
2963
2964 // If this was an imported alias skip it as we won't have the function
2965 // summary, and it should be cloned in the original module.
2966 if (isa<AliasSummary>(GVSummary))
2967 continue;
2968
2969 auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
2970
2971 if (FS->allocs().empty() && FS->callsites().empty())
2972 continue;
2973
2974 auto SI = FS->callsites().begin();
2975 auto AI = FS->allocs().begin();
2976
2977 // Assume for now that the instructions are in the exact same order
2978 // as when the summary was created, but confirm this is correct by
2979 // matching the stack ids.
2980 for (auto &BB : F) {
2981 for (auto &I : BB) {
2982 auto *CB = dyn_cast<CallBase>(&I);
2983 // Same handling as when creating module summary.
2984 if (!mayHaveMemprofSummary(CB))
2985 continue;
2986
2987 auto *CalledValue = CB->getCalledOperand();
2988 auto *CalledFunction = CB->getCalledFunction();
2989 if (CalledValue && !CalledFunction) {
2990 CalledValue = CalledValue->stripPointerCasts();
2991 // Stripping pointer casts can reveal a called function.
2992 CalledFunction = dyn_cast<Function>(CalledValue);
2993 }
2994 // Check if this is an alias to a function. If so, get the
2995 // called aliasee for the checks below.
2996 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
2997 assert(!CalledFunction &&
2998 "Expected null called function in callsite for alias");
2999 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
3000 }
3001
3002        CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
3003            I.getMetadata(LLVMContext::MD_callsite));
3004 auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
3005
3006 // Include allocs that were already assigned a memprof function
3007 // attribute in the statistics.
3008 if (CB->getAttributes().hasFnAttr("memprof")) {
3009 assert(!MemProfMD);
3010 CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
3011 ? AllocTypeColdThinBackend++
3012 : AllocTypeNotColdThinBackend++;
3013 OrigAllocsThinBackend++;
3014 AllocVersionsThinBackend++;
3015 if (!MaxAllocVersionsThinBackend)
3016 MaxAllocVersionsThinBackend = 1;
3017            // Remove any remaining callsite metadata; we can skip the rest of
3018            // the handling for this instruction, since no cloning is needed.
3019 I.setMetadata(LLVMContext::MD_callsite, nullptr);
3020 continue;
3021 }
3022
3023 if (MemProfMD) {
3024 // Consult the next alloc node.
3025 assert(AI != FS->allocs().end());
3026 auto &AllocNode = *(AI++);
3027
3028 // Sanity check that the MIB stack ids match between the summary and
3029 // instruction metadata.
3030 auto MIBIter = AllocNode.MIBs.begin();
3031 for (auto &MDOp : MemProfMD->operands()) {
3032 assert(MIBIter != AllocNode.MIBs.end());
3033 LLVM_ATTRIBUTE_UNUSED auto StackIdIndexIter =
3034 MIBIter->StackIdIndices.begin();
3035 auto *MIBMD = cast<const MDNode>(MDOp);
3036 MDNode *StackMDNode = getMIBStackNode(MIBMD);
3037 assert(StackMDNode);
3038 SmallVector<unsigned> StackIdsFromMetadata;
3039 CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
3040 for (auto ContextIter =
3041 StackContext.beginAfterSharedPrefix(CallsiteContext);
3042 ContextIter != StackContext.end(); ++ContextIter) {
3043 // If this is a direct recursion, simply skip the duplicate
3044 // entries, to be consistent with how the summary ids were
3045 // generated during ModuleSummaryAnalysis.
3046 if (!StackIdsFromMetadata.empty() &&
3047 StackIdsFromMetadata.back() == *ContextIter)
3048 continue;
3049 assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
3050 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
3051 *ContextIter);
3052 StackIdIndexIter++;
3053 }
3054 MIBIter++;
3055 }
3056
3057 // Perform cloning if not yet done.
3058 CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size());
3059
3060 OrigAllocsThinBackend++;
3061 AllocVersionsThinBackend += AllocNode.Versions.size();
3062 if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
3063 MaxAllocVersionsThinBackend = AllocNode.Versions.size();
3064
3065            // If there is only one version, that means we didn't end up
3066            // considering this function for cloning, and in that case the alloc
3067            // will still be None type or should have gotten the default NotCold.
3068            // Skip it, but only after calling the clone helper, since that does
3069            // sanity checks confirming we haven't yet decided we need cloning.
3070 if (AllocNode.Versions.size() == 1) {
3071            assert((AllocationType)AllocNode.Versions[0] ==
3072                       AllocationType::NotCold ||
3073                   (AllocationType)AllocNode.Versions[0] ==
3074                       AllocationType::None);
3075 UnclonableAllocsThinBackend++;
3076 continue;
3077 }
3078
3079 // All versions should have a singular allocation type.
3080 assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
3081 return Type == ((uint8_t)AllocationType::NotCold |
3082 (uint8_t)AllocationType::Cold);
3083 }));
3084
3085 // Update the allocation types per the summary info.
3086 for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
3087 // Ignore any that didn't get an assigned allocation type.
3088 if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
3089 continue;
3090 AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
3091 AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
3092 : AllocTypeNotColdThinBackend++;
3093 std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
3094 auto A = llvm::Attribute::get(F.getContext(), "memprof",
3095 AllocTypeString);
3096 CallBase *CBClone;
3097 // Copy 0 is the original function.
3098 if (!J)
3099 CBClone = CB;
3100 else
3101 // Since VMaps are only created for new clones, we index with
3102 // clone J-1 (J==0 is the original clone and does not have a VMaps
3103 // entry).
3104 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
3105 CBClone->addFnAttr(A);
3106 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
3107 << ore::NV("AllocationCall", CBClone) << " in clone "
3108 << ore::NV("Caller", CBClone->getFunction())
3109 << " marked with memprof allocation attribute "
3110 << ore::NV("Attribute", AllocTypeString));
3111 }
3112 } else if (!CallsiteContext.empty()) {
3113 // Consult the next callsite node.
3114 assert(SI != FS->callsites().end());
3115 auto &StackNode = *(SI++);
3116
3117#ifndef NDEBUG
3118 // Sanity check that the stack ids match between the summary and
3119 // instruction metadata.
3120 auto StackIdIndexIter = StackNode.StackIdIndices.begin();
3121 for (auto StackId : CallsiteContext) {
3122 assert(StackIdIndexIter != StackNode.StackIdIndices.end());
3123 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
3124 StackId);
3125 StackIdIndexIter++;
3126 }
3127#endif
3128
3129 // Perform cloning if not yet done.
3130 CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());
3131
3132 // Should have skipped indirect calls via mayHaveMemprofSummary.
3133 assert(CalledFunction);
3134 assert(!IsMemProfClone(*CalledFunction));
3135
3136 // Update the calls per the summary info.
3137 // Save orig name since it gets updated in the first iteration
3138 // below.
3139 auto CalleeOrigName = CalledFunction->getName();
3140 for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
3141 // Do nothing if this version calls the original version of its
3142 // callee.
3143 if (!StackNode.Clones[J])
3144 continue;
3145 auto NewF = M.getOrInsertFunction(
3146 getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
3147 CalledFunction->getFunctionType());
3148 CallBase *CBClone;
3149 // Copy 0 is the original function.
3150 if (!J)
3151 CBClone = CB;
3152 else
3153 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
3154 CBClone->setCalledFunction(NewF);
3155 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
3156 << ore::NV("Call", CBClone) << " in clone "
3157 << ore::NV("Caller", CBClone->getFunction())
3158 << " assigned to call function clone "
3159 << ore::NV("Callee", NewF.getCallee()));
3160 }
3161 }
3162 // Memprof and callsite metadata on memory allocations no longer needed.
3163 I.setMetadata(LLVMContext::MD_memprof, nullptr);
3164 I.setMetadata(LLVMContext::MD_callsite, nullptr);
3165 }
3166 }
3167 }
3168
3169 return Changed;
3170}
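After applyImport, each allocation call (original or clone) carries a string function attribute named "memprof" holding its allocation type. A hedged sketch of how a later consumer might read it, using the same AttributeList accessors used above; note that only the "cold" spelling is compared against in this file, so the exact non-cold value is an assumption:

// Returns true if CB carries a memprof hint, setting IsCold accordingly.
static bool getMemProfHint(const CallBase *CB, bool &IsCold) {
  if (!CB->getAttributes().hasFnAttr("memprof"))
    return false;
  IsCold =
      CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold";
  return true;
}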
3171
3172template <typename DerivedCCG, typename FuncTy, typename CallTy>
3173bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
3174 if (DumpCCG) {
3175 dbgs() << "CCG before cloning:\n";
3176 dbgs() << *this;
3177 }
3178 if (ExportToDot)
3179 exportToDot("postbuild");
3180
3181 if (VerifyCCG) {
3182 check();
3183 }
3184
3185 identifyClones();
3186
3187 if (VerifyCCG) {
3188 check();
3189 }
3190
3191 if (DumpCCG) {
3192 dbgs() << "CCG after cloning:\n";
3193 dbgs() << *this;
3194 }
3195 if (ExportToDot)
3196 exportToDot("cloned");
3197
3198 bool Changed = assignFunctions();
3199
3200 if (DumpCCG) {
3201 dbgs() << "CCG after assigning function clones:\n";
3202 dbgs() << *this;
3203 }
3204 if (ExportToDot)
3205 exportToDot("clonefuncassign");
3206
3207 return Changed;
3208}
3209
3210bool MemProfContextDisambiguation::processModule(
3211    Module &M,
3212    llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
3213
3214 // If we have an import summary, then the cloning decisions were made during
3215 // the thin link on the index. Apply them and return.
3216 if (ImportSummary)
3217 return applyImport(M);
3218
3219 // TODO: If/when other types of memprof cloning are enabled beyond just for
3220 // hot and cold, we will need to change this to individually control the
3221 // AllocationType passed to addStackNodesForMIB during CCG construction.
3222 // Note that we specifically check this after applying imports above, so that
3223  // the option doesn't need to be passed to distributed ThinLTO backend
3224 // clang processes, which won't necessarily have visibility into the linker
3225 // dependences. Instead the information is communicated from the LTO link to
3226 // the backends via the combined summary index.
3227 if (!SupportsHotColdNew)
3228 return false;
3229
3230 ModuleCallsiteContextGraph CCG(M, OREGetter);
3231 return CCG.process();
3232}
3233
3234MemProfContextDisambiguation::MemProfContextDisambiguation(
3235    const ModuleSummaryIndex *Summary)
3236 : ImportSummary(Summary) {
3237 if (ImportSummary) {
3238 // The MemProfImportSummary should only be used for testing ThinLTO
3239 // distributed backend handling via opt, in which case we don't have a
3240 // summary from the pass pipeline.
3241    assert(MemProfImportSummary.empty());
3242    return;
3243 }
3244 if (MemProfImportSummary.empty())
3245 return;
3246
3247 auto ReadSummaryFile =
3248      errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
3249  if (!ReadSummaryFile) {
3250 logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
3251 "Error loading file '" + MemProfImportSummary +
3252 "': ");
3253 return;
3254 }
3255 auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
3256 if (!ImportSummaryForTestingOrErr) {
3257 logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
3258 "Error parsing file '" + MemProfImportSummary +
3259 "': ");
3260 return;
3261 }
3262 ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
3263 ImportSummary = ImportSummaryForTesting.get();
3264}
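Since the -memprof-import-summary option is intended for exercising the distributed ThinLTO backend handling directly from opt, a typical test invocation would look something like `opt -passes=memprof-context-disambiguation -memprof-import-summary=combined.thinlto.bc -S module.ll`; the pass name here is assumed to match this file's DEBUG_TYPE registration.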
3265
3266PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
3267                                                    ModuleAnalysisManager &AM) {
3268  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
3269 auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
3270    return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
3271  };
3272 if (!processModule(M, OREGetter))
3273 return PreservedAnalyses::all();
3274 return PreservedAnalyses::none();
3275}
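A hedged sketch of scheduling this pass with the new pass manager (it assumes the usual PassBuilder/ModulePassManager setup headers; ImportSummaryPtr is a placeholder for a combined index loaded by the backend):

ModulePassManager MPM;
// Regular LTO / in-process ThinLTO: cloning decisions are made on IR, so no
// import summary is supplied (the constructor defaults Summary to nullptr).
MPM.addPass(MemProfContextDisambiguation());
// Distributed ThinLTO backend: apply decisions recorded in the combined index.
// MPM.addPass(MemProfContextDisambiguation(ImportSummaryPtr));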
3276
3277void MemProfContextDisambiguation::run(
3278    ModuleSummaryIndex &Index,
3279    llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
3280        isPrevailing) {
3281 // TODO: If/when other types of memprof cloning are enabled beyond just for
3282 // hot and cold, we will need to change this to individually control the
3283 // AllocationType passed to addStackNodesForMIB during CCG construction.
3284 // The index was set from the option, so these should be in sync.
3285 assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
3286 if (!SupportsHotColdNew)
3287 return;
3288
3289 IndexCallsiteContextGraph CCG(Index, isPrevailing);
3290 CCG.process();
3291}
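For the index-based flavor, a hedged sketch of how the thin link might drive it (CombinedIndex and IsPrevailing stand in for the LTO driver's combined summary and prevailing-copy callback, which are not defined here):

// Sketch only: invoked once on the combined summary during the thin link.
MemProfContextDisambiguation ContextDisambiguation;
ContextDisambiguation.run(CombinedIndex, IsPrevailing);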