[llvm] 055e431 - [MemProf] Reduce cloning overhead by sharing nodes when possible (#99832)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 23 12:44:09 PDT 2024
Author: Teresa Johnson
Date: 2024-07-23T12:44:06-07:00
New Revision: 055e4319112282354327af9908091fdb25149e9b
URL: https://github.com/llvm/llvm-project/commit/055e4319112282354327af9908091fdb25149e9b
DIFF: https://github.com/llvm/llvm-project/commit/055e4319112282354327af9908091fdb25149e9b.diff
LOG: [MemProf] Reduce cloning overhead by sharing nodes when possible (#99832)
When assigning calls to nodes while building the graph, we can share
nodes between multiple calls in some cases. Specifically, when we
process the list of calls that had the same stack ids (possibly pruned,
since we only consider the stack ids that actually had nodes in the
graph, as determined by the pruned allocation MIBs), calls located in
the same function are known to behave exactly the same through cloning
and function assignment. Therefore, instead of creating nodes for all
of them (which requires context id duplication), keep a list of
additional "matching calls" on the node created for the first such
call. During function assignment we simply update all the matching
calls the same way as the primary call.
This change not only reduces the number of nodes (both original and
cloned), but also greatly reduces the number of duplicated context ids
and the time to propagate them.
For a large target, I measured a 25% peak memory reduction and 42% time
reduction.
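To make the mechanism easier to follow before reading the diff, here is a
minimal, hypothetical C++ sketch of the idea. Call, Node, and assignToClone
below are simplified stand-ins, not the actual CallInfo/ContextNode classes
in MemProfContextDisambiguation.cpp: the node keeps a primary call plus a
list of matching calls, and function assignment updates them all identically.

// A minimal, hypothetical sketch of the node-sharing idea (simplified
// stand-in types, not the real MemProfContextDisambiguation classes).
#include <cstdio>
#include <string>
#include <vector>

struct Call {
  std::string Name;     // identifies the call instruction
  unsigned CloneNo = 0; // which function clone it is assigned to
};

struct Node {
  Call Primary;               // the call this node was created for
  std::vector<Call> Matching; // calls that behave identically through cloning
};

// During function assignment, the primary call and all matching calls are
// redirected to the same clone, so the matching calls never needed their own
// nodes (or duplicated context ids).
void assignToClone(Node &N, unsigned CloneNo) {
  N.Primary.CloneNo = CloneNo;
  for (Call &C : N.Matching)
    C.CloneNo = CloneNo;
}

int main() {
  // Two calls in the same function with the same pruned stack ids: the second
  // is recorded as a matching call on the node created for the first.
  Node N;
  N.Primary = {"call1"};
  N.Matching.push_back({"call2"});
  assignToClone(N, /*CloneNo=*/1);
  std::printf("%s -> clone %u, %s -> clone %u\n", N.Primary.Name.c_str(),
              N.Primary.CloneNo, N.Matching[0].Name.c_str(),
              N.Matching[0].CloneNo);
  return 0;
}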
Added:
Modified:
llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 66bd786c85df5..64da3dfdd97c4 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -242,9 +242,16 @@ class CallsiteContextGraph {
// recursion.
bool Recursive = false;
- // The corresponding allocation or interior call.
+ // The corresponding allocation or interior call. This is the primary call
+ // for which we have created this node.
CallInfo Call;
+ // List of other calls that can be treated the same as the primary call
+ // through cloning. I.e. located in the same function and have the same
+ // (possibly pruned) stack ids. They will be updated the same way as the
+ // primary call when assigning to function clones.
+ std::vector<CallInfo> MatchingCalls;
+
// For alloc nodes this is a unique id assigned when constructed, and for
// callsite stack nodes it is the original stack id when the node is
// constructed from the memprof MIB metadata on the alloc nodes. Note that
@@ -457,6 +464,9 @@ class CallsiteContextGraph {
/// iteration.
MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
+ /// Records the function each call is located in.
+ DenseMap<CallInfo, const FuncTy *> CallToFunc;
+
/// Map from callsite node to the enclosing caller function.
std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
@@ -474,7 +484,8 @@ class CallsiteContextGraph {
/// StackIdToMatchingCalls map.
void assignStackNodesPostOrder(
ContextNode *Node, DenseSet<const ContextNode *> &Visited,
- DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls);
+ DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls,
+ DenseMap<CallInfo, CallInfo> &CallToMatchingCall);
/// Duplicates the given set of context ids, updating the provided
/// map from each original id with the newly generated context ids,
@@ -1230,10 +1241,11 @@ static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
- assignStackNodesPostOrder(ContextNode *Node,
- DenseSet<const ContextNode *> &Visited,
- DenseMap<uint64_t, std::vector<CallContextInfo>>
- &StackIdToMatchingCalls) {
+ assignStackNodesPostOrder(
+ ContextNode *Node, DenseSet<const ContextNode *> &Visited,
+ DenseMap<uint64_t, std::vector<CallContextInfo>>
+ &StackIdToMatchingCalls,
+ DenseMap<CallInfo, CallInfo> &CallToMatchingCall) {
auto Inserted = Visited.insert(Node);
if (!Inserted.second)
return;
@@ -1246,7 +1258,8 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
// Skip any that have been removed during the recursion.
if (!Edge)
continue;
- assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls);
+ assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls,
+ CallToMatchingCall);
}
// If this node's stack id is in the map, update the graph to contain new
@@ -1289,8 +1302,19 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
// Skip any for which we didn't assign any ids, these don't get a node in
// the graph.
- if (SavedContextIds.empty())
+ if (SavedContextIds.empty()) {
+ // If this call has a matching call (located in the same function and
+ // having the same stack ids), simply add it to the context node created
+ // for its matching call earlier. These can be treated the same through
+ // cloning and get updated at the same time.
+ if (!CallToMatchingCall.contains(Call))
+ continue;
+ auto MatchingCall = CallToMatchingCall[Call];
+ assert(NonAllocationCallToContextNodeMap.contains(MatchingCall));
+ NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back(
+ Call);
continue;
+ }
assert(LastId == Ids.back());
@@ -1422,6 +1446,10 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
// there is more than one call with the same stack ids. Their (possibly newly
// duplicated) context ids are saved in the StackIdToMatchingCalls map.
DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
+ // Save a map from each call to any that are found to match it. I.e. located
+ // in the same function and have the same (possibly pruned) stack ids. We use
+ // this to avoid creating extra graph nodes as they can be treated the same.
+ DenseMap<CallInfo, CallInfo> CallToMatchingCall;
for (auto &It : StackIdToMatchingCalls) {
auto &Calls = It.getSecond();
// Skip single calls with a single stack id. These don't need a new node.
@@ -1460,6 +1488,13 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
assert(!LastNodeContextIds.empty());
+ // Map from function to the first call from the below list (with matching
+ // stack ids) found in that function. Note that calls from different
+ // functions can have the same stack ids because this is the list of stack
+ // ids that had (possibly pruned) nodes after building the graph from the
+ // allocation MIBs.
+ DenseMap<const FuncTy *, CallInfo> FuncToCallMap;
+
for (unsigned I = 0; I < Calls.size(); I++) {
auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
assert(SavedContextIds.empty());
@@ -1533,6 +1568,18 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
continue;
}
+ const FuncTy *CallFunc = CallToFunc[Call];
+
+ // If the prior call had the same stack ids this map would not be empty.
+ // Check if we already have a call that "matches" because it is located
+ // in the same function.
+ if (FuncToCallMap.contains(CallFunc)) {
+ // Record the matching call found for this call, and skip it. We
+ // will subsequently combine it into the same node.
+ CallToMatchingCall[Call] = FuncToCallMap[CallFunc];
+ continue;
+ }
+
// Check if the next set of stack ids is the same (since the Calls vector
// of tuples is sorted by the stack ids we can just look at the next one).
bool DuplicateContextIds = false;
@@ -1562,7 +1609,14 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
set_subtract(LastNodeContextIds, StackSequenceContextIds);
if (LastNodeContextIds.empty())
break;
- }
+ // No longer possibly in a sequence of calls with duplicate stack ids,
+ // clear the map.
+ FuncToCallMap.clear();
+ } else
+ // Record the call with its function, so we can locate it the next time
+ // we find a call from this function when processing the calls with the
+ // same stack ids.
+ FuncToCallMap[CallFunc] = Call;
}
}
@@ -1579,7 +1633,8 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
// associated context ids over to the new nodes.
DenseSet<const ContextNode *> Visited;
for (auto &Entry : AllocationCallToContextNodeMap)
- assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls);
+ assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls,
+ CallToMatchingCall);
if (VerifyCCG)
check();
}
@@ -1679,6 +1734,7 @@ ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
continue;
if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
CallsWithMetadata.push_back(&I);
+ CallToFunc[&I] = &F;
auto *AllocNode = addAllocNode(&I, &F);
auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
assert(CallsiteMD);
@@ -1700,8 +1756,10 @@ ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
I.setMetadata(LLVMContext::MD_callsite, nullptr);
}
// For callsite metadata, add to list for this function for later use.
- else if (I.getMetadata(LLVMContext::MD_callsite))
+ else if (I.getMetadata(LLVMContext::MD_callsite)) {
CallsWithMetadata.push_back(&I);
+ CallToFunc[&I] = &F;
+ }
}
}
if (!CallsWithMetadata.empty())
@@ -1756,8 +1814,10 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph(
// correlate properly in applyImport in the backends.
if (AN.MIBs.empty())
continue;
- CallsWithMetadata.push_back({&AN});
- auto *AllocNode = addAllocNode({&AN}, FS);
+ IndexCall AllocCall(&AN);
+ CallsWithMetadata.push_back(AllocCall);
+ CallToFunc[AllocCall] = FS;
+ auto *AllocNode = addAllocNode(AllocCall, FS);
// Pass an empty CallStack to the CallsiteContext (second)
// parameter, since for ThinLTO we already collapsed out the inlined
// stack ids on the allocation call during ModuleSummaryAnalysis.
@@ -1788,8 +1848,11 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph(
}
// For callsite metadata, add to list for this function for later use.
if (!FS->callsites().empty())
- for (auto &SN : FS->mutableCallsites())
- CallsWithMetadata.push_back({&SN});
+ for (auto &SN : FS->mutableCallsites()) {
+ IndexCall StackNodeCall(&SN);
+ CallsWithMetadata.push_back(StackNodeCall);
+ CallToFunc[StackNodeCall] = FS;
+ }
if (!CallsWithMetadata.empty())
FuncToCallsWithMetadata[FS] = CallsWithMetadata;
@@ -2225,6 +2288,14 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
if (Recursive)
OS << " (recursive)";
OS << "\n";
+ if (!MatchingCalls.empty()) {
+ OS << "\tMatchingCalls:\n";
+ for (auto &MatchingCall : MatchingCalls) {
+ OS << "\t";
+ MatchingCall.print(OS);
+ OS << "\n";
+ }
+ }
OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
OS << "\tContextIds:";
// Make a copy of the computed context ids that we can sort for stability.
@@ -2478,6 +2549,7 @@ CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
std::make_unique<ContextNode>(Node->IsAllocation, Node->Call));
ContextNode *Clone = NodeOwner.back().get();
Node->addClone(Clone);
+ Clone->MatchingCalls = Node->MatchingCalls;
assert(NodeToCallingFunc.count(Node));
NodeToCallingFunc[Clone] = NodeToCallingFunc[Node];
moveEdgeToExistingCalleeClone(Edge, Clone, CallerEdgeI, /*NewClone=*/true,
@@ -3021,6 +3093,14 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
if (CallMap.count(Call))
CallClone = CallMap[Call];
CallsiteClone->setCall(CallClone);
+ // Need to do the same for all matching calls.
+ for (auto &MatchingCall : Node->MatchingCalls) {
+ CallInfo CallClone(MatchingCall);
+ if (CallMap.count(MatchingCall))
+ CallClone = CallMap[MatchingCall];
+ // Updates the call in the list.
+ MatchingCall = CallClone;
+ }
};
// Keep track of the clones of callsite Node that need to be assigned to
@@ -3187,6 +3267,16 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
CallInfo NewCall(CallMap[OrigCall]);
assert(NewCall);
NewClone->setCall(NewCall);
+ // Need to do the same for all matching calls.
+ for (auto &MatchingCall : NewClone->MatchingCalls) {
+ CallInfo OrigMatchingCall(MatchingCall);
+ OrigMatchingCall.setCloneNo(0);
+ assert(CallMap.count(OrigMatchingCall));
+ CallInfo NewCall(CallMap[OrigMatchingCall]);
+ assert(NewCall);
+ // Updates the call in the list.
+ MatchingCall = NewCall;
+ }
}
}
// Fall through to handling below to perform the recording of the
@@ -3373,6 +3463,7 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
if (Node->IsAllocation) {
updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes));
+ assert(Node->MatchingCalls.empty());
return;
}
@@ -3381,6 +3472,9 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
updateCall(Node->Call, CalleeFunc);
+ // Update all the matching calls as well.
+ for (auto &Call : Node->MatchingCalls)
+ updateCall(Call, CalleeFunc);
};
// Performs DFS traversal starting from allocation nodes to update calls to