[llvm] 9483ff9 - Reapply "[MemProf] Streamline and avoid unnecessary context id duplication (#107918)" (#110036)
Author: Teresa Johnson
Date: 2024-09-26T13:41:56-07:00
New Revision: 9483ff9f09e5c3d2c4b01fbb8272d0d5c7bcc042
URL: https://github.com/llvm/llvm-project/commit/9483ff9f09e5c3d2c4b01fbb8272d0d5c7bcc042
DIFF: https://github.com/llvm/llvm-project/commit/9483ff9f09e5c3d2c4b01fbb8272d0d5c7bcc042.diff
LOG: Reapply "[MemProf] Streamline and avoid unnecessary context id duplication (#107918)" (#110036)
This reverts commit 12d4769cb84b2b2e60f9776fa043c6ea16f08ebb, reapplying
524a028f69cdf25503912c396ebda7ebf0065ed2 but with fixes for failures
seen in broader testing.
Added:
llvm/test/Transforms/MemProfContextDisambiguation/inlined4.ll
Modified:
llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 576a31f8b86ae0..27049d547f6e32 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -1377,9 +1377,12 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
// Compute the last node's context ids once, as it is shared by all calls in
// this entry.
DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
- assert(!LastNodeContextIds.empty());
- for (unsigned I = 0; I < Calls.size(); I++) {
+ bool PrevIterCreatedNode = false;
+ bool CreatedNode = false;
+ for (unsigned I = 0; I < Calls.size();
+ I++, PrevIterCreatedNode = CreatedNode) {
+ CreatedNode = false;
auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
// Skip any for which we didn't assign any ids, these don't get a node in
// the graph.
@@ -1391,7 +1394,13 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
if (!CallToMatchingCall.contains(Call))
continue;
auto MatchingCall = CallToMatchingCall[Call];
- assert(NonAllocationCallToContextNodeMap.contains(MatchingCall));
+ if (!NonAllocationCallToContextNodeMap.contains(MatchingCall)) {
+ // This should only happen if we had a prior iteration, and it didn't
+ // create a node because of the below recomputation of context ids
+ // finding none remaining and continuing early.
+ assert(I > 0 && !PrevIterCreatedNode);
+ continue;
+ }
NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back(
Call);
continue;
@@ -1444,6 +1453,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
ContextNode *NewNode = NodeOwner.back().get();
NodeToCallingFunc[NewNode] = Func;
NonAllocationCallToContextNodeMap[Call] = NewNode;
+ CreatedNode = true;
NewNode->AllocTypes = computeAllocType(SavedContextIds);
ContextNode *FirstNode = getNodeForStackId(Ids[0]);
@@ -1548,13 +1558,23 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
// of length, and within each length, lexicographically by stack id. The
// latter is so that we can specially handle calls that have identical stack
// id sequences (either due to cloning or artificially because of the MIB
- // context pruning).
- std::stable_sort(Calls.begin(), Calls.end(),
- [](const CallContextInfo &A, const CallContextInfo &B) {
- return A.StackIds.size() > B.StackIds.size() ||
- (A.StackIds.size() == B.StackIds.size() &&
- A.StackIds < B.StackIds);
- });
+ // context pruning). Those with the same Ids are then sorted by function to
+ // facilitate efficiently mapping them to the same context node.
+ // Because the functions are pointers, to ensure a stable sort first assign
+ // each function pointer to its first index in the Calls array, and then use
+ // that to sort by.
+ DenseMap<const FuncTy *, unsigned> FuncToIndex;
+ for (const auto &[Idx, CallCtxInfo] : enumerate(Calls))
+ FuncToIndex.insert({CallCtxInfo.Func, Idx});
+ std::stable_sort(
+ Calls.begin(), Calls.end(),
+ [&FuncToIndex](const CallContextInfo &A, const CallContextInfo &B) {
+ return A.StackIds.size() > B.StackIds.size() ||
+ (A.StackIds.size() == B.StackIds.size() &&
+ (A.StackIds < B.StackIds ||
+ (A.StackIds == B.StackIds &&
+ FuncToIndex[A.Func] < FuncToIndex[B.Func])));
+ });
// Find the node for the last stack id, which should be the same
// across all calls recorded for this id, and is the id for this
@@ -1572,18 +1592,26 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
assert(!LastNodeContextIds.empty());
- // Map from function to the first call from the below list (with matching
- // stack ids) found in that function. Note that calls from different
- // functions can have the same stack ids because this is the list of stack
- // ids that had (possibly pruned) nodes after building the graph from the
- // allocation MIBs.
- DenseMap<const FuncTy *, CallInfo> FuncToCallMap;
+#ifndef NDEBUG
+ // Save the set of functions seen for a particular set of the same stack
+ // ids. This is used to ensure that they have been correctly sorted to be
+ // adjacent in the Calls list, since we rely on that to efficiently place
+ // all such matching calls onto the same context node.
+ DenseSet<const FuncTy *> MatchingIdsFuncSet;
+#endif
for (unsigned I = 0; I < Calls.size(); I++) {
auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
assert(SavedContextIds.empty());
assert(LastId == Ids.back());
+#ifndef NDEBUG
+ // If this call has a different set of ids than the last one, clear the
+ // set used to ensure they are sorted properly.
+ if (I > 0 && Ids != Calls[I - 1].StackIds)
+ MatchingIdsFuncSet.clear();
+#endif
+
// First compute the context ids for this stack id sequence (the
// intersection of the context ids of the corresponding nodes).
// Start with the remaining saved ids for the last node.
@@ -1652,23 +1680,38 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
continue;
}
- // If the prior call had the same stack ids this map would not be empty.
+#ifndef NDEBUG
+ // If the prior call had the same stack ids this set would not be empty.
// Check if we already have a call that "matches" because it is located
- // in the same function.
- if (FuncToCallMap.contains(Func)) {
- // Record the matching call found for this call, and skip it. We
- // will subsequently combine it into the same node.
- CallToMatchingCall[Call] = FuncToCallMap[Func];
- continue;
- }
+ // in the same function. If the Calls list was sorted properly we should
+ // not encounter this situation as all such entries should be adjacent
+ // and processed in bulk further below.
+ assert(!MatchingIdsFuncSet.contains(Func));
+
+ MatchingIdsFuncSet.insert(Func);
+#endif
// Check if the next set of stack ids is the same (since the Calls vector
// of tuples is sorted by the stack ids we can just look at the next one).
+ // If so, save them in the CallToMatchingCall map so that they get
+ // assigned to the same context node, and skip them.
bool DuplicateContextIds = false;
- if (I + 1 < Calls.size()) {
- auto &CallCtxInfo = Calls[I + 1];
+ for (unsigned J = I + 1; J < Calls.size(); J++) {
+ auto &CallCtxInfo = Calls[J];
auto &NextIds = CallCtxInfo.StackIds;
- DuplicateContextIds = Ids == NextIds;
+ if (NextIds != Ids)
+ break;
+ auto *NextFunc = CallCtxInfo.Func;
+ if (NextFunc != Func) {
+ // We have another Call with the same ids but that cannot share this
+ // node, must duplicate ids for it.
+ DuplicateContextIds = true;
+ break;
+ }
+ auto &NextCall = CallCtxInfo.Call;
+ CallToMatchingCall[NextCall] = Call;
+ // Update I so that it gets incremented correctly to skip this call.
+ I = J;
}
// If we don't have duplicate context ids, then we can assign all the
@@ -1692,14 +1735,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
set_subtract(LastNodeContextIds, StackSequenceContextIds);
if (LastNodeContextIds.empty())
break;
- // No longer possibly in a sequence of calls with duplicate stack ids,
- // clear the map.
- FuncToCallMap.clear();
- } else
- // Record the call with its function, so we can locate it the next time
- // we find a call from this function when processing the calls with the
- // same stack ids.
- FuncToCallMap[Func] = Call;
+ }
}
}
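A note for readers following the sorting change above: the key trick is to tie-break entries that have identical stack id sequences by the first index at which each function pointer appears in the Calls array. That keeps the order deterministic across runs (raw pointer comparisons are not) and makes all calls with the same ids and same function adjacent, which the bulk-matching loop later in updateStackNodes() relies on. Below is a minimal standalone sketch of that ordering pattern, using hypothetical stand-in types rather than the patch's actual CallContextInfo/FuncTy:

// Sketch only: stand-ins for the patch's CallContextInfo / FuncTy.
#include <algorithm>
#include <cstdint>
#include <unordered_map>
#include <vector>

struct Func {};
struct CallInfo {
  std::vector<uint64_t> StackIds; // primary sort key
  const Func *F = nullptr;        // pointer value must not influence the order
};

void sortCalls(std::vector<CallInfo> &Calls) {
  // Map each function pointer to the first index where it occurs; insert()
  // leaves existing entries alone, so later occurrences keep the first index.
  std::unordered_map<const Func *, size_t> FuncToIndex;
  for (size_t I = 0; I < Calls.size(); ++I)
    FuncToIndex.insert({Calls[I].F, I});
  std::stable_sort(Calls.begin(), Calls.end(),
                   [&](const CallInfo &A, const CallInfo &B) {
                     if (A.StackIds.size() != B.StackIds.size())
                       return A.StackIds.size() > B.StackIds.size();
                     if (A.StackIds != B.StackIds)
                       return A.StackIds < B.StackIds;
                     // Same ids: group by function, ordered by first-seen
                     // index so the result is independent of pointer values.
                     return FuncToIndex.at(A.F) < FuncToIndex.at(B.F);
                   });
}
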
diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/inlined4.ll b/llvm/test/Transforms/MemProfContextDisambiguation/inlined4.ll
new file mode 100644
index 00000000000000..bf419ea987bd0a
--- /dev/null
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/inlined4.ll
@@ -0,0 +1,102 @@
+;; This test ensures that the logic which assigns calls to stack nodes
+;; correctly handles a case where multiple nodes have stack ids that
+;; overlap with each other but have different last nodes (can happen with
+;; inlining into various levels of a call chain). Specifically, when we
+;; have one that is duplicated (e.g. unrolling), we need to correctly
+;; handle the case where the context id has already been assigned to
+;; a different call with a different last node.
+
+;; -stats requires asserts
+; REQUIRES: asserts
+
+; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
+; RUN: -memprof-verify-ccg -memprof-verify-nodes \
+; RUN: -stats -pass-remarks=memprof-context-disambiguation \
+; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=IR \
+; RUN: --check-prefix=STATS --check-prefix=REMARKS
+
+; REMARKS: created clone _Z1Ab.memprof.1
+; REMARKS: created clone _Z3XZNv.memprof.1
+; REMARKS: call in clone main assigned to call function clone _Z3XZNv.memprof.1
+;; Make sure the inlined context in _Z3XZNv, which partially overlaps the stack
+;; ids in the shorter inlined context of Z2XZv, correctly calls a cloned
+;; version of Z1Ab, which will call the cold annotated allocation.
+; REMARKS: call in clone _Z3XZNv.memprof.1 assigned to call function clone _Z1Ab.memprof.1
+; REMARKS: call in clone _Z1Ab.memprof.1 marked with memprof allocation attribute cold
+; REMARKS: call in clone main assigned to call function clone _Z3XZNv
+; REMARKS: call in clone _Z3XZNv assigned to call function clone _Z1Ab
+; REMARKS: call in clone _Z1Ab marked with memprof allocation attribute notcold
+
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define dso_local void @_Z1Ab() {
+entry:
+ %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #1, !memprof !0, !callsite !5
+ ret void
+}
+
+; Function Attrs: nobuiltin
+declare ptr @_Znam(i64) #0
+
+;; Inlining of stack id 2 into 3. Assume this is called from somewhere else.
+define dso_local void @_Z2XZv() local_unnamed_addr #0 {
+entry:
+ ;; Simulate duplication of the callsite (e.g. unrolling).
+ call void @_Z1Ab(), !callsite !6
+ call void @_Z1Ab(), !callsite !6
+ ret void
+}
+
+;; Inlining of stack id 2 into 3 into 4. Called by main below.
+define dso_local void @_Z3XZNv() local_unnamed_addr {
+entry:
+ call void @_Z1Ab(), !callsite !7
+ ret void
+}
+
+define dso_local noundef i32 @main() local_unnamed_addr {
+entry:
+ call void @_Z3XZNv(), !callsite !8 ;; Not cold context
+ call void @_Z3XZNv(), !callsite !9 ;; Cold context
+ ret i32 0
+}
+
+attributes #0 = { nobuiltin }
+attributes #7 = { builtin }
+
+!0 = !{!1, !3}
+;; Not cold context via first call to _Z3XZNv in main
+!1 = !{!2, !"notcold"}
+!2 = !{i64 1, i64 2, i64 3, i64 4, i64 5}
+;; Cold context via second call to _Z3XZNv in main
+!3 = !{!4, !"cold"}
+!4 = !{i64 1, i64 2, i64 3, i64 4, i64 6}
+!5 = !{i64 1}
+!6 = !{i64 2, i64 3}
+!7 = !{i64 2, i64 3, i64 4}
+!8 = !{i64 5}
+!9 = !{i64 6}
+
+; IR: define {{.*}} @_Z1Ab()
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]]
+; IR: define {{.*}} @_Z2XZv()
+; IR: call {{.*}} @_Z1Ab()
+; IR: call {{.*}} @_Z1Ab()
+; IR: define {{.*}} @_Z3XZNv()
+; IR: call {{.*}} @_Z1Ab()
+; IR: define {{.*}} @main()
+; IR: call {{.*}} @_Z3XZNv()
+; IR: call {{.*}} @_Z3XZNv.memprof.1()
+; IR: define {{.*}} @_Z1Ab.memprof.1()
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]]
+; IR: define {{.*}} @_Z3XZNv.memprof.1()
+; IR: call {{.*}} @_Z1Ab.memprof.1()
+
+; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" }
+; IR: attributes #[[COLD]] = { "memprof"="cold" }
+
+; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 2 memprof-context-disambiguation - Number of function clones created during whole program analysis
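
The scan-ahead loop added in updateStackNodes() is the part the new test exercises: after sorting, all calls with the same stack id sequence are adjacent, so later calls from the same function (e.g. the duplicated callsite in _Z2XZv above) can be recorded in CallToMatchingCall against a single representative, while a call from a different function signals that the context ids must be duplicated. A rough sketch of that grouping pattern, with hypothetical simplified types rather than the patch's data structures:

// Sketch only: hypothetical simplified types, not the patch's structures.
#include <cstdint>
#include <map>
#include <vector>

struct Func {};
struct Entry {
  std::vector<uint64_t> StackIds; // already sorted as in the patch
  const Func *F = nullptr;
  int Call = 0;                   // stand-in for the call handle
};

// Records later calls that should share the representative's context node,
// and flags runs where another function will need duplicated context ids.
void groupMatchingCalls(const std::vector<Entry> &Calls,
                        std::map<int, int> &CallToMatchingCall,
                        std::vector<bool> &NeedsDuplication) {
  NeedsDuplication.assign(Calls.size(), false);
  for (size_t I = 0; I < Calls.size(); ++I) {
    const size_t Rep = I; // representative for this run of identical ids
    for (size_t J = I + 1; J < Calls.size(); ++J) {
      if (Calls[J].StackIds != Calls[Rep].StackIds)
        break; // the run of identical stack id sequences has ended
      if (Calls[J].F != Calls[Rep].F) {
        // Another function uses the same ids; it cannot share this node,
        // so its context ids will have to be duplicated.
        NeedsDuplication[Rep] = true;
        break;
      }
      // Same function, same ids: map onto the representative and consume it.
      CallToMatchingCall[Calls[J].Call] = Calls[Rep].Call;
      I = J; // the outer ++I then skips the consumed entries
    }
  }
}

The outer index is advanced to the last consumed entry so the representative's node handling runs only once per run of identical ids.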