[llvm] Reapply "[MemProf] Streamline and avoid unnecessary context id duplication (#107918)" (PR #110036)

via llvm-commits llvm-commits at lists.llvm.org
Wed Sep 25 13:01:23 PDT 2024


llvmbot wrote:



@llvm/pr-subscribers-llvm-transforms

Author: Teresa Johnson (teresajohnson)

Changes:

- **Reapply "[MemProf] Streamline and avoid unnecessary context id duplication (#107918)" (#108652)**
- **The actual fixes and a test case that provokes the original issue.**


---
Full diff: https://github.com/llvm/llvm-project/pull/110036.diff


2 Files Affected:

- (modified) llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp (+76-33) 
- (added) llvm/test/Transforms/MemProfContextDisambiguation/inlined4.ll (+102) 


``````````diff
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 576a31f8b86ae0..aa5e021cb7e4c1 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -1379,7 +1379,15 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
   DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
   assert(!LastNodeContextIds.empty());
 
+#ifndef NDEBUG
+  bool PrevIterCreatedNode = false;
+  bool CreatedNode = false;
+  for (unsigned I = 0; I < Calls.size();
+       I++, PrevIterCreatedNode = CreatedNode) {
+    CreatedNode = false;
+#else
   for (unsigned I = 0; I < Calls.size(); I++) {
+#endif
     auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
     // Skip any for which we didn't assign any ids, these don't get a node in
     // the graph.
@@ -1391,7 +1399,13 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
       if (!CallToMatchingCall.contains(Call))
         continue;
       auto MatchingCall = CallToMatchingCall[Call];
-      assert(NonAllocationCallToContextNodeMap.contains(MatchingCall));
+      if (!NonAllocationCallToContextNodeMap.contains(MatchingCall)) {
+        // This should only happen if a prior iteration failed to create a
+        // node because the recomputation of context ids below found none
+        // remaining and continued early.
+        assert(I > 0 && !PrevIterCreatedNode);
+        continue;
+      }
       NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back(
           Call);
       continue;
@@ -1444,6 +1458,9 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
     ContextNode *NewNode = NodeOwner.back().get();
     NodeToCallingFunc[NewNode] = Func;
     NonAllocationCallToContextNodeMap[Call] = NewNode;
+#ifndef NDEBUG
+    CreatedNode = true;
+#endif
     NewNode->AllocTypes = computeAllocType(SavedContextIds);
 
     ContextNode *FirstNode = getNodeForStackId(Ids[0]);
@@ -1548,13 +1565,23 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
     // of length, and within each length, lexicographically by stack id. The
     // latter is so that we can specially handle calls that have identical stack
     // id sequences (either due to cloning or artificially because of the MIB
-    // context pruning).
-    std::stable_sort(Calls.begin(), Calls.end(),
-                     [](const CallContextInfo &A, const CallContextInfo &B) {
-                       return A.StackIds.size() > B.StackIds.size() ||
-                              (A.StackIds.size() == B.StackIds.size() &&
-                               A.StackIds < B.StackIds);
-                     });
+    // context pruning). Those with the same Ids are then sorted by function to
+    // facilitate efficiently mapping them to the same context node.
+    // Because the functions are pointers, to make the ordering deterministic we
+    // first assign each function pointer to its first index in the Calls array,
+    // and then sort by that index.
+    DenseMap<const FuncTy *, unsigned> FuncToIndex;
+    for (const auto &[Idx, CallCtxInfo] : enumerate(Calls))
+      FuncToIndex.insert({CallCtxInfo.Func, Idx});
+    std::stable_sort(
+        Calls.begin(), Calls.end(),
+        [&FuncToIndex](const CallContextInfo &A, const CallContextInfo &B) {
+          return A.StackIds.size() > B.StackIds.size() ||
+                 (A.StackIds.size() == B.StackIds.size() &&
+                  (A.StackIds < B.StackIds ||
+                   (A.StackIds == B.StackIds &&
+                    FuncToIndex[A.Func] < FuncToIndex[B.Func])));
+        });
 
     // Find the node for the last stack id, which should be the same
     // across all calls recorded for this id, and is the id for this
@@ -1572,18 +1599,26 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
     DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
     assert(!LastNodeContextIds.empty());
 
-    // Map from function to the first call from the below list (with matching
-    // stack ids) found in that function. Note that calls from different
-    // functions can have the same stack ids because this is the list of stack
-    // ids that had (possibly pruned) nodes after building the graph from the
-    // allocation MIBs.
-    DenseMap<const FuncTy *, CallInfo> FuncToCallMap;
+#ifndef NDEBUG
+    // Save the set of functions seen for a particular sequence of stack
+    // ids. This is used to ensure that they have been correctly sorted to be
+    // adjacent in the Calls list, since we rely on that to efficiently place
+    // all such matching calls onto the same context node.
+    DenseSet<const FuncTy *> MatchingIdsFuncSet;
+#endif
 
     for (unsigned I = 0; I < Calls.size(); I++) {
       auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
       assert(SavedContextIds.empty());
       assert(LastId == Ids.back());
 
+#ifndef NDEBUG
+      // If this call has a different set of ids than the last one, clear the
+      // set used to ensure they are sorted properly.
+      if (I > 0 && Ids != Calls[I - 1].StackIds)
+        MatchingIdsFuncSet.clear();
+#endif
+
       // First compute the context ids for this stack id sequence (the
       // intersection of the context ids of the corresponding nodes).
       // Start with the remaining saved ids for the last node.
@@ -1652,23 +1687,38 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
           continue;
       }
 
-      // If the prior call had the same stack ids this map would not be empty.
+#ifndef NDEBUG
+      // If the prior call had the same stack ids this set would not be empty.
       // Check if we already have a call that "matches" because it is located
-      // in the same function.
-      if (FuncToCallMap.contains(Func)) {
-        // Record the matching call found for this call, and skip it. We
-        // will subsequently combine it into the same node.
-        CallToMatchingCall[Call] = FuncToCallMap[Func];
-        continue;
-      }
+      // in the same function. If the Calls list was sorted properly we should
+      // not encounter this situation as all such entries should be adjacent
+      // and processed in bulk further below.
+      assert(!MatchingIdsFuncSet.contains(Func));
+
+      MatchingIdsFuncSet.insert(Func);
+#endif
 
       // Check if the next set of stack ids is the same (since the Calls vector
       // of tuples is sorted by the stack ids we can just look at the next one).
+      // If so, save them in the CallToMatchingCall map so that they get
+      // assigned to the same context node, and skip them.
       bool DuplicateContextIds = false;
-      if (I + 1 < Calls.size()) {
-        auto &CallCtxInfo = Calls[I + 1];
+      for (unsigned J = I + 1; J < Calls.size(); J++) {
+        auto &CallCtxInfo = Calls[J];
         auto &NextIds = CallCtxInfo.StackIds;
-        DuplicateContextIds = Ids == NextIds;
+        if (NextIds != Ids)
+          break;
+        auto *NextFunc = CallCtxInfo.Func;
+        if (NextFunc != Func) {
+          // We have another call with the same ids but in a different
+          // function; it cannot share this node, so duplicate ids for it.
+          DuplicateContextIds = true;
+          break;
+        }
+        auto &NextCall = CallCtxInfo.Call;
+        CallToMatchingCall[NextCall] = Call;
+        // Update I so that it gets incremented correctly to skip this call.
+        I = J;
       }
 
       // If we don't have duplicate context ids, then we can assign all the
@@ -1692,14 +1742,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
         set_subtract(LastNodeContextIds, StackSequenceContextIds);
         if (LastNodeContextIds.empty())
           break;
-        // No longer possibly in a sequence of calls with duplicate stack ids,
-        // clear the map.
-        FuncToCallMap.clear();
-      } else
-        // Record the call with its function, so we can locate it the next time
-        // we find a call from this function when processing the calls with the
-        // same stack ids.
-        FuncToCallMap[Func] = Call;
+      }
     }
   }
 
diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/inlined4.ll b/llvm/test/Transforms/MemProfContextDisambiguation/inlined4.ll
new file mode 100644
index 00000000000000..29071652c66c18
--- /dev/null
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/inlined4.ll
@@ -0,0 +1,102 @@
+;; This test ensures that the logic which assigns calls to stack nodes
+;; correctly handles a case where multiple nodes have stack ids that
+;; overlap with each other but have different last nodes (can happen with
+;; inlining into various levels of a call chain). Specifically, when we
+;; have one that is duplicated (e.g. by unrolling), we need to correctly
+;; handle the case where the context id has already been assigned to
+;; a different call with a different last node.
+
+;; -stats requires asserts
+; REQUIRES: asserts
+
+; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
+; RUN:	-memprof-verify-ccg -memprof-verify-nodes \
+; RUN:  -stats -pass-remarks=memprof-context-disambiguation \
+; RUN:	%s -S 2>&1 | FileCheck %s --check-prefix=IR \
+; RUN:  --check-prefix=STATS --check-prefix=REMARKS
+
+; REMARKS: created clone _Z1Ab.memprof.1
+; REMARKS: created clone _Z3XZNv.memprof.1
+; REMARKS: call in clone main assigned to call function clone _Z3XZNv.memprof.1
+;; Make sure the inlined context in _Z3XZNv, which partially overlaps the stack
+;; ids in the shorter inlined context of _Z2XZv, correctly calls a cloned
+;; version of _Z1Ab, which will call the cold-annotated allocation.
+; REMARKS: call in clone _Z3XZNv.memprof.1 assigned to call function clone _Z1Ab.memprof.1
+; REMARKS: call in clone _Z1Ab.memprof.1 marked with memprof allocation attribute cold
+; REMARKS: call in clone main assigned to call function clone _Z3XZNv
+; REMARKS: call in clone _Z3XZNv assigned to call function clone _Z1Ab
+; REMARKS: call in clone _Z1Ab marked with memprof allocation attribute notcold
+
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define dso_local void @_Z1Ab() {
+entry:
+  %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #1, !memprof !0, !callsite !5
+  ret void
+}
+
+; Function Attrs: nobuiltin
+declare ptr @_Znam(i64) #0
+
+;; Inlining of stack id 2 into 3. Assume this is called from somewhere else.
+define dso_local void @_Z2XZv() local_unnamed_addr #0 {
+entry:
+  ;; Simulate duplication of the callsite (e.g. by unrolling).
+  tail call void @_Z1Ab(), !callsite !6
+  tail call void @_Z1Ab(), !callsite !6
+  ret void
+}
+
+;; Inlining of stack id 2 into 3 into 4. Called by main below.
+define dso_local void @_Z3XZNv() local_unnamed_addr {
+entry:
+  tail call void @_Z1Ab(), !callsite !7
+  ret void
+}
+
+define dso_local noundef i32 @main() local_unnamed_addr {
+entry:
+  tail call void @_Z3XZNv(), !callsite !8 ;; Not cold context
+  tail call void @_Z3XZNv(), !callsite !9 ;; Cold context
+  ret i32 0
+}
+
+attributes #0 = { nobuiltin }
+attributes #1 = { builtin }
+
+!0 = !{!1, !3}
+;; Not cold context via first call to _Z3XZNv in main
+!1 = !{!2, !"notcold"}
+!2 = !{i64 1, i64 2, i64 3, i64 4, i64 5}
+;; Cold context via second call to _Z3XZNv in main
+!3 = !{!4, !"cold"}
+!4 = !{i64 1, i64 2, i64 3, i64 4, i64 6}
+!5 = !{i64 1}
+!6 = !{i64 2, i64 3}
+!7 = !{i64 2, i64 3, i64 4}
+!8 = !{i64 5}
+!9 = !{i64 6}
+
+; IR: define {{.*}} @_Z1Ab()
+; IR:   call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]]
+; IR: define {{.*}} @_Z2XZv()
+; IR:   call {{.*}} @_Z1Ab()
+; IR:   call {{.*}} @_Z1Ab()
+; IR: define {{.*}} @_Z3XZNv()
+; IR:   call {{.*}} @_Z1Ab()
+; IR: define {{.*}} @main()
+; IR:   call {{.*}} @_Z3XZNv()
+; IR:   call {{.*}} @_Z3XZNv.memprof.1()
+; IR: define {{.*}} @_Z1Ab.memprof.1()
+; IR:   call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]]
+; IR: define {{.*}} @_Z3XZNv.memprof.1()
+; IR:   call {{.*}} @_Z1Ab.memprof.1()
+
+; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" }
+; IR: attributes #[[COLD]] = { "memprof"="cold" }
+
+; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 2 memprof-context-disambiguation - Number of function clones created during whole program analysis

``````````
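
Two aspects of the patch may be easier to see outside the diff. First, the new sort: `std::stable_sort` keeps the relative order of equal elements, but if the final tie-breaker compared raw function pointers, the resulting order would vary from run to run. The patch instead keys the tie-breaker on each function's first index in the `Calls` array. Below is a minimal, self-contained C++ sketch of that pattern, not the actual LLVM code: `Func` and `CallContextInfo` are simplified stand-ins for the real types, and `std::unordered_map` stands in for LLVM's `DenseMap`.

```cpp
#include <algorithm>
#include <cstdint>
#include <unordered_map>
#include <vector>

struct Func {}; // stand-in for the real FuncTy

struct CallContextInfo {
  std::vector<uint64_t> StackIds; // stack ids for this call
  const Func *F;                  // function containing the call
};

void sortCalls(std::vector<CallContextInfo> &Calls) {
  // Map each function pointer to its first index in Calls so the final
  // tie-breaker is a deterministic integer, not a pointer value.
  std::unordered_map<const Func *, unsigned> FuncToIndex;
  for (unsigned I = 0; I < Calls.size(); ++I)
    FuncToIndex.emplace(Calls[I].F, I); // emplace keeps the first insertion
  std::stable_sort(
      Calls.begin(), Calls.end(),
      [&FuncToIndex](const CallContextInfo &A, const CallContextInfo &B) {
        // Longest stack id sequence first; equal lengths ordered
        // lexicographically; identical sequences grouped by function.
        return A.StackIds.size() > B.StackIds.size() ||
               (A.StackIds.size() == B.StackIds.size() &&
                (A.StackIds < B.StackIds ||
                 (A.StackIds == B.StackIds &&
                  FuncToIndex[A.F] < FuncToIndex[B.F])));
      });
}
```

Mapping pointers to their first-occurrence indices is a common way to make pointer-keyed orderings reproducible without changing the underlying containers.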
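
Second, the forward scan that replaces the old `FuncToCallMap`: since the sort now places identical stack id sequences adjacent to each other, grouped by function, a single pass can map every same-function duplicate onto the first call via `CallToMatchingCall`, and can detect when a different function shares the same sequence, which forces duplication of the context ids. The following is a simplified, hypothetical sketch using the same kind of stand-in types as above (repeated so the snippet is self-contained), with `std::map` in place of LLVM's `DenseMap`.

```cpp
#include <cstddef>
#include <cstdint>
#include <map>
#include <vector>

struct Func {};
struct Call {};

struct CallContextInfo {
  const Call *C;                  // the call being matched
  std::vector<uint64_t> StackIds; // its stack id sequence
  const Func *F;                  // function containing the call
};

// Assumes Calls was sorted as in the previous sketch, so identical
// StackIds sequences are adjacent and grouped by function.
void matchDuplicates(
    std::vector<CallContextInfo> &Calls,
    std::map<const Call *, const Call *> &CallToMatchingCall) {
  for (size_t I = 0; I < Calls.size(); ++I) {
    const Call *C = Calls[I].C;
    const std::vector<uint64_t> &Ids = Calls[I].StackIds;
    const Func *F = Calls[I].F;
    bool DuplicateContextIds = false;
    // Scan forward over the run of entries with the same stack ids.
    for (size_t J = I + 1; J < Calls.size(); ++J) {
      const CallContextInfo &Next = Calls[J];
      if (Next.StackIds != Ids)
        break; // end of the run of identical sequences
      if (Next.F != F) {
        // Same ids but a different function: it cannot share this
        // call's node, so its context ids must be duplicated.
        DuplicateContextIds = true;
        break;
      }
      // Same function, same ids: map this call onto the first one so
      // both land on the same context node, and advance I past it.
      CallToMatchingCall[Next.C] = C;
      I = J;
    }
    // The real pass uses this flag to decide whether the computed
    // context ids can be assigned directly or must be duplicated.
    (void)DuplicateContextIds;
  }
}
```

In the actual pass, `DuplicateContextIds` then determines whether `SavedContextIds` receives the computed set directly or a duplicated copy; that bookkeeping is elided here.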



https://github.com/llvm/llvm-project/pull/110036

