[llvm] [MemProf] Re-enable cloning of callsites in recursive cycles with fixes (PR #125947)

Teresa Johnson via llvm-commits llvm-commits at lists.llvm.org
Wed Feb 5 14:20:16 PST 2025


https://github.com/teresajohnson created https://github.com/llvm/llvm-project/pull/125947

This change addresses a number of issues with the support added by
PR121985, exposed through more exhaustive testing: specifically, places
where the graph was not being updated correctly in the presence of
cycles.

A new test case is added that reproduces these issues, and the default
is flipped back to enabling this handling.
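
For context, the kind of source pattern this handling targets looks
roughly like the sketch below (illustrative only, not taken from the
patch or its tests; all names are made up): an allocation reached
through a recursive callsite, where the allocation is cold from one
root caller and not cold from another.

// Illustrative sketch only; none of these names appear in the patch.
static char *Sink;

// The allocation context passes through this callsite one or more
// times, which creates a cycle in the callsite context graph.
char *Alloc(int Depth) {
  if (Depth > 0)
    return Alloc(Depth - 1); // direct recursive callsite
  return new char[1 << 20];
}

void ColdRoot() {
  // Written once and never touched again: profiled as cold.
  Sink = Alloc(3);
  Sink[0] = 0;
}

void HotRoot() {
  // Accessed heavily: profiled as not cold.
  Sink = Alloc(3);
  for (int I = 0; I < (1 << 20); ++I)
    Sink[I] = static_cast<char>(I);
}

To give the two roots different allocation hints, Alloc and its
recursive callsite must be cloned even though the callsite is part of a
recursive cycle, which is what -memprof-allow-recursive-callsites
enables.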


From 2f03b9b6b48b2c29860d2a736ac779ffbaff10e8 Mon Sep 17 00:00:00 2001
From: Teresa Johnson <tejohnson at google.com>
Date: Wed, 5 Feb 2025 13:52:48 -0800
Subject: [PATCH] [MemProf] Re-enable cloning of callsites in recursive cycles
 with fixes

This change addresses a number of issues with the support added by
PR121985, exposed through more exhaustive testing: specifically, places
where the graph was not being updated correctly in the presence of
cycles.

A new test case is added that reproduces these issues, and the default
is flipped back to enabling this handling.
---
 .../IPO/MemProfContextDisambiguation.cpp      | 176 +++++++++---
 .../test/ThinLTO/X86/memprof-icp-recursive.ll | 260 ++++++++++++++++++
 llvm/test/ThinLTO/X86/memprof-recursive.ll    |   7 +-
 3 files changed, 394 insertions(+), 49 deletions(-)
 create mode 100644 llvm/test/ThinLTO/X86/memprof-icp-recursive.ll

diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index b78805b3fa62a5..1bd7c10c57989b 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -124,7 +124,7 @@ static cl::opt<unsigned>
 
 // Optionally enable cloning of callsites involved with recursive cycles
 static cl::opt<bool> AllowRecursiveCallsites(
-    "memprof-allow-recursive-callsites", cl::init(false), cl::Hidden,
+    "memprof-allow-recursive-callsites", cl::init(true), cl::Hidden,
     cl::desc("Allow cloning of callsites involved in recursive cycles"));
 
 // When disabled, try to detect and prevent cloning of recursive contexts.
@@ -301,12 +301,14 @@ class CallsiteContextGraph {
       // callers (i.e. if this is the leaf allocation node).
       if (!CalleeEdges.empty())
         return &CalleeEdges;
-      if (!CallerEdges.empty()) {
-        // A node with caller edges but no callee edges must be the allocation
-        // node.
-        assert(IsAllocation);
+      // Typically if the callee edges are empty either the caller edges are
+      // also empty, or this is an allocation (leaf node). However, if we are
+      // allowing recursive callsites and contexts this will be violated for
+      // incompletely cloned recursive cycles.
+      assert(CallerEdges.empty() || IsAllocation ||
+             (AllowRecursiveCallsites && AllowRecursiveContexts));
+      if (!CallerEdges.empty() && IsAllocation)
         return &CallerEdges;
-      }
       return nullptr;
     }
 
@@ -403,8 +405,13 @@ class CallsiteContextGraph {
     // True if this node was effectively removed from the graph, in which case
     // it should have an allocation type of None and empty context ids.
     bool isRemoved() const {
-      assert((AllocTypes == (uint8_t)AllocationType::None) ==
-             emptyContextIds());
+      // Typically the allocation type is None if and only if the context ids
+      // are empty. However, if we are allowing recursive callsites and
+      // contexts this will be violated for incompletely cloned recursive
+      // cycles.
+      assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
+             (AllocTypes == (uint8_t)AllocationType::None) ==
+                 emptyContextIds());
       return AllocTypes == (uint8_t)AllocationType::None;
     }
 
@@ -1344,16 +1351,48 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
     DenseSet<uint32_t> RemainingContextIds) {
   auto &OrigEdges =
       TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
+  DenseSet<uint32_t> RecursiveContextIds;
+  DenseSet<uint32_t> AllCallerContextIds;
+  if (AllowRecursiveCallsites) {
+    // Identify which context ids are recursive which is needed to properly
+    // update the RemainingContextIds set. The relevant recursive context ids
+    // are those that are in multiple edges.
+    for (auto &CE : OrigEdges) {
+      AllCallerContextIds.reserve(CE->getContextIds().size());
+      for (auto Id : CE->getContextIds())
+        if (!AllCallerContextIds.insert(Id).second)
+          RecursiveContextIds.insert(Id);
+    }
+  }
   // Increment iterator in loop so that we can remove edges as needed.
   for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
     auto Edge = *EI;
+    DenseSet<uint32_t> NewEdgeContextIds;
+    DenseSet<uint32_t> NotFoundContextIds;
     // Remove any matching context ids from Edge, return set that were found and
     // removed, these are the new edge's context ids. Also update the remaining
     // (not found ids).
-    DenseSet<uint32_t> NewEdgeContextIds, NotFoundContextIds;
     set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
                  NotFoundContextIds);
-    RemainingContextIds.swap(NotFoundContextIds);
+    // Update the remaining context ids set for the later edges. This is a
+    // compile time optimization.
+    if (RecursiveContextIds.empty()) {
+      // No recursive ids, so all of the previously remaining context ids that
+      // were not seen on this edge are the new remaining set.
+      RemainingContextIds.swap(NotFoundContextIds);
+    } else {
+      // Keep the recursive ids in the remaining set as we expect to see those
+      // on another edge. We can remove the non-recursive remaining ids that
+      // were seen on this edge, however. We already have the set of remaining
+      // ids that were on this edge (in NewEdgeContextIds). Figure out which are
+      // non-recursive and only remove those. Note that despite the higher
+      // overhead of updating the remaining context ids set when recursion
+      // handling is enabled, it was found to be at worst performance neutral
+      // and in one case a clear win.
+      DenseSet<uint32_t> NonRecursiveRemainingCurEdgeIds =
+          set_difference(NewEdgeContextIds, RecursiveContextIds);
+      set_subtract(RemainingContextIds, NonRecursiveRemainingCurEdgeIds);
+    }
     // If no matching context ids for this edge, skip it.
     if (NewEdgeContextIds.empty()) {
       ++EI;
@@ -1410,9 +1449,9 @@ static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
       set_union(CallerEdgeContextIds, Edge->ContextIds);
     }
     // Node can have more context ids than callers if some contexts terminate at
-    // node and some are longer. If we are allowing recursive callsites but
-    // haven't disabled recursive contexts, this will be violated for
-    // incompletely cloned recursive cycles, so skip the checking in that case.
+    // node and some are longer. If we are allowing recursive callsites and
+    // contexts this will be violated for incompletely cloned recursive cycles,
+    // so skip the checking in that case.
     assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
            NodeContextIds == CallerEdgeContextIds ||
            set_is_subset(CallerEdgeContextIds, NodeContextIds));
@@ -1425,7 +1464,11 @@ static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
         checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
       set_union(CalleeEdgeContextIds, Edge->getContextIds());
     }
-    assert(NodeContextIds == CalleeEdgeContextIds);
+    // If we are allowing recursive callsites and contexts this will be violated
+    // for incompletely cloned recursive cycles, so skip the checking in that
+    // case.
+    assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
+           NodeContextIds == CalleeEdgeContextIds);
   }
   // FIXME: Since this checking is only invoked under an option, we should
   // change the error checking from using assert to something that will trigger
@@ -3134,6 +3177,12 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
   // over to the corresponding edge into the clone (which is created here if
   // this is a newly created clone).
   for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
+    ContextNode *CalleeToUse = OldCalleeEdge->Callee;
+    // If this is a direct recursion edge, use NewCallee (the clone) as the
+    // callee as well, so that any edge updated/created here is also direct
+    // recursive.
+    if (CalleeToUse == OldCallee)
+      CalleeToUse = NewCallee;
     // The context ids moving to the new callee are the subset of this edge's
     // context ids and the context ids on the caller edge being moved.
     DenseSet<uint32_t> EdgeContextIdsToMove =
@@ -3147,8 +3196,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
       // clone, specifically during function assignment, where we would have
       // removed none type edges after creating the clone. If we can't find
       // a corresponding edge there, fall through to the cloning below.
-      if (auto *NewCalleeEdge =
-              NewCallee->findEdgeFromCallee(OldCalleeEdge->Callee)) {
+      if (auto *NewCalleeEdge = NewCallee->findEdgeFromCallee(CalleeToUse)) {
         NewCalleeEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),
                                               EdgeContextIdsToMove.end());
         NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
@@ -3156,8 +3204,8 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
       }
     }
     auto NewEdge = std::make_shared<ContextEdge>(
-        OldCalleeEdge->Callee, NewCallee,
-        computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove);
+        CalleeToUse, NewCallee, computeAllocType(EdgeContextIdsToMove),
+        EdgeContextIdsToMove);
     NewCallee->CalleeEdges.push_back(NewEdge);
     NewEdge->Callee->CallerEdges.push_back(NewEdge);
   }
@@ -3183,13 +3231,20 @@ template <typename DerivedCCG, typename FuncTy, typename CallTy>
 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
     moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
                               ContextNode *NewCaller) {
+  auto *OldCallee = Edge->Callee;
+  auto *NewCallee = OldCallee;
+  // If this edge was direct recursive, make any new/updated edge also direct
+  // recursive to NewCaller.
+  bool Recursive = Edge->Caller == Edge->Callee;
+  if (Recursive)
+    NewCallee = NewCaller;
 
   ContextNode *OldCaller = Edge->Caller;
   OldCaller->eraseCalleeEdge(Edge.get());
 
   // We might already have an edge to the new caller. If one exists we will
   // reuse it.
-  auto ExistingEdgeToNewCaller = NewCaller->findEdgeFromCallee(Edge->Callee);
+  auto ExistingEdgeToNewCaller = NewCaller->findEdgeFromCallee(NewCallee);
 
   if (ExistingEdgeToNewCaller) {
     // Since we already have an edge to NewCaller, simply move the ids
@@ -3199,11 +3254,19 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
     ExistingEdgeToNewCaller->AllocTypes |= Edge->AllocTypes;
     Edge->ContextIds.clear();
     Edge->AllocTypes = (uint8_t)AllocationType::None;
-    Edge->Callee->eraseCallerEdge(Edge.get());
+    OldCallee->eraseCallerEdge(Edge.get());
   } else {
     // Otherwise just reconnect Edge to NewCaller.
     Edge->Caller = NewCaller;
     NewCaller->CalleeEdges.push_back(Edge);
+    if (Recursive) {
+      assert(NewCallee == NewCaller);
+      // In the case of (direct) recursive edges, we update the callee as well
+      // so that it becomes recursive on the new caller.
+      Edge->Callee = NewCallee;
+      NewCallee->CallerEdges.push_back(Edge);
+      OldCallee->eraseCallerEdge(Edge.get());
+    }
     // Don't need to update Edge's context ids since we are simply
     // reconnecting it.
   }
@@ -3217,32 +3280,50 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
 #ifndef NDEBUG
   bool IsNewNode = NewCaller->CallerEdges.empty();
 #endif
-  for (auto &OldCallerEdge : OldCaller->CallerEdges) {
-    // The context ids moving to the new caller are the subset of this edge's
-    // context ids and the context ids on the callee edge being moved.
-    DenseSet<uint32_t> EdgeContextIdsToMove =
-        set_intersection(OldCallerEdge->getContextIds(), Edge->getContextIds());
-    set_subtract(OldCallerEdge->getContextIds(), EdgeContextIdsToMove);
-    OldCallerEdge->AllocTypes =
-        computeAllocType(OldCallerEdge->getContextIds());
-    // In this function we expect that any pre-existing node already has edges
-    // from the same callers as the old node. That should be true in the current
-    // use case, where we will remove None-type edges after copying over all
-    // caller edges from the callee.
-    auto *ExistingCallerEdge =
-        NewCaller->findEdgeFromCaller(OldCallerEdge->Caller);
-    assert(IsNewNode || ExistingCallerEdge);
-    if (ExistingCallerEdge) {
-      ExistingCallerEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),
-                                                 EdgeContextIdsToMove.end());
-      ExistingCallerEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
-      continue;
+  // If we just moved a direct recursive edge, presumably its context ids should
+  // also flow out of OldCaller via some other non-recursive callee edge. We
+  // don't want to remove the recursive context ids from other caller edges yet,
+  // otherwise the context ids get into an inconsistent state on OldCaller.
+  // We will update these context ids on the non-recursive caller edge when and
+  // if they are updated on the non-recursive callee.
+  if (!Recursive) {
+    for (auto &OldCallerEdge : OldCaller->CallerEdges) {
+      auto OldCallerCaller = OldCallerEdge->Caller;
+      // The context ids moving to the new caller are the subset of this edge's
+      // context ids and the context ids on the callee edge being moved.
+      DenseSet<uint32_t> EdgeContextIdsToMove = set_intersection(
+          OldCallerEdge->getContextIds(), Edge->getContextIds());
+      if (OldCaller == OldCallerCaller) {
+        OldCallerCaller = NewCaller;
+        // Don't actually move this one. The caller will move it directly via a
+        // call to this function with this as the Edge if it is appropriate to
+        // move to a diff node that has a matching callee (itself).
+        continue;
+      }
+      set_subtract(OldCallerEdge->getContextIds(), EdgeContextIdsToMove);
+      OldCallerEdge->AllocTypes =
+          computeAllocType(OldCallerEdge->getContextIds());
+      // In this function we expect that any pre-existing node already has edges
+      // from the same callers as the old node. That should be true in the
+      // current use case, where we will remove None-type edges after copying
+      // over all caller edges from the callee.
+      auto *ExistingCallerEdge = NewCaller->findEdgeFromCaller(OldCallerCaller);
+      // Since we would have skipped caller edges when moving a direct recursive
+      // edge, this may not hold true when recursive handling is enabled.
+      assert(IsNewNode || ExistingCallerEdge || AllowRecursiveCallsites);
+      if (ExistingCallerEdge) {
+        ExistingCallerEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),
+                                                   EdgeContextIdsToMove.end());
+        ExistingCallerEdge->AllocTypes |=
+            computeAllocType(EdgeContextIdsToMove);
+        continue;
+      }
+      auto NewEdge = std::make_shared<ContextEdge>(
+          NewCaller, OldCallerCaller, computeAllocType(EdgeContextIdsToMove),
+          EdgeContextIdsToMove);
+      NewCaller->CallerEdges.push_back(NewEdge);
+      NewEdge->Caller->CalleeEdges.push_back(NewEdge);
     }
-    auto NewEdge = std::make_shared<ContextEdge>(
-        NewCaller, OldCallerEdge->Caller,
-        computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove);
-    NewCaller->CallerEdges.push_back(NewEdge);
-    NewEdge->Caller->CalleeEdges.push_back(NewEdge);
   }
   // Recompute the node alloc type now that its caller edges have been
   // updated (since we will compute from those edges).
@@ -3946,6 +4027,11 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
         // handling and makes it less error-prone.
         auto CloneCallerEdges = Clone->CallerEdges;
         for (auto &Edge : CloneCallerEdges) {
+          // Skip edges that have been removed (this can happen when a direct
+          // recursive edge is updated while moving callee edges and is then
+          // removed by the call to removeNoneTypeCalleeEdges on the Clone).
+          if (Edge->isRemoved())
+            continue;
           // Ignore any caller that does not have a recorded callsite Call.
           if (!Edge->Caller->hasCall())
             continue;
diff --git a/llvm/test/ThinLTO/X86/memprof-icp-recursive.ll b/llvm/test/ThinLTO/X86/memprof-icp-recursive.ll
new file mode 100644
index 00000000000000..f8dcd80d4e1414
--- /dev/null
+++ b/llvm/test/ThinLTO/X86/memprof-icp-recursive.ll
@@ -0,0 +1,260 @@
+;; This test case is the same as memprof-icp.ll but manually modified to
+;; add recursive cycles into the allocation contexts, and it has
+;; -memprof-allow-recursive-callsites enabled to test that handling.
+;; The call contexts are somewhat nonsensical, but designed to provoke
+;; several scenarios that were seen in large applications and caused
+;; failures until they were addressed. In particular, converting mutual
+;; recursion to direct recursion when stack nodes are matched onto inlined
+;; callsites; edge partitioning due to multiple calls with the same debug
+;; info (here because it is an indirect call) requiring updating/moving
+;; of recursive contexts; and the necessary updates of recursive contexts
+;; and edges during cloning.
+
+;; The checks are however not modified from the original memprof-icp.ll test:
+;; when basic recursive callsite handling is enabled (which currently doesn't
+;; clone through recursive cycles but is designed to at least allow cloning
+;; across them for the the non-recursive call sequence of the context) we
+;; should successfully get the same cloning as without any recursion.
+
+;; -stats requires asserts
+; REQUIRES: asserts
+
+; RUN: split-file %s %t
+
+;; For now explicitly turn on this handling, which is off by default.
+; RUN: opt -thinlto-bc %t/main.ll -enable-memprof-indirect-call-support=true >%t/main.o
+; RUN: opt -thinlto-bc %t/foo.ll -enable-memprof-indirect-call-support=true >%t/foo.o
+
+;; First perform in-process ThinLTO
+; RUN: llvm-lto2 run %t/main.o %t/foo.o -enable-memprof-context-disambiguation \
+; RUN:	-enable-memprof-indirect-call-support=true \
+; RUN:  -memprof-allow-recursive-callsites \
+; RUN:  -supports-hot-cold-new \
+; RUN:  -r=%t/foo.o,_Z3fooR2B0j,plx \
+; RUN:  -r=%t/foo.o,_ZN2B03barEj, \
+; RUN:  -r=%t/foo.o,_ZN1B3barEj, \
+; RUN:  -r=%t/main.o,_Z3fooR2B0j, \
+; RUN:  -r=%t/main.o,_Znwm, \
+; RUN:  -r=%t/main.o,_ZdlPvm, \
+; RUN:  -r=%t/main.o,_Z8externalPi, \
+; RUN:  -r=%t/main.o,main,plx \
+; RUN:  -r=%t/main.o,_ZN2B03barEj,plx \
+; RUN:  -r=%t/main.o,_ZN1B3barEj,plx \
+; RUN:  -r=%t/main.o,_ZTV1B,plx \
+; RUN:  -r=%t/main.o,_ZTVN10__cxxabiv120__si_class_type_infoE,plx \
+; RUN:  -r=%t/main.o,_ZTS1B,plx \
+; RUN:  -r=%t/main.o,_ZTVN10__cxxabiv117__class_type_infoE,plx \
+; RUN:  -r=%t/main.o,_ZTS2B0,plx \
+; RUN:  -r=%t/main.o,_ZTI2B0,plx \
+; RUN:  -r=%t/main.o,_ZTI1B,plx \
+; RUN:  -r=%t/main.o,_ZTV2B0,plx \
+; RUN:	-thinlto-threads=1 \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes -stats \
+; RUN:  -pass-remarks=. -save-temps \
+; RUN:  -o %t.out 2>&1 | FileCheck %s --check-prefix=STATS \
+; RUN:  --check-prefix=REMARKS
+
+; RUN: llvm-dis %t.out.2.4.opt.bc -o - | FileCheck %s --check-prefix=IR
+
+; REMARKS: call in clone main assigned to call function clone _Z3fooR2B0j.memprof.1
+; REMARKS: call in clone main assigned to call function clone _Z3fooR2B0j.memprof.1
+; REMARKS: created clone _ZN2B03barEj.memprof.1
+; REMARKS: call in clone _ZN2B03barEj marked with memprof allocation attribute notcold
+; REMARKS: call in clone _ZN2B03barEj.memprof.1 marked with memprof allocation attribute cold
+; REMARKS: call in clone _ZN2B03barEj marked with memprof allocation attribute notcold
+; REMARKS: call in clone _ZN2B03barEj.memprof.1 marked with memprof allocation attribute cold
+; REMARKS: created clone _ZN1B3barEj.memprof.1
+; REMARKS: call in clone _ZN1B3barEj marked with memprof allocation attribute notcold
+; REMARKS: call in clone _ZN1B3barEj.memprof.1 marked with memprof allocation attribute cold
+; REMARKS: call in clone _ZN1B3barEj marked with memprof allocation attribute notcold
+; REMARKS: call in clone _ZN1B3barEj.memprof.1 marked with memprof allocation attribute cold
+; REMARKS: created clone _Z3fooR2B0j.memprof.1
+;; In each version of foo we should have promoted the indirect call to two conditional
+;; direct calls, one to B::bar and one to B0::bar. The cloned version of foo should call
+;; the cloned versions of bar for both promotions.
+; REMARKS: Promote indirect call to _ZN1B3barEj with count 2 out of 4
+; REMARKS: call in clone _Z3fooR2B0j promoted and assigned to call function clone _ZN1B3barEj
+; REMARKS: Promote indirect call to _ZN1B3barEj with count 2 out of 4
+; REMARKS: call in clone _Z3fooR2B0j.memprof.1 promoted and assigned to call function clone _ZN1B3barEj.memprof.1
+; REMARKS: Promote indirect call to _ZN2B03barEj with count 2 out of 2
+; REMARKS: call in clone _Z3fooR2B0j promoted and assigned to call function clone _ZN2B03barEj
+; REMARKS: Promote indirect call to _ZN2B03barEj with count 2 out of 2
+; REMARKS: call in clone _Z3fooR2B0j.memprof.1 promoted and assigned to call function clone _ZN2B03barEj.memprof.1
+; REMARKS: created clone _ZN2B03barEj.memprof.1
+; REMARKS: call in clone _ZN2B03barEj marked with memprof allocation attribute notcold
+; REMARKS: call in clone _ZN2B03barEj.memprof.1 marked with memprof allocation attribute cold
+; REMARKS: created clone _ZN1B3barEj.memprof.1
+; REMARKS: call in clone _ZN1B3barEj marked with memprof allocation attribute notcold
+; REMARKS: call in clone _ZN1B3barEj.memprof.1 marked with memprof allocation attribute cold
+
+; STATS: 4 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during whole program analysis
+; STATS: 8 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend
+; STATS: 4 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during whole program analysis
+; STATS: 8 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend
+; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis
+; STATS: 5 memprof-context-disambiguation - Number of function clones created during ThinLTO backend
+
+; IR: define {{.*}} @_Z3fooR2B0j(
+; IR:   %[[R1:[0-9]+]] = icmp eq ptr %0, @_ZN1B3barEj
+; IR:   br i1 %[[R1]], label %if.true.direct_targ, label %if.false.orig_indirect
+; IR: if.true.direct_targ:
+; IR:   call {{.*}} @_Znwm(i64 noundef 4) #[[NOTCOLD:[0-9]+]]
+; IR: if.false.orig_indirect:
+; IR:   %[[R2:[0-9]+]] = icmp eq ptr %0, @_ZN2B03barEj
+; IR:   br i1 %[[R2]], label %if.true.direct_targ1, label %if.false.orig_indirect2
+; IR: if.true.direct_targ1:
+; IR:   call {{.*}} @_Znwm(i64 noundef 4) #[[NOTCOLD]]
+; IR: if.false.orig_indirect2:
+; IR:   call {{.*}} %0
+
+; IR: define {{.*}} @_Z3fooR2B0j.memprof.1(
+;; We should still compare against the original versions of bar since that is
+;; what is in the vtable. However, we should have called the cloned versions
+;; that perform cold allocations, which were subsequently inlined.
+; IR:   %[[R3:[0-9]+]] = icmp eq ptr %0, @_ZN1B3barEj
+; IR:   br i1 %[[R3]], label %if.true.direct_targ, label %if.false.orig_indirect
+; IR: if.true.direct_targ:
+; IR:   call {{.*}} @_Znwm(i64 noundef 4) #[[COLD:[0-9]+]]
+; IR: if.false.orig_indirect:
+; IR:   %[[R4:[0-9]+]] = icmp eq ptr %0, @_ZN2B03barEj
+; IR:   br i1 %[[R4]], label %if.true.direct_targ1, label %if.false.orig_indirect2
+; IR: if.true.direct_targ1:
+; IR:   call {{.*}} @_Znwm(i64 noundef 4) #[[COLD]]
+; IR: if.false.orig_indirect2:
+; IR:   call {{.*}} %0
+
+; IR: attributes #[[NOTCOLD]] = {{.*}} "memprof"="notcold"
+; IR: attributes #[[COLD]] = {{.*}} "memprof"="cold"
+
+;--- foo.ll
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare i32 @_ZN2B03barEj(ptr %this, i32 %s)
+declare i32 @_ZN1B3barEj(ptr %this, i32 %s)
+
+define i32 @_Z3fooR2B0j(ptr %b) {
+entry:
+  %0 = load ptr, ptr %b, align 8
+  %call = tail call i32 %0(ptr null, i32 0), !prof !0, !callsite !1
+  ret i32 0
+}
+
+!0 = !{!"VP", i32 0, i64 4, i64 4445083295448962937, i64 2, i64 -2718743882639408571, i64 2}
+!1 = !{i64 -2101080423462424381, i64 3456}
+
+;--- main.ll
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@_ZTV1B = external constant { [3 x ptr] }
+@_ZTVN10__cxxabiv120__si_class_type_infoE = external global [0 x ptr]
+@_ZTS1B = external constant [3 x i8]
+@_ZTVN10__cxxabiv117__class_type_infoE = external global [0 x ptr]
+@_ZTS2B0 = external constant [4 x i8]
+@_ZTI2B0 = external constant { ptr, ptr }
+@_ZTI1B = external constant { ptr, ptr, ptr }
+@_ZTV2B0 = external constant { [3 x ptr] }
+
+define i32 @main() !prof !29 {
+entry:
+  %call2 = call i32 @_Z3fooR2B0j(ptr null, i32 0), !callsite !30
+  %call4 = call i32 @_Z3fooR2B0j(ptr null, i32 0), !callsite !31
+  %call6 = call i32 @_Z3fooR2B0j(ptr null, i32 0), !callsite !32
+  %call8 = call i32 @_Z3fooR2B0j(ptr null, i32 0), !callsite !57
+  ret i32 0
+}
+
+declare i32 @_Z3fooR2B0j(ptr, i32)
+
+define i32 @_ZN2B03barEj(ptr %this, i32 %s) {
+entry:
+  %call = tail call ptr @_Znwm(i64 noundef 4) #0, !memprof !33, !callsite !38
+  ;; Second allocation in this function, to ensure that indirect edges to the
+  ;; same callee are partitioned correctly.
+  %call2 = tail call ptr @_Znwm(i64 noundef 4) #0, !memprof !45, !callsite !50
+  store volatile i32 0, ptr %call, align 4
+  ret i32 0
+}
+
+declare ptr @_Znwm(i64)
+
+declare void @_Z8externalPi()
+
+declare void @_ZdlPvm()
+
+define i32 @_ZN1B3barEj(ptr %this, i32 %s) {
+entry:
+  %call = tail call ptr @_Znwm(i64 noundef 4) #0, !memprof !39, !callsite !44
+  ;; Second allocation in this function, to ensure that indirect edges to the
+  ;; same callee are partitioned correctly.
+  %call2 = tail call ptr @_Znwm(i64 noundef 4) #0, !memprof !51, !callsite !56
+  store volatile i32 0, ptr %call, align 4
+  ret i32 0
+}
+
+; uselistorder directives
+uselistorder ptr @_Z3fooR2B0j, { 3, 2, 1, 0 }
+
+attributes #0 = { builtin allocsize(0) }
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9, !10, !11}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 13}
+!4 = !{!"MaxCount", i64 4}
+!5 = !{!"MaxInternalCount", i64 0}
+!6 = !{!"MaxFunctionCount", i64 4}
+!7 = !{!"NumCounts", i64 5}
+!8 = !{!"NumFunctions", i64 5}
+!9 = !{!"IsPartialProfile", i64 0}
+!10 = !{!"PartialProfileRatio", double 0.000000e+00}
+!11 = !{!"DetailedSummary", !12}
+!12 = !{!13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28}
+!13 = !{i32 10000, i64 0, i32 0}
+!14 = !{i32 100000, i64 4, i32 2}
+!15 = !{i32 200000, i64 4, i32 2}
+!16 = !{i32 300000, i64 4, i32 2}
+!17 = !{i32 400000, i64 4, i32 2}
+!18 = !{i32 500000, i64 4, i32 2}
+!19 = !{i32 600000, i64 4, i32 2}
+!20 = !{i32 700000, i64 2, i32 4}
+!21 = !{i32 800000, i64 2, i32 4}
+!22 = !{i32 900000, i64 2, i32 4}
+!23 = !{i32 950000, i64 2, i32 4}
+!24 = !{i32 990000, i64 2, i32 4}
+!25 = !{i32 999000, i64 2, i32 4}
+!26 = !{i32 999900, i64 2, i32 4}
+!27 = !{i32 999990, i64 2, i32 4}
+!28 = !{i32 999999, i64 2, i32 4}
+!29 = !{!"function_entry_count", i64 1}
+!30 = !{i64 -6490791336773930154}
+!31 = !{i64 5188446645037944434}
+!32 = !{i64 5583420417449503557}
+!57 = !{i64 132626519179914298}
+!33 = !{!34, !36}
+!34 = !{!35, !"notcold"}
+!35 = !{i64 -852997907418798798, i64 -2101080423462424381, i64 3456, i64 -2101080423462424381, i64 3456, i64 5188446645037944434}
+!36 = !{!37, !"cold"}
+!37 = !{i64 -852997907418798798, i64 -2101080423462424381, i64 3456, i64 -2101080423462424381, i64 3456, i64 5583420417449503557}
+!38 = !{i64 -852997907418798798}
+!39 = !{!40, !42}
+!40 = !{!41, !"notcold"}
+!41 = !{i64 4457553070050523782, i64 -2101080423462424381, i64 3456, i64 -2101080423462424381, i64 3456, i64 132626519179914298}
+!42 = !{!43, !"cold"}
+!43 = !{i64 4457553070050523782, i64 -2101080423462424381, i64 3456, i64 -2101080423462424381, i64 3456, i64 -6490791336773930154}
+!44 = !{i64 4457553070050523782}
+!45 = !{!46, !48}
+!46 = !{!47, !"notcold"}
+!47 = !{i64 456, i64 -2101080423462424381, i64 3456, i64 5188446645037944434}
+!48 = !{!49, !"cold"}
+!49 = !{i64 456, i64 -2101080423462424381, i64 3456, i64 5583420417449503557}
+!50 = !{i64 456}
+!51 = !{!52, !54}
+!52 = !{!53, !"notcold"}
+!53 = !{i64 789, i64 -2101080423462424381, i64 3456, i64 132626519179914298}
+!54 = !{!55, !"cold"}
+!55 = !{i64 789, i64 -2101080423462424381, i64 3456, i64 -6490791336773930154}
+!56 = !{i64 789}
diff --git a/llvm/test/ThinLTO/X86/memprof-recursive.ll b/llvm/test/ThinLTO/X86/memprof-recursive.ll
index 1f4f75460b070b..5be27c3e380fc4 100644
--- a/llvm/test/ThinLTO/X86/memprof-recursive.ll
+++ b/llvm/test/ThinLTO/X86/memprof-recursive.ll
@@ -39,7 +39,7 @@
 ; RUN:  --implicit-check-not="created clone" \
 ; RUN:	--implicit-check-not="marked with memprof allocation attribute cold"
 
-;; Check the default behavior (disabled recursive callsites).
+;; Check the default behavior (enabled recursive callsites).
 ; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
 ; RUN:  -supports-hot-cold-new \
 ; RUN:  -r=%t.o,_Z1Dv,plx \
@@ -49,10 +49,9 @@
 ; RUN:  -r=%t.o,_Znam, \
 ; RUN:  -memprof-verify-ccg -memprof-verify-nodes \
 ; RUN:  -pass-remarks=memprof-context-disambiguation \
-; RUN:  -o %t.out 2>&1 | FileCheck %s --allow-empty \
+; RUN:  -o %t.out 2>&1 | FileCheck %s \
 ; RUN:  --implicit-check-not "memprof_recursive3.cc:12:10: call in clone _Z1Ci.memprof.1 assigned" \
-; RUN:  --implicit-check-not="created clone" \
-; RUN:	--implicit-check-not="marked with memprof allocation attribute cold"
+; RUN:  --check-prefix=ALLOW-RECUR-CALLSITES --check-prefix=ALLOW-RECUR-CONTEXTS
 
 ;; Skipping recursive contexts should prevent spurious call to cloned version of
 ;; B from the context starting at memprof_recursive.cc:19:13, which is actually


