[llvm] [MemProf] Merge callee clones as needed before assigning functions (PR #135702)

Snehasish Kumar via llvm-commits llvm-commits at lists.llvm.org
Mon Apr 21 11:53:42 PDT 2025


================
@@ -4016,6 +4040,311 @@ IndexCallsiteContextGraph::cloneFunctionForCallsite(
   return {Func.func(), CloneNo};
 }
 
+// We perform cloning for each allocation node separately. However, this
+// sometimes results in a situation where the same node calls multiple
+// clones of the same callee, created for different allocations. This
+// causes issues when assigning functions to these clones, as each node can
+// in reality only call a single callee clone.
+//
+// To address this, before assigning functions, merge callee clone nodes as
+// needed using a post order traversal from the allocations. We attempt to
+// use existing clones as the merge node when legal, and to share them
+// among callers with the same properties (callers calling the same set of
+// callee clone nodes for the same allocations).
+//
+// Without this fix, in some cases incorrect function assignment will lead
+// to calling the wrong allocation clone.
+template <typename DerivedCCG, typename FuncTy, typename CallTy>
+void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones() {
+  if (!MergeClones)
+    return;
+
+  // Generate a map from context id to the associated allocation node for use
+  // when merging clones.
+  DenseMap<uint32_t, ContextNode *> ContextIdToAllocationNode;
+  for (auto &Entry : AllocationCallToContextNodeMap) {
+    auto *Node = Entry.second;
+    for (auto Id : Node->getContextIds())
+      ContextIdToAllocationNode[Id] = Node->getOrigNode();
+    for (auto *Clone : Node->Clones) {
+      for (auto Id : Clone->getContextIds())
+        ContextIdToAllocationNode[Id] = Clone->getOrigNode();
+    }
+  }
+
+  // Post order traversal starting from allocations to ensure each callsite
+  // calls a single clone of its callee. Callee nodes that are clones of each
+  // other are merged (via new merge nodes if needed) to achieve this.
+  DenseSet<const ContextNode *> Visited;
+  for (auto &Entry : AllocationCallToContextNodeMap) {
+    auto *Node = Entry.second;
+
+    mergeClones(Node, Visited, ContextIdToAllocationNode);
+
+    // Make a copy so the recursive post order traversal that may create new
+    // clones doesn't mess up iteration. Note that the recursive traversal
+    // itself does not call mergeClones on any of these nodes, which are all
+    // (clones of) allocations.
+    auto Clones = Node->Clones;
+    for (auto *Clone : Clones)
+      mergeClones(Clone, Visited, ContextIdToAllocationNode);
+  }
+
+  if (DumpCCG) {
+    dbgs() << "CCG after merging:\n";
+    dbgs() << *this;
+  }
+  if (ExportToDot)
+    exportToDot("aftermerge");
+
+  if (VerifyCCG) {
+    check();
+  }
+}
+
+// Recursive helper for above mergeClones method.
+template <typename DerivedCCG, typename FuncTy, typename CallTy>
+void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones(
+    ContextNode *Node, DenseSet<const ContextNode *> &Visited,
+    DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
+  auto Inserted = Visited.insert(Node);
+  if (!Inserted.second)
+    return;
+
+  // Make a copy since the recursive call may move a caller edge to a new
+  // callee, messing up the iterator.
+  auto CallerEdges = Node->CallerEdges;
+  for (auto CallerEdge : CallerEdges) {
+    // Skip any caller edge moved onto a different callee during recursion.
+    if (CallerEdge->Callee != Node)
+      continue;
+    mergeClones(CallerEdge->Caller, Visited, ContextIdToAllocationNode);
+  }
+
+  // Merge for this node after we handle its callers.
+  mergeNodeCalleeClones(Node, Visited, ContextIdToAllocationNode);
+}
+
+template <typename DerivedCCG, typename FuncTy, typename CallTy>
+void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeNodeCalleeClones(
+    ContextNode *Node, DenseSet<const ContextNode *> &Visited,
+    DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
+  // Ignore Node if we moved all of its contexts to clones.
+  if (Node->emptyContextIds())
+    return;
+
+  // First identify groups of clones among Node's callee edges, by building
+  // a map from each callee base node to the associated callee edges from Node.
+  MapVector<ContextNode *, std::vector<std::shared_ptr<ContextEdge>>>
+      OrigNodeToCloneEdges;
+  for (const auto &E : Node->CalleeEdges) {
+    auto *Callee = E->Callee;
+    if (!Callee->CloneOf && Callee->Clones.empty())
+      continue;
+    ContextNode *Base = Callee->getOrigNode();
+    OrigNodeToCloneEdges[Base].push_back(E);
+  }
+
+  // Helper for callee edge sorting below. Return true if A's callee has fewer
+  // caller edges than B, or if A is a clone and B is not, or if A's first
+  // context id is smaller than B's.
+  auto CalleeCallerEdgeLessThan = [](const std::shared_ptr<ContextEdge> &A,
+                                     const std::shared_ptr<ContextEdge> &B) {
+    if (A->Callee->CallerEdges.size() != B->Callee->CallerEdges.size())
+      return A->Callee->CallerEdges.size() < B->Callee->CallerEdges.size();
+    if (A->Callee->CloneOf && !B->Callee->CloneOf)
+      return true;
+    else if (!A->Callee->CloneOf && B->Callee->CloneOf)
+      return false;
+    // Use the first context id for each edge as a
+    // tie-breaker.
+    return *A->ContextIds.begin() < *B->ContextIds.begin();
+  };
+
+  // Process each set of callee clones called by Node, performing the needed
+  // merging.
+  for (auto Entry : OrigNodeToCloneEdges) {
+    // CalleeEdges is the set of edges from Node reaching callees that are
+    // mutual clones of each other.
+    auto &CalleeEdges = Entry.second;
+    auto NumCalleeClones = CalleeEdges.size();
+    // A single edge means there is no merging needed.
+    if (NumCalleeClones == 1)
+      continue;
+    // Sort the CalleeEdges calling this group of clones in ascending order of
+    // their caller edge counts, putting the original non-clone node first in
+    // cases of a tie. This simplifies finding an existing node to use as the
+    // merge node.
+    std::stable_sort(CalleeEdges.begin(), CalleeEdges.end(),
+                     CalleeCallerEdgeLessThan);
+
+    // Next look for other nodes that have edges to the same set of callee
+    // clones as the current Node. Those can share the eventual merge node
+    // (reducing cloning and binary size overhead) iff:
+    // - they have edges to the same set of callee clones
+    // - each callee edge reaches a subset of the same allocations as Node's
+    //   corresponding edge to the same callee clone.
+    // The second requirement is to ensure that we don't undo any of the
+    // necessary cloning to distinguish contexts with different allocation
+    // behavior.
+    // FIXME: This is somewhat conservative, as we really just need to ensure
+    // that they don't reach the same allocations as contexts on edges from Node
+    // going to any of the *other* callee clones being merged. However, that
+    // requires more tracking and checking to get right.
+    //
+    // This map counts how many edges to the same callee clone exist for other
+    // caller nodes of each callee clone.
+    DenseMap<ContextNode *, unsigned> OtherCallersToSharedCalleeEdgeCount;
+    // Counts the number of other caller nodes that have edges to all callee
+    // clones that don't violate the allocation context checking.
+    unsigned PossibleOtherCallerNodes = 0;
+    // We only need to look at other Caller nodes if the first callee edge has
+    // multiple callers (recall they are sorted in ascending order above).
+    if (CalleeEdges[0]->Callee->CallerEdges.size() > 1) {
+      // For each callee edge:
+      // - Collect the count of other caller nodes calling the same callees.
+      // - Collect the alloc nodes reached by contexts on each callee edge.
+      DenseMap<ContextEdge *, DenseSet<ContextNode *>> CalleeEdgeToAllocNodes;
+      for (auto CalleeEdge : CalleeEdges) {
+        assert(CalleeEdge->Callee->CallerEdges.size() > 1);
+        // For each other caller of the same callee, increment the count of
+        // edges reaching the same callee clone.
+        for (auto CalleeCallerEdges : CalleeEdge->Callee->CallerEdges) {
+          if (CalleeCallerEdges->Caller == Node) {
+            assert(CalleeCallerEdges == CalleeEdge);
+            continue;
+          }
+          OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller]++;
+          // If this caller edge now reaches all of the same callee clones,
+          // increment the count of candidate other caller nodes.
+          if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller] ==
+              NumCalleeClones)
+            PossibleOtherCallerNodes++;
+        }
+        // Collect the alloc nodes reached by contexts on each callee edge, for
+        // later analysis.
+        for (auto Id : CalleeEdge->getContextIds()) {
+          auto *Alloc = ContextIdToAllocationNode.lookup(Id);
+          if (!Alloc) {
+            // FIXME: unclear why this happens occasionally, presumably
+            // imperfect graph updates possibly with recursion.
+            MissingAllocForContextId++;
+            continue;
+          }
+          CalleeEdgeToAllocNodes[CalleeEdge.get()].insert(Alloc);
+        }
+      }
+      // Now walk the callee edges again, and make sure that for each candidate
+      // caller node all of its edges to the callees reach the same allocs (or
+      // a subset) as those along the corresponding callee edge from Node.
+      for (auto CalleeEdge : CalleeEdges) {
+        assert(CalleeEdge->Callee->CallerEdges.size() > 1);
+        // Stop if we do not have any (more) candidate other caller nodes.
+        if (!PossibleOtherCallerNodes)
+          break;
+        auto &CurCalleeAllocNodes = CalleeEdgeToAllocNodes[CalleeEdge.get()];
+        // Check each other caller of this callee clone.
+        for (auto CalleeCallerE : CalleeEdge->Callee->CallerEdges) {
+          // Not interested in the callee edge from Node itself.
+          if (CalleeCallerE == CalleeEdge)
+            continue;
+          // Skip any callers that didn't have callee edges to all the same
+          // callee clones.
+          if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] !=
+              NumCalleeClones)
+            continue;
+          // Make sure that each context along edge from candidate caller node
+          // reaches an allocation also reached by this callee edge from Node.
+          for (auto Id : CalleeCallerE->getContextIds()) {
+            auto *Alloc = ContextIdToAllocationNode.lookup(Id);
+            if (!Alloc)
+              continue;
+            // If not, simply reset the map entry to 0 so caller is ignored, and
+            // reduce the count of candidate other caller nodes.
+            if (!CurCalleeAllocNodes.contains(Alloc)) {
+              OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] = 0;
+              PossibleOtherCallerNodes--;
+              break;
+            }
+          }
+        }
+      }
+    }
+
+    // Now do the actual merging. Identify existing or create a new MergeNode
+    // during the first iteration. Move each callee over, along with edges from
+    // other callers we've determined above can share the same merge node.
+    ContextNode *MergeNode = nullptr;
+    DenseMap<ContextNode *, unsigned> CallerToMoveCount;
+    for (auto CalleeEdge : CalleeEdges) {
+      auto *OrigCallee = CalleeEdge->Callee;
+      // If we don't have a MergeNode yet (only happens on the first iteration,
+      // as a new one will be created when we go to move the first callee edge
+      // over as needed), see if we can use this callee.
+      if (!MergeNode) {
+        // If there are no other callers, simply use this callee.
+        if (CalleeEdge->Callee->CallerEdges.size() == 1) {
+          MergeNode = OrigCallee;
+          NonNewMergedNodes++;
+          continue;
+        }
+        // Otherwise, if we have identified other caller nodes that can share
+        // the merge node with Node, see if all of OrigCallee's callers are
+        // going to share the same merge node. In that case we can use callee
+        // (since all of its callers would move to the new merge node).
+        if (PossibleOtherCallerNodes) {
+          bool MoveAllCallerEdges = true;
+          for (auto CalleeCallerE : OrigCallee->CallerEdges) {
+            if (CalleeCallerE == CalleeEdge)
+              continue;
+            if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] !=
+                NumCalleeClones) {
+              MoveAllCallerEdges = false;
+              break;
+            }
+          }
+          // If we are going to move all callers over, we can use this callee as
+          // the MergeNode.
+          if (MoveAllCallerEdges) {
+            MergeNode = OrigCallee;
+            NonNewMergedNodes++;
+            continue;
+          }
+        }
+      }
+      // Move this callee edge, creating a new merge node if necessary.
+      if (MergeNode) {
+        assert(MergeNode != OrigCallee);
+        moveEdgeToExistingCalleeClone(CalleeEdge, MergeNode,
+                                      /*NewClone*/ false);
+      } else {
+        MergeNode = moveEdgeToNewCalleeClone(CalleeEdge);
+        NewMergedNodes++;
+      }
+      // Now move all identified edges from other callers over to the merge node
+      // as well.
+      if (PossibleOtherCallerNodes) {
+        // Make and iterate over a copy of OrigCallee's caller edges because
+        // some of these will be moved off of the OrigCallee and that would mess
+        // up the iteration from OrigCallee.
+        auto OrigCalleeCallerEdges = OrigCallee->CallerEdges;
+        for (auto CalleeCallerE : OrigCalleeCallerEdges) {
----------------
snehasish wrote:

Can this be a reference since we've already made a copy?

https://github.com/llvm/llvm-project/pull/135702


More information about the llvm-commits mailing list