[llvm] 5b6f151 - [SampleFDO] Improve stale profile matching by diff algorithm (#87375)

via llvm-commits llvm-commits at lists.llvm.org
Mon May 13 16:01:33 PDT 2024


Author: Lei Wang
Date: 2024-05-13T16:01:29-07:00
New Revision: 5b6f15110422f4955212bd26a96057972e3304ad

URL: https://github.com/llvm/llvm-project/commit/5b6f15110422f4955212bd26a96057972e3304ad
DIFF: https://github.com/llvm/llvm-project/commit/5b6f15110422f4955212bd26a96057972e3304ad.diff

LOG: [SampleFDO] Improve stale profile matching by diff algorithm (#87375)

This change improves the matching algorithm by using the diff algorithm.
The current matching algorithm only processes the callsites grouped by
function name; it doesn't consider the order relationships between
functions with different names, so it sometimes fails to handle this
ambiguous anchor case. For example (`foo:1` means a
callsite [callee_name:callsite_location]):
```
IR :      foo:1  bar:2  foo:4  bar:5 
Profile :        bar:3  foo:5  bar:6
```
The `foo:1` is matched to the 2nd `foo:5`, and using the diff
algorithm (finding the longest common subsequence) can help with this issue.
One well-known diff algorithm is the Myers diff algorithm (paper: "An
O(ND) Difference Algorithm and Its Variations" by Eugene W. Myers). Its
variations have been implemented and used in many famous tools, like
GNU diff and git diff. It provides an efficient way to find the longest
common subsequence or the shortest edit script through graph searching.
There are several variations/refinements of the algorithm, but in our
case the number of function callsites is usually very small, so we
implemented the basic greedy version in this change, which should be good
enough.
We observed better matchings and positive perf improvement on our
internal services.

Added: 
    llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-stale-profile-matching-LCS.prof
    llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-matching-LCS.ll

Modified: 
    llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h
    llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp
    llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-matching.ll

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h b/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h
index 7ae6194da7c9c..b6feca5d47035 100644
--- a/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h
+++ b/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h
@@ -19,6 +19,9 @@
 
 namespace llvm {
 
+using AnchorList = std::vector<std::pair<LineLocation, FunctionId>>;
+using AnchorMap = std::map<LineLocation, FunctionId>;
+
 // Sample profile matching - fuzzy match.
 class SampleProfileMatcher {
   Module &M;
@@ -27,8 +30,8 @@ class SampleProfileMatcher {
   const ThinOrFullLTOPhase LTOPhase;
   SampleProfileMap FlattenedProfiles;
   // For each function, the matcher generates a map, of which each entry is a
-  // mapping from the source location of current build to the source location in
-  // the profile.
+  // mapping from the source location of current build to the source location
+  // in the profile.
   StringMap<LocToLocMap> FuncMappings;
 
   // Match state for an anchor/callsite.
@@ -95,18 +98,13 @@ class SampleProfileMatcher {
     return nullptr;
   }
   void runOnFunction(Function &F);
-  void findIRAnchors(const Function &F,
-                     std::map<LineLocation, StringRef> &IRAnchors);
-  void findProfileAnchors(
-      const FunctionSamples &FS,
-      std::map<LineLocation, std::unordered_set<FunctionId>> &ProfileAnchors);
+  void findIRAnchors(const Function &F, AnchorMap &IRAnchors);
+  void findProfileAnchors(const FunctionSamples &FS, AnchorMap &ProfileAnchors);
   // Record the callsite match states for profile staleness report, the result
   // is saved in FuncCallsiteMatchStates.
-  void recordCallsiteMatchStates(
-      const Function &F, const std::map<LineLocation, StringRef> &IRAnchors,
-      const std::map<LineLocation, std::unordered_set<FunctionId>>
-          &ProfileAnchors,
-      const LocToLocMap *IRToProfileLocationMap);
+  void recordCallsiteMatchStates(const Function &F, const AnchorMap &IRAnchors,
+                                 const AnchorMap &ProfileAnchors,
+                                 const LocToLocMap *IRToProfileLocationMap);
 
   bool isMismatchState(const enum MatchState &State) {
     return State == MatchState::InitialMismatch ||
@@ -143,11 +141,25 @@ class SampleProfileMatcher {
   }
   void distributeIRToProfileLocationMap();
   void distributeIRToProfileLocationMap(FunctionSamples &FS);
-  void runStaleProfileMatching(
-      const Function &F, const std::map<LineLocation, StringRef> &IRAnchors,
-      const std::map<LineLocation, std::unordered_set<FunctionId>>
-          &ProfileAnchors,
-      LocToLocMap &IRToProfileLocationMap);
+  // This function implements the Myers diff algorithm used for stale profile
+  // matching. The algorithm provides a simple and efficient way to find the
+  // Longest Common Subsequence(LCS) or the Shortest Edit Script(SES) of two
+  // sequences. For more details, refer to the paper 'An O(ND) Difference
+  // Algorithm and Its Variations' by Eugene W. Myers.
+  // In the scenario of profile fuzzy matching, the two sequences are the IR
+  // callsite anchors and profile callsite anchors. The subsequence equivalent
+  // parts from the resulting SES are used to remap the IR locations to the
+  // profile locations. As the number of function callsites is usually not big,
+  // we currently just implement the basic greedy version (page 6 of the paper).
+  LocToLocMap
+  longestCommonSequence(const AnchorList &IRCallsiteAnchors,
+                        const AnchorList &ProfileCallsiteAnchors) const;
+  void matchNonCallsiteLocs(const LocToLocMap &AnchorMatchings,
+                            const AnchorMap &IRAnchors,
+                            LocToLocMap &IRToProfileLocationMap);
+  void runStaleProfileMatching(const Function &F, const AnchorMap &IRAnchors,
+                               const AnchorMap &ProfileAnchors,
+                               LocToLocMap &IRToProfileLocationMap);
   void reportOrPersistProfileStats();
 };
 } // end namespace llvm

diff --git a/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp b/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp
index 142660bcc58e3..d7613bce4c52e 100644
--- a/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp
@@ -24,8 +24,8 @@ extern cl::opt<bool> SalvageStaleProfile;
 extern cl::opt<bool> PersistProfileStaleness;
 extern cl::opt<bool> ReportProfileStaleness;
 
-void SampleProfileMatcher::findIRAnchors(
-    const Function &F, std::map<LineLocation, StringRef> &IRAnchors) {
+void SampleProfileMatcher::findIRAnchors(const Function &F,
+                                         AnchorMap &IRAnchors) {
   // For inlined code, recover the original callsite and callee by finding the
   // top-level inline frame. e.g. For frame stack "main:1 @ foo:2 @ bar:3", the
   // top-level frame is "main:1", the callsite is "1" and the callee is "foo".
@@ -40,7 +40,7 @@ void SampleProfileMatcher::findIRAnchors(
     LineLocation Callsite = FunctionSamples::getCallSiteIdentifier(
         DIL, FunctionSamples::ProfileIsFS);
     StringRef CalleeName = PrevDIL->getSubprogramLinkageName();
-    return std::make_pair(Callsite, CalleeName);
+    return std::make_pair(Callsite, FunctionId(CalleeName));
   };
 
   auto GetCanonicalCalleeName = [](const CallBase *CB) {
@@ -70,7 +70,8 @@ void SampleProfileMatcher::findIRAnchors(
               if (!isa<IntrinsicInst>(&I))
                 CalleeName = GetCanonicalCalleeName(CB);
             }
-            IRAnchors.emplace(LineLocation(Probe->Id, 0), CalleeName);
+            LineLocation Loc = LineLocation(Probe->Id, 0);
+            IRAnchors.emplace(Loc, FunctionId(CalleeName));
           }
         }
       } else {
@@ -86,84 +87,127 @@ void SampleProfileMatcher::findIRAnchors(
           LineLocation Callsite = FunctionSamples::getCallSiteIdentifier(
               DIL, FunctionSamples::ProfileIsFS);
           StringRef CalleeName = GetCanonicalCalleeName(dyn_cast<CallBase>(&I));
-          IRAnchors.emplace(Callsite, CalleeName);
+          IRAnchors.emplace(Callsite, FunctionId(CalleeName));
         }
       }
     }
   }
 }
 
-void SampleProfileMatcher::findProfileAnchors(
-    const FunctionSamples &FS,
-    std::map<LineLocation, std::unordered_set<FunctionId>> &ProfileAnchors) {
+void SampleProfileMatcher::findProfileAnchors(const FunctionSamples &FS,
+                                              AnchorMap &ProfileAnchors) {
   auto isInvalidLineOffset = [](uint32_t LineOffset) {
     return LineOffset & 0x8000;
   };
 
+  auto InsertAnchor = [](const LineLocation &Loc, const FunctionId &CalleeName,
+                         AnchorMap &ProfileAnchors) {
+    auto Ret = ProfileAnchors.try_emplace(Loc, CalleeName);
+    if (!Ret.second) {
+      // For multiple callees, which indicates it's an indirect call, we use a
+      // dummy name(UnknownIndirectCallee) as the indirect callee name.
+      Ret.first->second = FunctionId(UnknownIndirectCallee);
+    }
+  };
+
   for (const auto &I : FS.getBodySamples()) {
     const LineLocation &Loc = I.first;
     if (isInvalidLineOffset(Loc.LineOffset))
       continue;
-    for (const auto &I : I.second.getCallTargets()) {
-      auto Ret =
-          ProfileAnchors.try_emplace(Loc, std::unordered_set<FunctionId>());
-      Ret.first->second.insert(I.first);
-    }
+    for (const auto &C : I.second.getCallTargets())
+      InsertAnchor(Loc, C.first, ProfileAnchors);
   }
 
   for (const auto &I : FS.getCallsiteSamples()) {
     const LineLocation &Loc = I.first;
     if (isInvalidLineOffset(Loc.LineOffset))
       continue;
-    const auto &CalleeMap = I.second;
-    for (const auto &I : CalleeMap) {
-      auto Ret =
-          ProfileAnchors.try_emplace(Loc, std::unordered_set<FunctionId>());
-      Ret.first->second.insert(I.first);
-    }
+    for (const auto &C : I.second)
+      InsertAnchor(Loc, C.first, ProfileAnchors);
   }
 }
 
-// Call target name anchor based profile fuzzy matching.
-// Input:
-// For IR locations, the anchor is the callee name of direct callsite; For
-// profile locations, it's the call target name for BodySamples or inlinee's
-// profile name for CallsiteSamples.
-// Matching heuristic:
-// First match all the anchors in lexical order, then split the non-anchor
-// locations between the two anchors evenly, first half are matched based on the
-// start anchor, second half are matched based on the end anchor.
-// For example, given:
-// IR locations:      [1, 2(foo), 3, 5, 6(bar), 7]
-// Profile locations: [1, 2, 3(foo), 4, 7, 8(bar), 9]
-// The matching gives:
-//   [1,    2(foo), 3,  5,  6(bar), 7]
-//    |     |       |   |     |     |
-//   [1, 2, 3(foo), 4,  7,  8(bar), 9]
-// The output mapping: [2->3, 3->4, 5->7, 6->8, 7->9].
-void SampleProfileMatcher::runStaleProfileMatching(
-    const Function &F, const std::map<LineLocation, StringRef> &IRAnchors,
-    const std::map<LineLocation, std::unordered_set<FunctionId>>
-        &ProfileAnchors,
-    LocToLocMap &IRToProfileLocationMap) {
-  LLVM_DEBUG(dbgs() << "Run stale profile matching for " << F.getName()
-                    << "\n");
-  assert(IRToProfileLocationMap.empty() &&
-         "Run stale profile matching only once per function");
+LocToLocMap SampleProfileMatcher::longestCommonSequence(
+    const AnchorList &AnchorList1, const AnchorList &AnchorList2) const {
+  int32_t Size1 = AnchorList1.size(), Size2 = AnchorList2.size(),
+          MaxDepth = Size1 + Size2;
+  auto Index = [&](int32_t I) { return I + MaxDepth; };
+
+  LocToLocMap EqualLocations;
+  if (MaxDepth == 0)
+    return EqualLocations;
+
+  // Backtrack the SES result.
+  auto Backtrack = [&](const std::vector<std::vector<int32_t>> &Trace,
+                       const AnchorList &AnchorList1,
+                       const AnchorList &AnchorList2,
+                       LocToLocMap &EqualLocations) {
+    int32_t X = Size1, Y = Size2;
+    for (int32_t Depth = Trace.size() - 1; X > 0 || Y > 0; Depth--) {
+      const auto &P = Trace[Depth];
+      int32_t K = X - Y;
+      int32_t PrevK = K;
+      if (K == -Depth || (K != Depth && P[Index(K - 1)] < P[Index(K + 1)]))
+        PrevK = K + 1;
+      else
+        PrevK = K - 1;
+
+      int32_t PrevX = P[Index(PrevK)];
+      int32_t PrevY = PrevX - PrevK;
+      while (X > PrevX && Y > PrevY) {
+        X--;
+        Y--;
+        EqualLocations.insert({AnchorList1[X].first, AnchorList2[Y].first});
+      }
 
-  std::unordered_map<FunctionId, std::set<LineLocation>> CalleeToCallsitesMap;
-  for (const auto &I : ProfileAnchors) {
-    const auto &Loc = I.first;
-    const auto &Callees = I.second;
-    // Filter out possible indirect calls, use direct callee name as anchor.
-    if (Callees.size() == 1) {
-      FunctionId CalleeName = *Callees.begin();
-      const auto &Candidates = CalleeToCallsitesMap.try_emplace(
-          CalleeName, std::set<LineLocation>());
-      Candidates.first->second.insert(Loc);
+      if (Depth == 0)
+        break;
+
+      if (Y == PrevY)
+        X--;
+      else if (X == PrevX)
+        Y--;
+      X = PrevX;
+      Y = PrevY;
+    }
+  };
+
+  // The greedy LCS/SES algorithm.
+
+  // An array contains the endpoints of the furthest reaching D-paths.
+  std::vector<int32_t> V(2 * MaxDepth + 1, -1);
+  V[Index(1)] = 0;
+  // Trace is used to backtrack the SES result.
+  std::vector<std::vector<int32_t>> Trace;
+  for (int32_t Depth = 0; Depth <= MaxDepth; Depth++) {
+    Trace.push_back(V);
+    for (int32_t K = -Depth; K <= Depth; K += 2) {
+      int32_t X = 0, Y = 0;
+      if (K == -Depth || (K != Depth && V[Index(K - 1)] < V[Index(K + 1)]))
+        X = V[Index(K + 1)];
+      else
+        X = V[Index(K - 1)] + 1;
+      Y = X - K;
+      while (X < Size1 && Y < Size2 &&
+             AnchorList1[X].second == AnchorList2[Y].second)
+        X++, Y++;
+
+      V[Index(K)] = X;
+
+      if (X >= Size1 && Y >= Size2) {
+        // Length of an SES is D.
+        Backtrack(Trace, AnchorList1, AnchorList2, EqualLocations);
+        return EqualLocations;
+      }
     }
   }
+  // Length of an SES is greater than MaxDepth.
+  return EqualLocations;
+}
 
+void SampleProfileMatcher::matchNonCallsiteLocs(
+    const LocToLocMap &MatchedAnchors, const AnchorMap &IRAnchors,
+    LocToLocMap &IRToProfileLocationMap) {
   auto InsertMatching = [&](const LineLocation &From, const LineLocation &To) {
     // Skip the unchanged location mapping to save memory.
     if (From != To)
@@ -173,43 +217,35 @@ void SampleProfileMatcher::runStaleProfileMatching(
   // Use function's beginning location as the initial anchor.
   int32_t LocationDelta = 0;
   SmallVector<LineLocation> LastMatchedNonAnchors;
-
   for (const auto &IR : IRAnchors) {
     const auto &Loc = IR.first;
-    auto CalleeName = IR.second;
     bool IsMatchedAnchor = false;
     // Match the anchor location in lexical order.
-    if (!CalleeName.empty()) {
-      auto CandidateAnchors =
-          CalleeToCallsitesMap.find(getRepInFormat(CalleeName));
-      if (CandidateAnchors != CalleeToCallsitesMap.end() &&
-          !CandidateAnchors->second.empty()) {
-        auto CI = CandidateAnchors->second.begin();
-        const auto Candidate = *CI;
-        CandidateAnchors->second.erase(CI);
-        InsertMatching(Loc, Candidate);
-        LLVM_DEBUG(dbgs() << "Callsite with callee:" << CalleeName
-                          << " is matched from " << Loc << " to " << Candidate
-                          << "\n");
-        LocationDelta = Candidate.LineOffset - Loc.LineOffset;
-
-        // Match backwards for non-anchor locations.
-        // The locations in LastMatchedNonAnchors have been matched forwards
-        // based on the previous anchor, spilt it evenly and overwrite the
-        // second half based on the current anchor.
-        for (size_t I = (LastMatchedNonAnchors.size() + 1) / 2;
-             I < LastMatchedNonAnchors.size(); I++) {
-          const auto &L = LastMatchedNonAnchors[I];
-          uint32_t CandidateLineOffset = L.LineOffset + LocationDelta;
-          LineLocation Candidate(CandidateLineOffset, L.Discriminator);
-          InsertMatching(L, Candidate);
-          LLVM_DEBUG(dbgs() << "Location is rematched backwards from " << L
-                            << " to " << Candidate << "\n");
-        }
-
-        IsMatchedAnchor = true;
-        LastMatchedNonAnchors.clear();
+    auto R = MatchedAnchors.find(Loc);
+    if (R != MatchedAnchors.end()) {
+      const auto &Candidate = R->second;
+      InsertMatching(Loc, Candidate);
+      LLVM_DEBUG(dbgs() << "Callsite with callee:" << IR.second.stringRef()
+                        << " is matched from " << Loc << " to " << Candidate
+                        << "\n");
+      LocationDelta = Candidate.LineOffset - Loc.LineOffset;
+
+      // Match backwards for non-anchor locations.
+      // The locations in LastMatchedNonAnchors have been matched forwards
+      // based on the previous anchor, split it evenly and overwrite the
+      // second half based on the current anchor.
+      for (size_t I = (LastMatchedNonAnchors.size() + 1) / 2;
+           I < LastMatchedNonAnchors.size(); I++) {
+        const auto &L = LastMatchedNonAnchors[I];
+        uint32_t CandidateLineOffset = L.LineOffset + LocationDelta;
+        LineLocation Candidate(CandidateLineOffset, L.Discriminator);
+        InsertMatching(L, Candidate);
+        LLVM_DEBUG(dbgs() << "Location is rematched backwards from " << L
+                          << " to " << Candidate << "\n");
       }
+
+      IsMatchedAnchor = true;
+      LastMatchedNonAnchors.clear();
     }
 
     // Match forwards for non-anchor locations.
@@ -224,6 +260,57 @@ void SampleProfileMatcher::runStaleProfileMatching(
   }
 }
 
+// Call target name anchor based profile fuzzy matching.
+// Input:
+// For IR locations, the anchor is the callee name of direct callsite; For
+// profile locations, it's the call target name for BodySamples or inlinee's
+// profile name for CallsiteSamples.
+// Matching heuristic:
+// First match all the anchors using the diff algorithm, then split the
+// non-anchor locations between the two anchors evenly, first half are matched
+// based on the start anchor, second half are matched based on the end anchor.
+// For example, given:
+// IR locations:      [1, 2(foo), 3, 5, 6(bar), 7]
+// Profile locations: [1, 2, 3(foo), 4, 7, 8(bar), 9]
+// The matching gives:
+//   [1,    2(foo), 3,  5,  6(bar), 7]
+//    |     |       |   |     |     |
+//   [1, 2, 3(foo), 4,  7,  8(bar), 9]
+// The output mapping: [2->3, 3->4, 5->7, 6->8, 7->9].
+void SampleProfileMatcher::runStaleProfileMatching(
+    const Function &F, const AnchorMap &IRAnchors,
+    const AnchorMap &ProfileAnchors, LocToLocMap &IRToProfileLocationMap) {
+  LLVM_DEBUG(dbgs() << "Run stale profile matching for " << F.getName()
+                    << "\n");
+  assert(IRToProfileLocationMap.empty() &&
+         "Run stale profile matching only once per function");
+
+  AnchorList FilteredProfileAnchorList;
+  for (const auto &I : ProfileAnchors)
+    FilteredProfileAnchorList.emplace_back(I);
+
+  AnchorList FilteredIRAnchorsList;
+  // Filter the non-callsite from IRAnchors.
+  for (const auto &I : IRAnchors) {
+    if (I.second.stringRef().empty())
+      continue;
+    FilteredIRAnchorsList.emplace_back(I);
+  }
+
+  if (FilteredIRAnchorsList.empty() || FilteredProfileAnchorList.empty())
+    return;
+
+  // Match the callsite anchors by finding the longest common subsequence
+  // between IR and profile. Note that we need to use IR anchor as base(A side)
+  // to align with the order of IRToProfileLocationMap.
+  LocToLocMap MatchedAnchors =
+      longestCommonSequence(FilteredIRAnchorsList, FilteredProfileAnchorList);
+
+  // Match the non-callsite locations and write the result to
+  // IRToProfileLocationMap.
+  matchNonCallsiteLocs(MatchedAnchors, IRAnchors, IRToProfileLocationMap);
+}
+
 void SampleProfileMatcher::runOnFunction(Function &F) {
   // We need to use flattened function samples for matching.
   // Unlike IR, which includes all callsites from the source code, the callsites
@@ -238,11 +325,11 @@ void SampleProfileMatcher::runOnFunction(Function &F) {
   // Anchors for IR. It's a map from IR location to callee name, callee name is
   // empty for non-call instruction and use a dummy name(UnknownIndirectCallee)
   // for unknown indrect callee name.
-  std::map<LineLocation, StringRef> IRAnchors;
+  AnchorMap IRAnchors;
   findIRAnchors(F, IRAnchors);
   // Anchors for profile. It's a map from callsite location to a set of callee
   // name.
-  std::map<LineLocation, std::unordered_set<FunctionId>> ProfileAnchors;
+  AnchorMap ProfileAnchors;
   findProfileAnchors(*FSFlattened, ProfileAnchors);
 
   // Compute the callsite match states for profile staleness report.
@@ -274,9 +361,8 @@ void SampleProfileMatcher::runOnFunction(Function &F) {
 }
 
 void SampleProfileMatcher::recordCallsiteMatchStates(
-    const Function &F, const std::map<LineLocation, StringRef> &IRAnchors,
-    const std::map<LineLocation, std::unordered_set<FunctionId>>
-        &ProfileAnchors,
+    const Function &F, const AnchorMap &IRAnchors,
+    const AnchorMap &ProfileAnchors,
     const LocToLocMap *IRToProfileLocationMap) {
   bool IsPostMatch = IRToProfileLocationMap != nullptr;
   auto &CallsiteMatchStates =
@@ -297,23 +383,12 @@ void SampleProfileMatcher::recordCallsiteMatchStates(
     // After fuzzy profile matching, use the matching result to remap the
     // current IR callsite.
     const auto &ProfileLoc = MapIRLocToProfileLoc(I.first);
-    const auto &IRCalleeName = I.second;
+    const auto &IRCalleeId = I.second;
     const auto &It = ProfileAnchors.find(ProfileLoc);
     if (It == ProfileAnchors.end())
       continue;
-    const auto &Callees = It->second;
-
-    bool IsCallsiteMatched = false;
-    // Since indirect call does not have CalleeName, check conservatively if
-    // callsite in the profile is a callsite location. This is to reduce num of
-    // false positive since otherwise all the indirect call samples will be
-    // reported as mismatching.
-    if (IRCalleeName == SampleProfileMatcher::UnknownIndirectCallee)
-      IsCallsiteMatched = true;
-    else if (Callees.size() == 1 && Callees.count(getRepInFormat(IRCalleeName)))
-      IsCallsiteMatched = true;
-
-    if (IsCallsiteMatched) {
+    const auto &ProfCalleeId = It->second;
+    if (IRCalleeId == ProfCalleeId) {
       auto It = CallsiteMatchStates.find(ProfileLoc);
       if (It == CallsiteMatchStates.end())
         CallsiteMatchStates.emplace(ProfileLoc, MatchState::InitialMatch);
@@ -330,8 +405,7 @@ void SampleProfileMatcher::recordCallsiteMatchStates(
   // IR callsites.
   for (const auto &I : ProfileAnchors) {
     const auto &Loc = I.first;
-    [[maybe_unused]] const auto &Callees = I.second;
-    assert(!Callees.empty() && "Callees should not be empty");
+    assert(!I.second.stringRef().empty() && "Callees should not be empty");
     auto It = CallsiteMatchStates.find(Loc);
     if (It == CallsiteMatchStates.end())
       CallsiteMatchStates.emplace(Loc, MatchState::InitialMismatch);

diff --git a/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-stale-profile-matching-LCS.prof b/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-stale-profile-matching-LCS.prof
new file mode 100644
index 0000000000000..e56c7c01865d1
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-stale-profile-matching-LCS.prof
@@ -0,0 +1,26 @@
+test_direct_call:606:83
+ 1: 83
+ 2: 83 C:83
+ 3: 90 B:90
+ 4: 83 A:83
+ 5: 92 B:92
+ 6: 83 A:83
+ 7: 97 C:97
+ !CFGChecksum: 123456
+test_indirect_call:589:86
+ 1: 86
+ 2: 86 C:86
+ 3: 83 A:43 B:40
+ 4: 84 B:84
+ 6: 82 B:62 A:20
+ 7: 91 C:91
+ !CFGChecksum: 123456
+main:403:0
+ 1: 0
+ 2: 80
+ 3: 80
+ 4: 86 test_indirect_call:86
+ 5: 83 test_direct_call:83
+ 6: 83
+ 7: 0
+ !CFGChecksum: 563036051115663

diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-matching-LCS.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-matching-LCS.ll
new file mode 100644
index 0000000000000..ecf8484d98e59
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-matching-LCS.ll
@@ -0,0 +1,219 @@
+; REQUIRES: x86_64-linux
+; REQUIRES: asserts
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/pseudo-probe-stale-profile-matching-LCS.prof --salvage-stale-profile -S --debug-only=sample-profile,sample-profile-matcher,sample-profile-impl 2>&1 | FileCheck %s
+
+; CHECK: Run stale profile matching for test_direct_call
+; CHECK: Location is matched from 1 to 1
+; CHECK: Location is matched from 2 to 2
+; CHECK: Location is matched from 3 to 3
+; CHECK: Callsite with callee:C is matched from 4 to 2
+; CHECK: Location is rematched backwards from 3 to 1
+; CHECK: Callsite with callee:A is matched from 5 to 4
+; CHECK: Callsite with callee:B is matched from 6 to 5
+; CHECK: Location is matched from 7 to 6
+; CHECK: Callsite with callee:A is matched from 8 to 6
+
+; CHECK: Run stale profile matching for test_indirect_call
+; CHECK: Location is matched from 1 to 1
+; CHECK: Location is matched from 2 to 2
+; CHECK: Location is matched from 3 to 3
+; CHECK: Location is matched from 4 to 4
+; CHECK: Callsite with callee:C is matched from 5 to 2
+; CHECK: Location is rematched backwards from 3 to 0
+; CHECK: Location is rematched backwards from 4 to 1
+; CHECK: Callsite with callee:unknown.indirect.callee is matched from 6 to 3
+; CHECK:Callsite with callee:B is matched from 7 to 4
+; CHECK: Location is matched from 8 to 5
+; CHECK: Callsite with callee:unknown.indirect.callee is matched from 9 to 6
+; CHECK: Callsite with callee:C is matched from 10 to 7
+
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@c = external global i32, align 4
+
+; Function Attrs: nounwind uwtable
+define dso_local i32 @test_direct_call(i32 noundef %x) #0 !dbg !12 {
+entry:
+    #dbg_value(i32 %x, !17, !DIExpression(), !18)
+  call void @llvm.pseudoprobe(i64 -4364451034228175269, i64 1, i32 0, i64 -1), !dbg !19
+  %call = call i32 @A(i32 noundef %x), !dbg !20
+  %add = add nsw i32 %x, %call, !dbg !22
+    #dbg_value(i32 %add, !17, !DIExpression(), !18)
+  %call1 = call i32 @B(i32 noundef %add), !dbg !23
+  %add2 = add nsw i32 %add, %call1, !dbg !25
+    #dbg_value(i32 %add2, !17, !DIExpression(), !18)
+  %call3 = call i32 @C(i32 noundef %add2), !dbg !26
+  %add4 = add nsw i32 %add2, %call3, !dbg !28
+    #dbg_value(i32 %add4, !17, !DIExpression(), !18)
+  %call5 = call i32 @A(i32 noundef %add4), !dbg !29
+  %add6 = add nsw i32 %add4, %call5, !dbg !31
+    #dbg_value(i32 %add6, !17, !DIExpression(), !18)
+  %call7 = call i32 @B(i32 noundef %add6), !dbg !32
+  %add8 = add nsw i32 %add6, %call7, !dbg !34
+    #dbg_value(i32 %add8, !17, !DIExpression(), !18)
+  %call9 = call i32 @B(i32 noundef %add8), !dbg !35
+  %add10 = add nsw i32 %add8, %call9, !dbg !37
+    #dbg_value(i32 %add10, !17, !DIExpression(), !18)
+  %call11 = call i32 @A(i32 noundef %add10), !dbg !38
+  %add12 = add nsw i32 %add10, %call11, !dbg !40
+    #dbg_value(i32 %add12, !17, !DIExpression(), !18)
+  ret i32 %add12, !dbg !41
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+declare !dbg !42 i32 @A(i32 noundef) #2
+
+declare !dbg !43 i32 @B(i32 noundef) #2
+
+declare !dbg !44 i32 @C(i32 noundef) #2
+
+; Function Attrs: nounwind uwtable
+define dso_local i32 @test_indirect_call(i32 noundef %x) #0 !dbg !45 {
+entry:
+    #dbg_value(i32 %x, !47, !DIExpression(), !50)
+  call void @llvm.pseudoprobe(i64 -8563147518712133441, i64 1, i32 0, i64 -1), !dbg !51
+  %0 = load i32, ptr @c, align 4, !dbg !51, !tbaa !53
+  %tobool = icmp ne i32 %0, 0, !dbg !51
+  br i1 %tobool, label %if.then, label %if.else, !dbg !57
+
+if.then:                                          ; preds = %entry
+  call void @llvm.pseudoprobe(i64 -8563147518712133441, i64 2, i32 0, i64 -1), !dbg !58
+    #dbg_value(ptr @A, !48, !DIExpression(), !50)
+  br label %if.end, !dbg !59
+
+if.else:                                          ; preds = %entry
+  call void @llvm.pseudoprobe(i64 -8563147518712133441, i64 3, i32 0, i64 -1), !dbg !60
+    #dbg_value(ptr @B, !48, !DIExpression(), !50)
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  %fp.0 = phi ptr [ @A, %if.then ], [ @B, %if.else ], !dbg !61
+    #dbg_value(ptr %fp.0, !48, !DIExpression(), !50)
+  call void @llvm.pseudoprobe(i64 -8563147518712133441, i64 4, i32 0, i64 -1), !dbg !62
+  %call = call i32 @C(i32 noundef %x), !dbg !63
+  %add = add nsw i32 %x, %call, !dbg !65
+    #dbg_value(i32 %add, !47, !DIExpression(), !50)
+  %call1 = call i32 %fp.0(i32 noundef %add), !dbg !66
+  %add2 = add nsw i32 %add, %call1, !dbg !68
+    #dbg_value(i32 %add2, !47, !DIExpression(), !50)
+  %call3 = call i32 @B(i32 noundef %add2), !dbg !69
+  %add4 = add nsw i32 %add2, %call3, !dbg !71
+    #dbg_value(i32 %add4, !47, !DIExpression(), !50)
+  %call5 = call i32 @C(i32 noundef %add4), !dbg !72
+  %add6 = add nsw i32 %add4, %call5, !dbg !74
+    #dbg_value(i32 %add6, !47, !DIExpression(), !50)
+  %call7 = call i32 %fp.0(i32 noundef %add6), !dbg !75
+  %add8 = add nsw i32 %add6, %call7, !dbg !77
+    #dbg_value(i32 %add8, !47, !DIExpression(), !50)
+  %call9 = call i32 @C(i32 noundef %add8), !dbg !78
+  %add10 = add nsw i32 %add8, %call9, !dbg !80
+    #dbg_value(i32 %add10, !47, !DIExpression(), !50)
+  ret i32 %add10, !dbg !81
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #3
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #3
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite)
+declare void @llvm.pseudoprobe(i64, i64, i32, i64) #4
+
+attributes #0 = { nounwind uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-sample-profile" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn }
+attributes #2 = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn }
+attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3, !4, !5, !6, !7, !8}
+!llvm.ident = !{!9}
+!llvm.pseudo_probe_desc = !{!10, !11}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 19.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None)
+!1 = !DIFile(filename: "test.c", directory: "/home/", checksumkind: CSK_MD5, checksum: "be98aa946f37f0ad8d307c9121efe101")
+!2 = !{i32 7, !"Dwarf Version", i32 5}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 4}
+!5 = !{i32 8, !"PIC Level", i32 2}
+!6 = !{i32 7, !"PIE Level", i32 2}
+!7 = !{i32 7, !"uwtable", i32 2}
+!8 = !{i32 7, !"debug-info-assignment-tracking", i1 true}
+!9 = !{!"clang version 19.0.0"}
+!10 = !{i64 -4364451034228175269, i64 1970329131941887, !"test_direct_call"}
+!11 = !{i64 -8563147518712133441, i64 1688922477484692, !"test_indirect_call"}
+!12 = distinct !DISubprogram(name: "test_direct_call", scope: !1, file: !1, line: 10, type: !13, scopeLine: 10, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !16)
+!13 = !DISubroutineType(types: !14)
+!14 = !{!15, !15}
+!15 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!16 = !{!17}
+!17 = !DILocalVariable(name: "x", arg: 1, scope: !12, file: !1, line: 10, type: !15)
+!18 = !DILocation(line: 0, scope: !12)
+!19 = !DILocation(line: 11, column: 10, scope: !12)
+!20 = !DILocation(line: 11, column: 8, scope: !21)
+!21 = !DILexicalBlockFile(scope: !12, file: !1, discriminator: 186646551)
+!22 = !DILocation(line: 11, column: 5, scope: !12)
+!23 = !DILocation(line: 12, column: 8, scope: !24)
+!24 = !DILexicalBlockFile(scope: !12, file: !1, discriminator: 186646559)
+!25 = !DILocation(line: 12, column: 5, scope: !12)
+!26 = !DILocation(line: 13, column: 8, scope: !27)
+!27 = !DILexicalBlockFile(scope: !12, file: !1, discriminator: 186646567)
+!28 = !DILocation(line: 13, column: 5, scope: !12)
+!29 = !DILocation(line: 14, column: 8, scope: !30)
+!30 = !DILexicalBlockFile(scope: !12, file: !1, discriminator: 186646575)
+!31 = !DILocation(line: 14, column: 5, scope: !12)
+!32 = !DILocation(line: 15, column: 8, scope: !33)
+!33 = !DILexicalBlockFile(scope: !12, file: !1, discriminator: 186646583)
+!34 = !DILocation(line: 15, column: 5, scope: !12)
+!35 = !DILocation(line: 16, column: 8, scope: !36)
+!36 = !DILexicalBlockFile(scope: !12, file: !1, discriminator: 186646591)
+!37 = !DILocation(line: 16, column: 5, scope: !12)
+!38 = !DILocation(line: 17, column: 8, scope: !39)
+!39 = !DILexicalBlockFile(scope: !12, file: !1, discriminator: 186646599)
+!40 = !DILocation(line: 17, column: 5, scope: !12)
+!41 = !DILocation(line: 18, column: 3, scope: !12)
+!42 = !DISubprogram(name: "A", scope: !1, file: !1, line: 2, type: !13, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!43 = !DISubprogram(name: "B", scope: !1, file: !1, line: 3, type: !13, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!44 = !DISubprogram(name: "C", scope: !1, file: !1, line: 4, type: !13, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!45 = distinct !DISubprogram(name: "test_indirect_call", scope: !1, file: !1, line: 21, type: !13, scopeLine: 21, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !46)
+!46 = !{!47, !48}
+!47 = !DILocalVariable(name: "x", arg: 1, scope: !45, file: !1, line: 21, type: !15)
+!48 = !DILocalVariable(name: "fp", scope: !45, file: !1, line: 22, type: !49)
+!49 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !13, size: 64)
+!50 = !DILocation(line: 0, scope: !45)
+!51 = !DILocation(line: 23, column: 6, scope: !52)
+!52 = distinct !DILexicalBlock(scope: !45, file: !1, line: 23, column: 6)
+!53 = !{!54, !54, i64 0}
+!54 = !{!"int", !55, i64 0}
+!55 = !{!"omnipotent char", !56, i64 0}
+!56 = !{!"Simple C/C++ TBAA"}
+!57 = !DILocation(line: 23, column: 6, scope: !45)
+!58 = !DILocation(line: 24, column: 8, scope: !52)
+!59 = !DILocation(line: 24, column: 5, scope: !52)
+!60 = !DILocation(line: 26, column: 8, scope: !52)
+!61 = !DILocation(line: 0, scope: !52)
+!62 = !DILocation(line: 27, column: 10, scope: !45)
+!63 = !DILocation(line: 27, column: 8, scope: !64)
+!64 = !DILexicalBlockFile(scope: !45, file: !1, discriminator: 186646575)
+!65 = !DILocation(line: 27, column: 5, scope: !45)
+!66 = !DILocation(line: 28, column: 8, scope: !67)
+!67 = !DILexicalBlockFile(scope: !45, file: !1, discriminator: 119537719)
+!68 = !DILocation(line: 28, column: 5, scope: !45)
+!69 = !DILocation(line: 29, column: 8, scope: !70)
+!70 = !DILexicalBlockFile(scope: !45, file: !1, discriminator: 186646591)
+!71 = !DILocation(line: 29, column: 5, scope: !45)
+!72 = !DILocation(line: 30, column: 8, scope: !73)
+!73 = !DILexicalBlockFile(scope: !45, file: !1, discriminator: 186646599)
+!74 = !DILocation(line: 30, column: 5, scope: !45)
+!75 = !DILocation(line: 31, column: 8, scope: !76)
+!76 = !DILexicalBlockFile(scope: !45, file: !1, discriminator: 119537743)
+!77 = !DILocation(line: 31, column: 5, scope: !45)
+!78 = !DILocation(line: 32, column: 8, scope: !79)
+!79 = !DILexicalBlockFile(scope: !45, file: !1, discriminator: 186646615)
+!80 = !DILocation(line: 32, column: 5, scope: !45)
+!81 = !DILocation(line: 33, column: 3, scope: !45)

diff  --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-matching.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-matching.ll
index 0d471e43d2a72..20be0c2fec7f2 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-matching.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-matching.ll
@@ -86,7 +86,7 @@
 ; CHECK:    3:  call void @llvm.pseudoprobe(i64 6699318081062747564, i64 3, i32 0, i64 -1), !dbg ![[#]] - weight: 13 - factor: 1.00)
 ; CHECK:    6:  %call1.i5 = call i32 @bar(i32 noundef %add.i4), !dbg ![[#]] - weight: 13 - factor: 1.00)
 ; CHECK:    4:  call void @llvm.pseudoprobe(i64 6699318081062747564, i64 4, i32 0, i64 -1), !dbg ![[#]] - weight: 112 - factor: 1.00)
-; CHECK:    14:  %call2 = call i32 @bar(i32 noundef %3), !dbg ![[#]] - weight: 124 - factor: 1.00)
+; CHECK:    14: %call2 = call i32 @bar(i32 noundef %3), !dbg ![[#]] - weight: 124 - factor: 1.00)
 ; CHECK:    8:  call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 8, i32 0, i64 -1), !dbg ![[#]] - weight: 0 - factor: 1.00)
 ; CHECK:    1:  call void @llvm.pseudoprobe(i64 6699318081062747564, i64 1, i32 0, i64 -1), !dbg ![[#]] - weight: 117 - factor: 1.00)
 ; CHECK:    2:  call void @llvm.pseudoprobe(i64 6699318081062747564, i64 2, i32 0, i64 -1), !dbg ![[#]] - weight: 104 - factor: 1.00)


        


More information about the llvm-commits mailing list