[llvm] dc3f8c2 - [memprof] Improve deserialization performance in V3 (#94787)

via llvm-commits llvm-commits at lists.llvm.org
Fri Jun 7 17:26:01 PDT 2024


Author: Kazu Hirata
Date: 2024-06-07T17:25:57-07:00
New Revision: dc3f8c2f587e9647d1ce86426c2cf317c98f453c

URL: https://github.com/llvm/llvm-project/commit/dc3f8c2f587e9647d1ce86426c2cf317c98f453c
DIFF: https://github.com/llvm/llvm-project/commit/dc3f8c2f587e9647d1ce86426c2cf317c98f453c.diff

LOG: [memprof] Improve deserialization performance in V3 (#94787)

We call llvm::sort in a couple of places in the V3 encoding:

- We sort Frames by FrameIds for stability of the output.

- We sort call stacks in the dictionary order to maximize the length
  of the common prefix between adjacent call stacks.

It turns out that we can improve the deserialization performance by
modifying the comparison functions -- without changing the format at
all.  Both places take advantage of the histogram of Frames -- how
many times each Frame occurs in the call stacks.
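
Concretely, the histogram records, for each FrameId, its number of
occurrences across all call stacks plus the sum of the positions at which
it occurs; the diff below adds this as memprof::computeFrameHistogram.
Below is a minimal sketch of that computation, using plain
standard-library containers as stand-ins for the LLVM types (FrameId and
CallStackId stand in for the llvm::memprof typedefs; computeHistogram is
an illustrative name, not the one in the patch):

  #include <cstdint>
  #include <map>
  #include <vector>

  using FrameId = uint64_t;      // stand-in for llvm::memprof::FrameId
  using CallStackId = uint64_t;  // stand-in for llvm::memprof::CallStackId

  struct FrameStat {
    uint64_t Count = 0;        // number of occurrences of a FrameId
    uint64_t PositionSum = 0;  // sum of indexes where the FrameId shows up
  };

  std::map<FrameId, FrameStat>
  computeHistogram(const std::map<CallStackId, std::vector<FrameId>> &CallStacks) {
    std::map<FrameId, FrameStat> Histogram;
    for (const auto &[CSId, CS] : CallStacks)
      for (unsigned I = 0, E = CS.size(); I != E; ++I) {
        FrameStat &S = Histogram[CS[I]];
        ++S.Count;           // popularity of this frame
        S.PositionSum += I;  // I is the index within the stored call stack
      }                      // (smaller sum = tends to appear earlier)
    return Histogram;
  }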

- Frames: We serialize Frames in descending order of popularity for
  improved cache locality.  For two equally popular Frames, we break a
  tie by serializing the one that tends to appear
  earlier in call stacks.  Here, "earlier" means a smaller index
  within llvm::SmallVector<FrameId>.

- Call Stacks: We sort the call stacks to reduce the number of times
  we follow pointers to parents during deserialization.  Specifically,
  instead of comparing two call stacks in the strcmp style -- with integer
  comparisons of FrameIds -- we compare the FrameIds F1 and F2 at
  corresponding positions with Histogram[F1] < Histogram[F2].  Since we
  encode from the end of the sorted list of call stacks, we tend to
  encode popular call stacks first.

Since both sort sites rely on the same histogram, we compute it once and
share it between them.
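
For reference, here is a condensed, self-contained sketch of the two
comparison functions, building on the histogram sketch above and using
std::sort in place of llvm::sort; the real code lives in
writeMemProfFrameArray and CallStackRadixTreeBuilder::build in the diff
below, and the names sortForSerialization, Frame, and FramePair are
illustrative stand-ins only:

  #include <algorithm>
  #include <utility>

  struct Frame {};  // stand-in for llvm::memprof::Frame
  using FramePair = std::pair<FrameId, const Frame *>;
  using CSIdPair = std::pair<CallStackId, std::vector<FrameId>>;

  void sortForSerialization(std::vector<FramePair> &FrameIdOrder,
                            std::vector<CSIdPair> &CallStacks,
                            std::map<FrameId, FrameStat> &Histogram) {
    // Frames: most popular first; among equally popular Frames, prefer the
    // one that tends to appear earlier in call stacks (smaller PositionSum);
    // finally fall back to FrameId for a stable output.
    std::sort(FrameIdOrder.begin(), FrameIdOrder.end(),
              [&](const FramePair &L, const FramePair &R) {
                const FrameStat &SL = Histogram[L.first];
                const FrameStat &SR = Histogram[R.first];
                if (SL.Count != SR.Count)
                  return SL.Count > SR.Count;
                if (SL.PositionSum != SR.PositionSum)
                  return SL.PositionSum < SR.PositionSum;
                return L.first < R.first;
              });

    // Call stacks: lexicographic from the root (stacks are stored leaf
    // first, hence the reverse iterators), ranking frames by popularity
    // rather than by FrameId value.  Popular frames sort later because the
    // radix tree is encoded from the end of the sorted list, so popular
    // call stacks are encoded first.
    std::sort(CallStacks.begin(), CallStacks.end(),
              [&](const CSIdPair &L, const CSIdPair &R) {
                return std::lexicographical_compare(
                    L.second.rbegin(), L.second.rend(),
                    R.second.rbegin(), R.second.rend(),
                    [&](FrameId F1, FrameId F2) {
                      uint64_t H1 = Histogram[F1].Count;
                      uint64_t H2 = Histogram[F2].Count;
                      return H1 != H2 ? H1 < H2 : F1 < F2;
                    });
              });
  }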

Sorting the call stacks reduces the number of "jumps" by 74% when we
deserialize all MemProfRecords.  The cycle and instruction counts go
down by 10% and 1.5%, respectively.

If we sort the Frames in addition to the call stacks, then the cycle
and instruction counts go down by 14% and 1.6%, respectively, relative
to the same baseline (that is, without this patch).

Added: 
    

Modified: 
    llvm/include/llvm/ProfileData/MemProf.h
    llvm/lib/ProfileData/InstrProfWriter.cpp
    llvm/lib/ProfileData/MemProf.cpp
    llvm/unittests/ProfileData/MemProfTest.cpp

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/ProfileData/MemProf.h b/llvm/include/llvm/ProfileData/MemProf.h
index 0e4bb9cc3c6cb..8f5ba9c333320 100644
--- a/llvm/include/llvm/ProfileData/MemProf.h
+++ b/llvm/include/llvm/ProfileData/MemProf.h
@@ -932,6 +932,18 @@ struct IndexedMemProfData {
   llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>> CallStackData;
 };
 
+struct FrameStat {
+  // The number of occurrences of a given FrameId.
+  uint64_t Count = 0;
+  // The sum of indexes where a given FrameId shows up.
+  uint64_t PositionSum = 0;
+};
+
+// Compute a histogram of Frames in call stacks.
+llvm::DenseMap<FrameId, FrameStat>
+computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
+                          &MemProfCallStackData);
+
 // Construct a radix tree of call stacks.
 //
 // A set of call stacks might look like:
@@ -1027,9 +1039,11 @@ class CallStackRadixTreeBuilder {
   CallStackRadixTreeBuilder() = default;
 
   // Build a radix tree array.
-  void build(llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
-                 &&MemProfCallStackData,
-             const llvm::DenseMap<FrameId, LinearFrameId> &MemProfFrameIndexes);
+  void
+  build(llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
+            &&MemProfCallStackData,
+        const llvm::DenseMap<FrameId, LinearFrameId> &MemProfFrameIndexes,
+        llvm::DenseMap<memprof::FrameId, memprof::FrameStat> &FrameHistogram);
 
   const std::vector<LinearFrameId> &getRadixArray() const { return RadixArray; }
 

diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp
index 7d7c980a9e11f..1a9add109a360 100644
--- a/llvm/lib/ProfileData/InstrProfWriter.cpp
+++ b/llvm/lib/ProfileData/InstrProfWriter.cpp
@@ -494,17 +494,40 @@ static uint64_t writeMemProfFrames(
 static llvm::DenseMap<memprof::FrameId, memprof::LinearFrameId>
 writeMemProfFrameArray(
     ProfOStream &OS,
-    llvm::MapVector<memprof::FrameId, memprof::Frame> &MemProfFrameData) {
+    llvm::MapVector<memprof::FrameId, memprof::Frame> &MemProfFrameData,
+    llvm::DenseMap<memprof::FrameId, memprof::FrameStat> &FrameHistogram) {
   // Mappings from FrameIds to array indexes.
   llvm::DenseMap<memprof::FrameId, memprof::LinearFrameId> MemProfFrameIndexes;
 
-  // Sort the FrameIDs for stability.
+  // Compute the order in which we serialize Frames.  The order does not matter
+  // in terms of correctness, but we still compute it for deserialization
+  // performance.  Specifically, if we serialize frequently used Frames one
+  // after another, we have better cache utilization.  For two Frames that
+  // appear equally frequently, we break a tie by serializing the one that tends
+  // to appear earlier in call stacks.  We implement the tie-breaking mechanism
+  // by computing the sum of indexes within call stacks for each Frame.  If we
+  // still have a tie, then we resort to comparing the two FrameIds, purely
+  // for stability of the output.
   std::vector<std::pair<memprof::FrameId, const memprof::Frame *>> FrameIdOrder;
   FrameIdOrder.reserve(MemProfFrameData.size());
   for (const auto &[Id, Frame] : MemProfFrameData)
     FrameIdOrder.emplace_back(Id, &Frame);
   assert(MemProfFrameData.size() == FrameIdOrder.size());
-  llvm::sort(FrameIdOrder);
+  llvm::sort(FrameIdOrder,
+             [&](const std::pair<memprof::FrameId, const memprof::Frame *> &L,
+                 const std::pair<memprof::FrameId, const memprof::Frame *> &R) {
+               const auto &SL = FrameHistogram[L.first];
+               const auto &SR = FrameHistogram[R.first];
+               // Popular FrameIds should come first.
+               if (SL.Count != SR.Count)
+                 return SL.Count > SR.Count;
+               // If they are equally popular, then the one that tends to appear
+               // earlier in call stacks should come first.
+               if (SL.PositionSum != SR.PositionSum)
+                 return SL.PositionSum < SR.PositionSum;
+               // Compare their FrameIds for sort stability.
+               return L.first < R.first;
+             });
 
   // Serialize all frames while creating mappings from linear IDs to FrameIds.
   uint64_t Index = 0;
@@ -543,12 +566,14 @@ writeMemProfCallStackArray(
     llvm::MapVector<memprof::CallStackId, llvm::SmallVector<memprof::FrameId>>
         &MemProfCallStackData,
     llvm::DenseMap<memprof::FrameId, memprof::LinearFrameId>
-        &MemProfFrameIndexes) {
+        &MemProfFrameIndexes,
+    llvm::DenseMap<memprof::FrameId, memprof::FrameStat> &FrameHistogram) {
   llvm::DenseMap<memprof::CallStackId, memprof::LinearCallStackId>
       MemProfCallStackIndexes;
 
   memprof::CallStackRadixTreeBuilder Builder;
-  Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes);
+  Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes,
+                FrameHistogram);
   for (auto I : Builder.getRadixArray())
     OS.write32(I);
   MemProfCallStackIndexes = Builder.takeCallStackPos();
@@ -704,13 +729,17 @@ static Error writeMemProfV3(ProfOStream &OS,
     Schema = memprof::getFullSchema();
   writeMemProfSchema(OS, Schema);
 
+  llvm::DenseMap<memprof::FrameId, memprof::FrameStat> FrameHistogram =
+      memprof::computeFrameHistogram(MemProfData.CallStackData);
+  assert(MemProfData.FrameData.size() == FrameHistogram.size());
+
   llvm::DenseMap<memprof::FrameId, memprof::LinearFrameId> MemProfFrameIndexes =
-      writeMemProfFrameArray(OS, MemProfData.FrameData);
+      writeMemProfFrameArray(OS, MemProfData.FrameData, FrameHistogram);
 
   uint64_t CallStackPayloadOffset = OS.tell();
   llvm::DenseMap<memprof::CallStackId, memprof::LinearCallStackId>
       MemProfCallStackIndexes = writeMemProfCallStackArray(
-          OS, MemProfData.CallStackData, MemProfFrameIndexes);
+          OS, MemProfData.CallStackData, MemProfFrameIndexes, FrameHistogram);
 
   uint64_t RecordPayloadOffset = OS.tell();
   uint64_t RecordTableOffset =

diff --git a/llvm/lib/ProfileData/MemProf.cpp b/llvm/lib/ProfileData/MemProf.cpp
index 620e2e2d71a0f..8e3053748c087 100644
--- a/llvm/lib/ProfileData/MemProf.cpp
+++ b/llvm/lib/ProfileData/MemProf.cpp
@@ -486,7 +486,8 @@ LinearCallStackId CallStackRadixTreeBuilder::encodeCallStack(
 void CallStackRadixTreeBuilder::build(
     llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
         &&MemProfCallStackData,
-    const llvm::DenseMap<FrameId, LinearFrameId> &MemProfFrameIndexes) {
+    const llvm::DenseMap<FrameId, LinearFrameId> &MemProfFrameIndexes,
+    llvm::DenseMap<memprof::FrameId, memprof::FrameStat> &FrameHistogram) {
   // Take the vector portion of MemProfCallStackData.  The vector is exactly
   // what we need to sort.  Also, we no longer need its lookup capability.
   llvm::SmallVector<CSIdPair, 0> CallStacks = MemProfCallStackData.takeVector();
@@ -498,14 +499,56 @@ void CallStackRadixTreeBuilder::build(
     return;
   }
 
-  // Sort the list of call stacks in the dictionary order to maximize the length
-  // of the common prefix between two adjacent call stacks.
+  // Sorting the list of call stacks in the dictionary order is sufficient to
+  // maximize the length of the common prefix between two adjacent call stacks
+  // and thus minimize the length of RadixArray.  However, we go one step
+  // further and try to reduce the number of times we follow pointers to parents
+  // during deserialization.  Consider a poorly encoded radix tree:
+  //
+  // CallStackId 1:  f1 -> f2 -> f3
+  //                  |
+  // CallStackId 2:   +--- f4 -> f5
+  //                        |
+  // CallStackId 3:         +--> f6
+  //
+  // Here, f2 and f4 appear once and twice, respectively, in the call stacks.
+  // Once we encode CallStackId 1 into RadixArray, every other call stack with
+  // common prefix f1 ends up pointing to CallStackId 1.  Since CallStackId 3
+  // shares "f1 f4" with CallStackId 2, CallStackId 3 needs to follow pointers to
+  // parents twice.
+  //
+  // We try to alleviate the situation by sorting the list of call stacks by
+  // comparing the popularity of frames rather than the integer values of
+  // FrameIds.  In the example above, f4 is more popular than f2, so we sort the
+  // call stacks and encode them as:
+  //
+  // CallStackId 2:  f1 -- f4 -> f5
+  //                  |     |
+  // CallStackId 3:   |     +--> f6
+  //                  |
+  // CallStackId 1:   +--> f2 -> f3
+  //
+  // Notice that CallStackId 3 follows a pointer to a parent only once.
+  //
+  // All this is a quick-n-dirty trick to reduce the number of jumps.  The
+  // proper way would be to compute the weight of each radix tree node -- how
+  // many call stacks use a given radix tree node, and encode a radix tree from
+  // the heaviest node first.  We do not do so because that's a lot of work.
   llvm::sort(CallStacks, [&](const CSIdPair &L, const CSIdPair &R) {
     // Call stacks are stored from leaf to root.  Perform comparisons from the
     // root.
     return std::lexicographical_compare(
         L.second.rbegin(), L.second.rend(), R.second.rbegin(), R.second.rend(),
-        [&](FrameId F1, FrameId F2) { return F1 < F2; });
+        [&](FrameId F1, FrameId F2) {
+          uint64_t H1 = FrameHistogram[F1].Count;
+          uint64_t H2 = FrameHistogram[F2].Count;
+          // Popular frames should come later because we encode call stacks from
+          // the last one in the list.
+          if (H1 != H2)
+            return H1 < H2;
+          // For sort stability.
+          return F1 < F2;
+        });
   });
 
   // Reserve some reasonable amount of storage.
@@ -569,6 +612,22 @@ void CallStackRadixTreeBuilder::build(
     V = RadixArray.size() - 1 - V;
 }
 
+llvm::DenseMap<FrameId, FrameStat>
+computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
+                          &MemProfCallStackData) {
+  llvm::DenseMap<FrameId, FrameStat> Histogram;
+
+  for (const auto &KV : MemProfCallStackData) {
+    const auto &CS = KV.second;
+    for (unsigned I = 0, E = CS.size(); I != E; ++I) {
+      auto &S = Histogram[CS[I]];
+      ++S.Count;
+      S.PositionSum += I;
+    }
+  }
+  return Histogram;
+}
+
 void verifyIndexedMemProfRecord(const IndexedMemProfRecord &Record) {
   for (const auto &AS : Record.AllocSites) {
     assert(AS.CSId == hashCallStack(AS.CallStack));

diff --git a/llvm/unittests/ProfileData/MemProfTest.cpp b/llvm/unittests/ProfileData/MemProfTest.cpp
index 26421200e1a11..15eb59ee00c94 100644
--- a/llvm/unittests/ProfileData/MemProfTest.cpp
+++ b/llvm/unittests/ProfileData/MemProfTest.cpp
@@ -667,8 +667,12 @@ TEST(MemProf, MissingFrameId) {
 TEST(MemProf, RadixTreeBuilderEmpty) {
   llvm::DenseMap<FrameId, llvm::memprof::LinearFrameId> MemProfFrameIndexes;
   llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>> MemProfCallStackData;
+  llvm::DenseMap<llvm::memprof::FrameId, llvm::memprof::FrameStat>
+      FrameHistogram =
+          llvm::memprof::computeFrameHistogram(MemProfCallStackData);
   llvm::memprof::CallStackRadixTreeBuilder Builder;
-  Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes);
+  Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes,
+                FrameHistogram);
   ASSERT_THAT(Builder.getRadixArray(), testing::IsEmpty());
   const auto Mappings = Builder.takeCallStackPos();
   ASSERT_THAT(Mappings, testing::IsEmpty());
@@ -681,8 +685,12 @@ TEST(MemProf, RadixTreeBuilderOne) {
   llvm::SmallVector<llvm::memprof::FrameId> CS1 = {13, 12, 11};
   llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>> MemProfCallStackData;
   MemProfCallStackData.insert({llvm::memprof::hashCallStack(CS1), CS1});
+  llvm::DenseMap<llvm::memprof::FrameId, llvm::memprof::FrameStat>
+      FrameHistogram =
+          llvm::memprof::computeFrameHistogram(MemProfCallStackData);
   llvm::memprof::CallStackRadixTreeBuilder Builder;
-  Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes);
+  Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes,
+                FrameHistogram);
   EXPECT_THAT(Builder.getRadixArray(), testing::ElementsAreArray({
                                            3U, // Size of CS1,
                                            3U, // MemProfFrameIndexes[13]
@@ -704,8 +712,12 @@ TEST(MemProf, RadixTreeBuilderTwo) {
   llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>> MemProfCallStackData;
   MemProfCallStackData.insert({llvm::memprof::hashCallStack(CS1), CS1});
   MemProfCallStackData.insert({llvm::memprof::hashCallStack(CS2), CS2});
+  llvm::DenseMap<llvm::memprof::FrameId, llvm::memprof::FrameStat>
+      FrameHistogram =
+          llvm::memprof::computeFrameHistogram(MemProfCallStackData);
   llvm::memprof::CallStackRadixTreeBuilder Builder;
-  Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes);
+  Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes,
+                FrameHistogram);
   EXPECT_THAT(Builder.getRadixArray(),
               testing::ElementsAreArray({
                   2U,                        // Size of CS1
@@ -738,8 +750,12 @@ TEST(MemProf, RadixTreeBuilderSuccessiveJumps) {
   MemProfCallStackData.insert({llvm::memprof::hashCallStack(CS2), CS2});
   MemProfCallStackData.insert({llvm::memprof::hashCallStack(CS3), CS3});
   MemProfCallStackData.insert({llvm::memprof::hashCallStack(CS4), CS4});
+  llvm::DenseMap<llvm::memprof::FrameId, llvm::memprof::FrameStat>
+      FrameHistogram =
+          llvm::memprof::computeFrameHistogram(MemProfCallStackData);
   llvm::memprof::CallStackRadixTreeBuilder Builder;
-  Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes);
+  Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes,
+                FrameHistogram);
   EXPECT_THAT(Builder.getRadixArray(),
               testing::ElementsAreArray({
                   4U,                        // Size of CS1


        

