[llvm] [InstrProf] Evaluate function order using test traces (PR #92451)

Mon May 20 09:42:44 PDT 2024

https://github.com/ellishg updated https://github.com/llvm/llvm-project/pull/92451

>From 39b7dcaf3c26ca200dfb0010f8424126205c9899 Mon Sep 17 00:00:00 2001
From: Ellis Hoag <ellis.sparky.hoag at gmail.com>
Date: Thu, 16 May 2024 11:43:10 -0700
Subject: [PATCH 1/2] [InstrProf] Evaluate function order using test traces

The `llvm-profdata order` command is used to compute a function order
using traces from the input profile. Add the `--num-test-traces` flag to
keep aside N traces to evalute this order. These test traces are assumed
to be the actual function execution order in some experiment. The output
is a number that represents how many page faults we got. Lower is
better.

I tested on a large profile I already had.
```
llvm-profdata order default.profdata --num-test-traces=30
\# Ordered 149103 functions
\# Total area under the page fault curve: 2.271827e+09
...
```

I also improved `TemporalProfTraceTy::createBPFunctionNodes()` in a few
ways:
* Simplified how `UN`s are computed
* Output vector is not an aliased argument instead of a return

These changes improved the evaluation in my test.
```
llvm-profdata order default.profdata --num-test-traces=30
\# Ordered 149103 functions
\# Total area under the page fault curve: 2.268586e+09
...
```
---
 llvm/include/llvm/ProfileData/InstrProf.h     |  4 +-
 llvm/lib/ProfileData/InstrProf.cpp            | 73 +++++++++++--------
 .../llvm-profdata/show-order-error.proftext   | 27 +++++++
 .../tools/llvm-profdata/show-order.proftext   | 11 ++-
 llvm/tools/llvm-profdata/llvm-profdata.cpp    | 42 ++++++++++-
 .../ProfileData/BPFunctionNodeTest.cpp        | 31 ++++----
 6 files changed, 132 insertions(+), 56 deletions(-)
 create mode 100644 llvm/test/tools/llvm-profdata/show-order-error.proftext

diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h
index 88c7fe425b5a5..b06ec5283a78b 100644
--- a/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/llvm/include/llvm/ProfileData/InstrProf.h
@@ -385,8 +385,8 @@ struct TemporalProfTraceTy {
   /// Use a set of temporal profile traces to create a list of balanced
   /// partitioning function nodes used by BalancedPartitioning to generate a
   /// function order that reduces page faults during startup
-  static std::vector<BPFunctionNode>
-  createBPFunctionNodes(ArrayRef<TemporalProfTraceTy> Traces);
+  static void createBPFunctionNodes(ArrayRef<TemporalProfTraceTy> Traces,
+                                    std::vector<BPFunctionNode> &Nodes);
 };
 
 inline std::error_code make_error_code(instrprof_error E) {
diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp
index 806d01de1ada5..220d7e01f7ccd 100644
--- a/llvm/lib/ProfileData/InstrProf.cpp
+++ b/llvm/lib/ProfileData/InstrProf.cpp
@@ -1002,46 +1002,57 @@ void InstrProfRecord::addValueData(uint32_t ValueKind, uint32_t Site,
     ValueSites.emplace_back(VData, VData + N);
 }
 
-std::vector<BPFunctionNode> TemporalProfTraceTy::createBPFunctionNodes(
-    ArrayRef<TemporalProfTraceTy> Traces) {
+void TemporalProfTraceTy::createBPFunctionNodes(
+    ArrayRef<TemporalProfTraceTy> Traces, std::vector<BPFunctionNode> &Nodes) {
   using IDT = BPFunctionNode::IDT;
   using UtilityNodeT = BPFunctionNode::UtilityNodeT;
-  // Collect all function IDs ordered by their smallest timestamp. This will be
-  // used as the initial FunctionNode order.
-  SetVector<IDT> FunctionIds;
-  size_t LargestTraceSize = 0;
-  for (auto &Trace : Traces)
-    LargestTraceSize =
-        std::max(LargestTraceSize, Trace.FunctionNameRefs.size());
-  for (size_t Timestamp = 0; Timestamp < LargestTraceSize; Timestamp++)
-    for (auto &Trace : Traces)
-      if (Timestamp < Trace.FunctionNameRefs.size())
-        FunctionIds.insert(Trace.FunctionNameRefs[Timestamp]);
-
-  const int N = Log2_64(LargestTraceSize) + 1;
-
+  UtilityNodeT MaxUN = 0;
+  DenseMap<IDT, size_t> IdToFirstTimestamp;
+  DenseMap<IDT, UtilityNodeT> IdToFirstUN;
+  DenseMap<IDT, SmallVector<UtilityNodeT>> IdToUNs;
   // TODO: We need to use the Trace.Weight field to give more weight to more
   // important utilities
-  DenseMap<IDT, SmallVector<UtilityNodeT, 4>> FuncGroups;
-  for (size_t TraceIdx = 0; TraceIdx < Traces.size(); TraceIdx++) {
-    auto &Trace = Traces[TraceIdx].FunctionNameRefs;
-    for (size_t Timestamp = 0; Timestamp < Trace.size(); Timestamp++) {
-      for (int I = Log2_64(Timestamp + 1); I < N; I++) {
-        auto FunctionId = Trace[Timestamp];
-        UtilityNodeT GroupId = TraceIdx * N + I;
-        FuncGroups[FunctionId].push_back(GroupId);
+  for (auto &Trace : Traces) {
+    size_t CutoffTimestamp = 1;
+    for (size_t Timestamp = 0; Timestamp < Trace.FunctionNameRefs.size();
+         Timestamp++) {
+      IDT Id = Trace.FunctionNameRefs[Timestamp];
+      auto [It, WasInserted] = IdToFirstTimestamp.try_emplace(Id, Timestamp);
+      if (!WasInserted)
+        It->getSecond() = std::min<size_t>(It->getSecond(), Timestamp);
+      if (Timestamp >= CutoffTimestamp) {
+        ++MaxUN;
+        CutoffTimestamp = 2 * Timestamp;
       }
+      IdToFirstUN.try_emplace(Id, MaxUN);
     }
+    for (auto &[Id, FirstUN] : IdToFirstUN)
+      for (auto UN = FirstUN; UN <= MaxUN; ++UN)
+        IdToUNs[Id].push_back(UN);
+    ++MaxUN;
+    IdToFirstUN.clear();
   }
 
-  std::vector<BPFunctionNode> Nodes;
-  for (auto Id : FunctionIds) {
-    auto &UNs = FuncGroups[Id];
-    llvm::sort(UNs);
-    UNs.erase(std::unique(UNs.begin(), UNs.end()), UNs.end());
+  DenseMap<UtilityNodeT, unsigned> UNFrequency;
+  for (auto &[Id, UNs] : IdToUNs)
+    for (auto &UN : UNs)
+      ++UNFrequency[UN];
+  // Filter out rare and common utility nodes to make BalancedPartitioning more
+  // effective.
+  for (auto &[Id, UNs] : IdToUNs)
+    llvm::erase_if(UNs, [&](auto &UN) {
+      return UNFrequency[UN] <= 1 || 2 * UNFrequency[UN] > IdToUNs.size();
+    });
+
+  for (auto &[Id, UNs] : IdToUNs)
     Nodes.emplace_back(Id, UNs);
-  }
-  return Nodes;
+
+  // Since BalancedPartitioning is sensitive to the initial order, we explicitly
+  // order nodes by their earliest timestamp.
+  llvm::sort(Nodes, [&](auto &L, auto &R) {
+    return std::make_pair(IdToFirstTimestamp[L.Id], L.Id) <
+           std::make_pair(IdToFirstTimestamp[R.Id], R.Id);
+  });
 }
 
 #define INSTR_PROF_COMMON_API_IMPL
diff --git a/llvm/test/tools/llvm-profdata/show-order-error.proftext b/llvm/test/tools/llvm-profdata/show-order-error.proftext
new file mode 100644
index 0000000000000..633f1a9949b6f
--- /dev/null
+++ b/llvm/test/tools/llvm-profdata/show-order-error.proftext
@@ -0,0 +1,27 @@
+# RUN: not llvm-profdata order %s --num-test-traces=10 2>&1 | FileCheck %s
+
+# CHECK: --num-test-traces must be smaller than the total number of traces
+
+# Header
+:ir
+:temporal_prof_traces
+# Num Traces
+1
+# Trace Stream Size:
+1
+# Weight
+1
+a, b
+
+a
+# Func Hash:
+0x1234
+# Num Counters:
+1
+# Counter Values:
+101
+
+b
+0x5678
+1
+202
diff --git a/llvm/test/tools/llvm-profdata/show-order.proftext b/llvm/test/tools/llvm-profdata/show-order.proftext
index 8ef26847ad77e..28eb1b9b42af7 100644
--- a/llvm/test/tools/llvm-profdata/show-order.proftext
+++ b/llvm/test/tools/llvm-profdata/show-order.proftext
@@ -1,4 +1,6 @@
-# RUN: llvm-profdata order %s | FileCheck %s
+# RUN: llvm-profdata order %s --num-test-traces=1 | FileCheck %s
+
+# CHECK: # Total area under the page fault curve: 4.000000e+00
 
 # CHECK: a
 # CHECK: b
@@ -9,9 +11,9 @@
 :ir
 :temporal_prof_traces
 # Num Traces
-3
+4
 # Trace Stream Size:
-3
+4
 # Weight
 1
 a, main.c:b, c
@@ -21,6 +23,9 @@ a, x, main.c:b, c
 # Weight
 1
 a, main.c:b, c
+# Weight
+1
+a, main.c:b, c, x
 
 a
 # Func Hash:
diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp
index 4126b55576ddc..70294c10d698c 100644
--- a/llvm/tools/llvm-profdata/llvm-profdata.cpp
+++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp
@@ -340,7 +340,7 @@ cl::opt<unsigned long long> OverlapValueCutoff(
         "profile with max count value greater then the parameter value"),
     cl::sub(OverlapSubcommand));
 
-// Options unique to show subcommand.
+// Options specific to show subcommand.
 cl::opt<bool> ShowCounts("counts", cl::init(false),
                          cl::desc("Show counter values for shown functions"),
                          cl::sub(ShowSubcommand));
@@ -439,6 +439,13 @@ cl::opt<bool> ShowProfileVersion("profile-version", cl::init(false),
                                  cl::desc("Show profile version. "),
                                  cl::sub(ShowSubcommand));
 
+// Options specific to order subcommand.
+cl::opt<unsigned> NumTestTraces(
+    "num-test-traces", cl::init(0),
+    cl::desc("Keep aside <num-test-traces> traces when computing the function "
+             "order and instead use them to evaluate that order"),
+    cl::sub(OrderSubcommand));
+
 // We use this string to indicate that there are
 // multiple static functions map to the same name.
 const std::string DuplicateNameStr = "----";
@@ -3278,13 +3285,42 @@ static int order_main() {
     // Read all entries
     (void)I;
   }
-  auto &Traces = Reader->getTemporalProfTraces();
-  auto Nodes = TemporalProfTraceTy::createBPFunctionNodes(Traces);
+  ArrayRef Traces = Reader->getTemporalProfTraces();
+  if (NumTestTraces && NumTestTraces >= Traces.size())
+    exitWithError(
+        "--" + NumTestTraces.ArgStr +
+        " must be smaller than the total number of traces: expected: < " +
+        Twine(Traces.size()) + ", actual: " + Twine(NumTestTraces));
+  ArrayRef TestTraces = Traces.take_back(NumTestTraces);
+  Traces = Traces.drop_back(NumTestTraces);
+
+  std::vector<BPFunctionNode> Nodes;
+  TemporalProfTraceTy::createBPFunctionNodes(Traces, Nodes);
   BalancedPartitioningConfig Config;
   BalancedPartitioning BP(Config);
   BP.run(Nodes);
 
   OS << "# Ordered " << Nodes.size() << " functions\n";
+  if (!TestTraces.empty()) {
+    // Since we don't know the symbol sizes, we assume 32 functions per page.
+    DenseMap<BPFunctionNode::IDT, unsigned> IdToPageNumber;
+    for (auto &Node : Nodes)
+      IdToPageNumber[Node.Id] = IdToPageNumber.size() / 32;
+
+    SmallSet<unsigned, 0> TouchedPages;
+    unsigned Area = 0;
+    for (auto &Trace : TestTraces) {
+      for (auto Id : Trace.FunctionNameRefs) {
+        auto It = IdToPageNumber.find(Id);
+        if (It == IdToPageNumber.end())
+          continue;
+        TouchedPages.insert(It->getSecond());
+        Area += TouchedPages.size();
+      }
+      TouchedPages.clear();
+    }
+    OS << "# Total area under the page fault curve: " << (float)Area << "\n";
+  }
   OS << "# Warning: Mach-O may prefix symbols with \"_\" depending on the "
         "linkage and this output does not take that into account. Some "
         "post-processing may be required before passing to the linker via "
diff --git a/llvm/unittests/ProfileData/BPFunctionNodeTest.cpp b/llvm/unittests/ProfileData/BPFunctionNodeTest.cpp
index 6af6f1bcdc40a..af0104da3341f 100644
--- a/llvm/unittests/ProfileData/BPFunctionNodeTest.cpp
+++ b/llvm/unittests/ProfileData/BPFunctionNodeTest.cpp
@@ -8,7 +8,6 @@
 
 #include "llvm/ProfileData/InstrProf.h"
 #include "llvm/Support/BalancedPartitioning.h"
-#include "llvm/Testing/Support/SupportHelpers.h"
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
@@ -31,22 +30,20 @@ TEST(BPFunctionNodeTest, Basic) {
                        UnorderedElementsAreArray(UNs)));
   };
 
-  auto Nodes = TemporalProfTraceTy::createBPFunctionNodes({
-      TemporalProfTraceTy({0, 1, 2, 3}),
-  });
-  EXPECT_THAT(Nodes,
-              UnorderedElementsAre(NodeIs(0, {0, 1, 2}), NodeIs(1, {1, 2}),
-                                   NodeIs(2, {1, 2}), NodeIs(3, {2})));
-
-  Nodes = TemporalProfTraceTy::createBPFunctionNodes({
-      TemporalProfTraceTy({0, 1, 2, 3, 4}),
-      TemporalProfTraceTy({4, 2}),
-  });
-
-  EXPECT_THAT(Nodes,
-              UnorderedElementsAre(NodeIs(0, {0, 1, 2}), NodeIs(1, {1, 2}),
-                                   NodeIs(2, {1, 2, 4, 5}), NodeIs(3, {2}),
-                                   NodeIs(4, {2, 3, 4, 5})));
+  std::vector<BPFunctionNode> Nodes;
+  TemporalProfTraceTy::createBPFunctionNodes(
+      {TemporalProfTraceTy({0, 1, 2, 3})}, Nodes);
+  EXPECT_THAT(Nodes, UnorderedElementsAre(NodeIs(0, {1}), NodeIs(1, {1}),
+                                          NodeIs(2, {}), NodeIs(3, {})));
+
+  Nodes.clear();
+  TemporalProfTraceTy::createBPFunctionNodes(
+      {TemporalProfTraceTy({0, 1, 2, 3, 4}), TemporalProfTraceTy({4, 2})},
+      Nodes);
+
+  EXPECT_THAT(Nodes, UnorderedElementsAre(NodeIs(0, {1}), NodeIs(1, {1}),
+                                          NodeIs(2, {5}), NodeIs(3, {}),
+                                          NodeIs(4, {5})));
 }
 
 } // end namespace llvm

>From 3a6064767d9504a4db601fbfc0959b1a64fee38d Mon Sep 17 00:00:00 2001
From: Ellis Hoag <ellis.sparky.hoag at gmail.com>
Date: Mon, 20 May 2024 09:14:59 -0700
Subject: [PATCH 2/2] Address review

---
 llvm/include/llvm/ProfileData/InstrProf.h     |  3 ++-
 llvm/lib/ProfileData/InstrProf.cpp            | 25 +++++++++++--------
 llvm/tools/llvm-profdata/llvm-profdata.cpp    | 11 ++++----
 .../ProfileData/BPFunctionNodeTest.cpp        | 20 ++++++++++++---
 4 files changed, 38 insertions(+), 21 deletions(-)

diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h
index b06ec5283a78b..324e6d27b43a6 100644
--- a/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/llvm/include/llvm/ProfileData/InstrProf.h
@@ -386,7 +386,8 @@ struct TemporalProfTraceTy {
   /// partitioning function nodes used by BalancedPartitioning to generate a
   /// function order that reduces page faults during startup
   static void createBPFunctionNodes(ArrayRef<TemporalProfTraceTy> Traces,
-                                    std::vector<BPFunctionNode> &Nodes);
+                                    std::vector<BPFunctionNode> &Nodes,
+                                    bool RemoveOutlierUNs = true);
 };
 
 inline std::error_code make_error_code(instrprof_error E) {
diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp
index 220d7e01f7ccd..de287ee18fb43 100644
--- a/llvm/lib/ProfileData/InstrProf.cpp
+++ b/llvm/lib/ProfileData/InstrProf.cpp
@@ -1003,7 +1003,8 @@ void InstrProfRecord::addValueData(uint32_t ValueKind, uint32_t Site,
 }
 
 void TemporalProfTraceTy::createBPFunctionNodes(
-    ArrayRef<TemporalProfTraceTy> Traces, std::vector<BPFunctionNode> &Nodes) {
+    ArrayRef<TemporalProfTraceTy> Traces, std::vector<BPFunctionNode> &Nodes,
+    bool RemoveOutlierUNs) {
   using IDT = BPFunctionNode::IDT;
   using UtilityNodeT = BPFunctionNode::UtilityNodeT;
   UtilityNodeT MaxUN = 0;
@@ -1033,16 +1034,18 @@ void TemporalProfTraceTy::createBPFunctionNodes(
     IdToFirstUN.clear();
   }
 
-  DenseMap<UtilityNodeT, unsigned> UNFrequency;
-  for (auto &[Id, UNs] : IdToUNs)
-    for (auto &UN : UNs)
-      ++UNFrequency[UN];
-  // Filter out rare and common utility nodes to make BalancedPartitioning more
-  // effective.
-  for (auto &[Id, UNs] : IdToUNs)
-    llvm::erase_if(UNs, [&](auto &UN) {
-      return UNFrequency[UN] <= 1 || 2 * UNFrequency[UN] > IdToUNs.size();
-    });
+  if (RemoveOutlierUNs) {
+    DenseMap<UtilityNodeT, unsigned> UNFrequency;
+    for (auto &[Id, UNs] : IdToUNs)
+      for (auto &UN : UNs)
+        ++UNFrequency[UN];
+    // Filter out utility nodes that are too infrequent or too prevalent to make
+    // BalancedPartitioning more effective.
+    for (auto &[Id, UNs] : IdToUNs)
+      llvm::erase_if(UNs, [&](auto &UN) {
+        return UNFrequency[UN] <= 1 || 2 * UNFrequency[UN] > IdToUNs.size();
+      });
+  }
 
   for (auto &[Id, UNs] : IdToUNs)
     Nodes.emplace_back(Id, UNs);
diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp
index 70294c10d698c..0796dc8f1567e 100644
--- a/llvm/tools/llvm-profdata/llvm-profdata.cpp
+++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp
@@ -440,11 +440,12 @@ cl::opt<bool> ShowProfileVersion("profile-version", cl::init(false),
                                  cl::sub(ShowSubcommand));
 
 // Options specific to order subcommand.
-cl::opt<unsigned> NumTestTraces(
-    "num-test-traces", cl::init(0),
-    cl::desc("Keep aside <num-test-traces> traces when computing the function "
-             "order and instead use them to evaluate that order"),
-    cl::sub(OrderSubcommand));
+cl::opt<unsigned>
+    NumTestTraces("num-test-traces", cl::init(0),
+                  cl::desc("Keep aside the last <num-test-traces> traces in "
+                           "the profile when computing the function order and "
+                           "instead use them to evaluate that order"),
+                  cl::sub(OrderSubcommand));
 
 // We use this string to indicate that there are
 // multiple static functions map to the same name.
diff --git a/llvm/unittests/ProfileData/BPFunctionNodeTest.cpp b/llvm/unittests/ProfileData/BPFunctionNodeTest.cpp
index af0104da3341f..24586b5aa31af 100644
--- a/llvm/unittests/ProfileData/BPFunctionNodeTest.cpp
+++ b/llvm/unittests/ProfileData/BPFunctionNodeTest.cpp
@@ -32,14 +32,26 @@ TEST(BPFunctionNodeTest, Basic) {
 
   std::vector<BPFunctionNode> Nodes;
   TemporalProfTraceTy::createBPFunctionNodes(
-      {TemporalProfTraceTy({0, 1, 2, 3})}, Nodes);
-  EXPECT_THAT(Nodes, UnorderedElementsAre(NodeIs(0, {1}), NodeIs(1, {1}),
-                                          NodeIs(2, {}), NodeIs(3, {})));
+      {TemporalProfTraceTy({0, 1, 2, 3})}, Nodes, /*RemoveOutlierUNs=*/false);
+  // Utility nodes that are too infrequent or too prevalent are filtered out.
+  EXPECT_THAT(Nodes,
+              UnorderedElementsAre(NodeIs(0, {0, 1, 2}), NodeIs(1, {1, 2}),
+                                   NodeIs(2, {2}), NodeIs(3, {2})));
+
+  Nodes.clear();
+  TemporalProfTraceTy::createBPFunctionNodes(
+      {TemporalProfTraceTy({0, 1, 2, 3, 4}), TemporalProfTraceTy({4, 2})},
+      Nodes, /*RemoveOutlierUNs=*/false);
+
+  EXPECT_THAT(Nodes,
+              UnorderedElementsAre(NodeIs(0, {0, 1, 2, 3}),
+                                   NodeIs(1, {1, 2, 3}), NodeIs(2, {2, 3, 5}),
+                                   NodeIs(3, {2, 3}), NodeIs(4, {3, 4, 5})));
 
   Nodes.clear();
   TemporalProfTraceTy::createBPFunctionNodes(
       {TemporalProfTraceTy({0, 1, 2, 3, 4}), TemporalProfTraceTy({4, 2})},
-      Nodes);
+      Nodes, /*RemoveOutlierUNs=*/true);
 
   EXPECT_THAT(Nodes, UnorderedElementsAre(NodeIs(0, {1}), NodeIs(1, {1}),
                                           NodeIs(2, {5}), NodeIs(3, {}),