[llvm] [InstrProf] Evaluate function order using test traces (PR #92451)
Ellis Hoag via llvm-commits
llvm-commits at lists.llvm.org
Mon May 20 09:42:44 PDT 2024
https://github.com/ellishg updated https://github.com/llvm/llvm-project/pull/92451
>From 39b7dcaf3c26ca200dfb0010f8424126205c9899 Mon Sep 17 00:00:00 2001
From: Ellis Hoag <ellis.sparky.hoag at gmail.com>
Date: Thu, 16 May 2024 11:43:10 -0700
Subject: [PATCH 1/2] [InstrProf] Evaluate function order using test traces
The `llvm-profdata order` command is used to compute a function order
using traces from the input profile. Add the `--num-test-traces` flag to
keep aside N traces to evalute this order. These test traces are assumed
to be the actual function execution order in some experiment. The output
is a number that represents how many page faults we got. Lower is
better.
I tested on a large profile I already had.
```
llvm-profdata order default.profdata --num-test-traces=30
\# Ordered 149103 functions
\# Total area under the page fault curve: 2.271827e+09
...
```
I also improved `TemporalProfTraceTy::createBPFunctionNodes()` in a few
ways:
* Simplified how `UN`s are computed
* Output vector is not an aliased argument instead of a return
These changes improved the evaluation in my test.
```
llvm-profdata order default.profdata --num-test-traces=30
\# Ordered 149103 functions
\# Total area under the page fault curve: 2.268586e+09
...
```
---
llvm/include/llvm/ProfileData/InstrProf.h | 4 +-
llvm/lib/ProfileData/InstrProf.cpp | 73 +++++++++++--------
.../llvm-profdata/show-order-error.proftext | 27 +++++++
.../tools/llvm-profdata/show-order.proftext | 11 ++-
llvm/tools/llvm-profdata/llvm-profdata.cpp | 42 ++++++++++-
.../ProfileData/BPFunctionNodeTest.cpp | 31 ++++----
6 files changed, 132 insertions(+), 56 deletions(-)
create mode 100644 llvm/test/tools/llvm-profdata/show-order-error.proftext
diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h
index 88c7fe425b5a5..b06ec5283a78b 100644
--- a/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/llvm/include/llvm/ProfileData/InstrProf.h
@@ -385,8 +385,8 @@ struct TemporalProfTraceTy {
/// Use a set of temporal profile traces to create a list of balanced
/// partitioning function nodes used by BalancedPartitioning to generate a
/// function order that reduces page faults during startup
- static std::vector<BPFunctionNode>
- createBPFunctionNodes(ArrayRef<TemporalProfTraceTy> Traces);
+ static void createBPFunctionNodes(ArrayRef<TemporalProfTraceTy> Traces,
+ std::vector<BPFunctionNode> &Nodes);
};
inline std::error_code make_error_code(instrprof_error E) {
diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp
index 806d01de1ada5..220d7e01f7ccd 100644
--- a/llvm/lib/ProfileData/InstrProf.cpp
+++ b/llvm/lib/ProfileData/InstrProf.cpp
@@ -1002,46 +1002,57 @@ void InstrProfRecord::addValueData(uint32_t ValueKind, uint32_t Site,
ValueSites.emplace_back(VData, VData + N);
}
-std::vector<BPFunctionNode> TemporalProfTraceTy::createBPFunctionNodes(
- ArrayRef<TemporalProfTraceTy> Traces) {
+void TemporalProfTraceTy::createBPFunctionNodes(
+ ArrayRef<TemporalProfTraceTy> Traces, std::vector<BPFunctionNode> &Nodes) {
using IDT = BPFunctionNode::IDT;
using UtilityNodeT = BPFunctionNode::UtilityNodeT;
- // Collect all function IDs ordered by their smallest timestamp. This will be
- // used as the initial FunctionNode order.
- SetVector<IDT> FunctionIds;
- size_t LargestTraceSize = 0;
- for (auto &Trace : Traces)
- LargestTraceSize =
- std::max(LargestTraceSize, Trace.FunctionNameRefs.size());
- for (size_t Timestamp = 0; Timestamp < LargestTraceSize; Timestamp++)
- for (auto &Trace : Traces)
- if (Timestamp < Trace.FunctionNameRefs.size())
- FunctionIds.insert(Trace.FunctionNameRefs[Timestamp]);
-
- const int N = Log2_64(LargestTraceSize) + 1;
-
+ UtilityNodeT MaxUN = 0;
+ DenseMap<IDT, size_t> IdToFirstTimestamp;
+ DenseMap<IDT, UtilityNodeT> IdToFirstUN;
+ DenseMap<IDT, SmallVector<UtilityNodeT>> IdToUNs;
// TODO: We need to use the Trace.Weight field to give more weight to more
// important utilities
- DenseMap<IDT, SmallVector<UtilityNodeT, 4>> FuncGroups;
- for (size_t TraceIdx = 0; TraceIdx < Traces.size(); TraceIdx++) {
- auto &Trace = Traces[TraceIdx].FunctionNameRefs;
- for (size_t Timestamp = 0; Timestamp < Trace.size(); Timestamp++) {
- for (int I = Log2_64(Timestamp + 1); I < N; I++) {
- auto FunctionId = Trace[Timestamp];
- UtilityNodeT GroupId = TraceIdx * N + I;
- FuncGroups[FunctionId].push_back(GroupId);
+ for (auto &Trace : Traces) {
+ size_t CutoffTimestamp = 1;
+ for (size_t Timestamp = 0; Timestamp < Trace.FunctionNameRefs.size();
+ Timestamp++) {
+ IDT Id = Trace.FunctionNameRefs[Timestamp];
+ auto [It, WasInserted] = IdToFirstTimestamp.try_emplace(Id, Timestamp);
+ if (!WasInserted)
+ It->getSecond() = std::min<size_t>(It->getSecond(), Timestamp);
+ if (Timestamp >= CutoffTimestamp) {
+ ++MaxUN;
+ CutoffTimestamp = 2 * Timestamp;
}
+ IdToFirstUN.try_emplace(Id, MaxUN);
}
+ for (auto &[Id, FirstUN] : IdToFirstUN)
+ for (auto UN = FirstUN; UN <= MaxUN; ++UN)
+ IdToUNs[Id].push_back(UN);
+ ++MaxUN;
+ IdToFirstUN.clear();
}
- std::vector<BPFunctionNode> Nodes;
- for (auto Id : FunctionIds) {
- auto &UNs = FuncGroups[Id];
- llvm::sort(UNs);
- UNs.erase(std::unique(UNs.begin(), UNs.end()), UNs.end());
+ DenseMap<UtilityNodeT, unsigned> UNFrequency;
+ for (auto &[Id, UNs] : IdToUNs)
+ for (auto &UN : UNs)
+ ++UNFrequency[UN];
+ // Filter out rare and common utility nodes to make BalancedPartitioning more
+ // effective.
+ for (auto &[Id, UNs] : IdToUNs)
+ llvm::erase_if(UNs, [&](auto &UN) {
+ return UNFrequency[UN] <= 1 || 2 * UNFrequency[UN] > IdToUNs.size();
+ });
+
+ for (auto &[Id, UNs] : IdToUNs)
Nodes.emplace_back(Id, UNs);
- }
- return Nodes;
+
+ // Since BalancedPartitioning is sensitive to the initial order, we explicitly
+ // order nodes by their earliest timestamp.
+ llvm::sort(Nodes, [&](auto &L, auto &R) {
+ return std::make_pair(IdToFirstTimestamp[L.Id], L.Id) <
+ std::make_pair(IdToFirstTimestamp[R.Id], R.Id);
+ });
}
#define INSTR_PROF_COMMON_API_IMPL
diff --git a/llvm/test/tools/llvm-profdata/show-order-error.proftext b/llvm/test/tools/llvm-profdata/show-order-error.proftext
new file mode 100644
index 0000000000000..633f1a9949b6f
--- /dev/null
+++ b/llvm/test/tools/llvm-profdata/show-order-error.proftext
@@ -0,0 +1,27 @@
+# RUN: not llvm-profdata order %s --num-test-traces=10 2>&1 | FileCheck %s
+
+# CHECK: --num-test-traces must be smaller than the total number of traces
+
+# Header
+:ir
+:temporal_prof_traces
+# Num Traces
+1
+# Trace Stream Size:
+1
+# Weight
+1
+a, b
+
+a
+# Func Hash:
+0x1234
+# Num Counters:
+1
+# Counter Values:
+101
+
+b
+0x5678
+1
+202
diff --git a/llvm/test/tools/llvm-profdata/show-order.proftext b/llvm/test/tools/llvm-profdata/show-order.proftext
index 8ef26847ad77e..28eb1b9b42af7 100644
--- a/llvm/test/tools/llvm-profdata/show-order.proftext
+++ b/llvm/test/tools/llvm-profdata/show-order.proftext
@@ -1,4 +1,6 @@
-# RUN: llvm-profdata order %s | FileCheck %s
+# RUN: llvm-profdata order %s --num-test-traces=1 | FileCheck %s
+
+# CHECK: # Total area under the page fault curve: 4.000000e+00
# CHECK: a
# CHECK: b
@@ -9,9 +11,9 @@
:ir
:temporal_prof_traces
# Num Traces
-3
+4
# Trace Stream Size:
-3
+4
# Weight
1
a, main.c:b, c
@@ -21,6 +23,9 @@ a, x, main.c:b, c
# Weight
1
a, main.c:b, c
+# Weight
+1
+a, main.c:b, c, x
a
# Func Hash:
diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp
index 4126b55576ddc..70294c10d698c 100644
--- a/llvm/tools/llvm-profdata/llvm-profdata.cpp
+++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp
@@ -340,7 +340,7 @@ cl::opt<unsigned long long> OverlapValueCutoff(
"profile with max count value greater then the parameter value"),
cl::sub(OverlapSubcommand));
-// Options unique to show subcommand.
+// Options specific to show subcommand.
cl::opt<bool> ShowCounts("counts", cl::init(false),
cl::desc("Show counter values for shown functions"),
cl::sub(ShowSubcommand));
@@ -439,6 +439,13 @@ cl::opt<bool> ShowProfileVersion("profile-version", cl::init(false),
cl::desc("Show profile version. "),
cl::sub(ShowSubcommand));
+// Options specific to order subcommand.
+cl::opt<unsigned> NumTestTraces(
+ "num-test-traces", cl::init(0),
+ cl::desc("Keep aside <num-test-traces> traces when computing the function "
+ "order and instead use them to evaluate that order"),
+ cl::sub(OrderSubcommand));
+
// We use this string to indicate that there are
// multiple static functions map to the same name.
const std::string DuplicateNameStr = "----";
@@ -3278,13 +3285,42 @@ static int order_main() {
// Read all entries
(void)I;
}
- auto &Traces = Reader->getTemporalProfTraces();
- auto Nodes = TemporalProfTraceTy::createBPFunctionNodes(Traces);
+ ArrayRef Traces = Reader->getTemporalProfTraces();
+ if (NumTestTraces && NumTestTraces >= Traces.size())
+ exitWithError(
+ "--" + NumTestTraces.ArgStr +
+ " must be smaller than the total number of traces: expected: < " +
+ Twine(Traces.size()) + ", actual: " + Twine(NumTestTraces));
+ ArrayRef TestTraces = Traces.take_back(NumTestTraces);
+ Traces = Traces.drop_back(NumTestTraces);
+
+ std::vector<BPFunctionNode> Nodes;
+ TemporalProfTraceTy::createBPFunctionNodes(Traces, Nodes);
BalancedPartitioningConfig Config;
BalancedPartitioning BP(Config);
BP.run(Nodes);
OS << "# Ordered " << Nodes.size() << " functions\n";
+ if (!TestTraces.empty()) {
+ // Since we don't know the symbol sizes, we assume 32 functions per page.
+ DenseMap<BPFunctionNode::IDT, unsigned> IdToPageNumber;
+ for (auto &Node : Nodes)
+ IdToPageNumber[Node.Id] = IdToPageNumber.size() / 32;
+
+ SmallSet<unsigned, 0> TouchedPages;
+ unsigned Area = 0;
+ for (auto &Trace : TestTraces) {
+ for (auto Id : Trace.FunctionNameRefs) {
+ auto It = IdToPageNumber.find(Id);
+ if (It == IdToPageNumber.end())
+ continue;
+ TouchedPages.insert(It->getSecond());
+ Area += TouchedPages.size();
+ }
+ TouchedPages.clear();
+ }
+ OS << "# Total area under the page fault curve: " << (float)Area << "\n";
+ }
OS << "# Warning: Mach-O may prefix symbols with \"_\" depending on the "
"linkage and this output does not take that into account. Some "
"post-processing may be required before passing to the linker via "
diff --git a/llvm/unittests/ProfileData/BPFunctionNodeTest.cpp b/llvm/unittests/ProfileData/BPFunctionNodeTest.cpp
index 6af6f1bcdc40a..af0104da3341f 100644
--- a/llvm/unittests/ProfileData/BPFunctionNodeTest.cpp
+++ b/llvm/unittests/ProfileData/BPFunctionNodeTest.cpp
@@ -8,7 +8,6 @@
#include "llvm/ProfileData/InstrProf.h"
#include "llvm/Support/BalancedPartitioning.h"
-#include "llvm/Testing/Support/SupportHelpers.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
@@ -31,22 +30,20 @@ TEST(BPFunctionNodeTest, Basic) {
UnorderedElementsAreArray(UNs)));
};
- auto Nodes = TemporalProfTraceTy::createBPFunctionNodes({
- TemporalProfTraceTy({0, 1, 2, 3}),
- });
- EXPECT_THAT(Nodes,
- UnorderedElementsAre(NodeIs(0, {0, 1, 2}), NodeIs(1, {1, 2}),
- NodeIs(2, {1, 2}), NodeIs(3, {2})));
-
- Nodes = TemporalProfTraceTy::createBPFunctionNodes({
- TemporalProfTraceTy({0, 1, 2, 3, 4}),
- TemporalProfTraceTy({4, 2}),
- });
-
- EXPECT_THAT(Nodes,
- UnorderedElementsAre(NodeIs(0, {0, 1, 2}), NodeIs(1, {1, 2}),
- NodeIs(2, {1, 2, 4, 5}), NodeIs(3, {2}),
- NodeIs(4, {2, 3, 4, 5})));
+ std::vector<BPFunctionNode> Nodes;
+ TemporalProfTraceTy::createBPFunctionNodes(
+ {TemporalProfTraceTy({0, 1, 2, 3})}, Nodes);
+ EXPECT_THAT(Nodes, UnorderedElementsAre(NodeIs(0, {1}), NodeIs(1, {1}),
+ NodeIs(2, {}), NodeIs(3, {})));
+
+ Nodes.clear();
+ TemporalProfTraceTy::createBPFunctionNodes(
+ {TemporalProfTraceTy({0, 1, 2, 3, 4}), TemporalProfTraceTy({4, 2})},
+ Nodes);
+
+ EXPECT_THAT(Nodes, UnorderedElementsAre(NodeIs(0, {1}), NodeIs(1, {1}),
+ NodeIs(2, {5}), NodeIs(3, {}),
+ NodeIs(4, {5})));
}
} // end namespace llvm
>From 3a6064767d9504a4db601fbfc0959b1a64fee38d Mon Sep 17 00:00:00 2001
From: Ellis Hoag <ellis.sparky.hoag at gmail.com>
Date: Mon, 20 May 2024 09:14:59 -0700
Subject: [PATCH 2/2] Address review
---
llvm/include/llvm/ProfileData/InstrProf.h | 3 ++-
llvm/lib/ProfileData/InstrProf.cpp | 25 +++++++++++--------
llvm/tools/llvm-profdata/llvm-profdata.cpp | 11 ++++----
.../ProfileData/BPFunctionNodeTest.cpp | 20 ++++++++++++---
4 files changed, 38 insertions(+), 21 deletions(-)
diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h
index b06ec5283a78b..324e6d27b43a6 100644
--- a/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/llvm/include/llvm/ProfileData/InstrProf.h
@@ -386,7 +386,8 @@ struct TemporalProfTraceTy {
/// partitioning function nodes used by BalancedPartitioning to generate a
/// function order that reduces page faults during startup
static void createBPFunctionNodes(ArrayRef<TemporalProfTraceTy> Traces,
- std::vector<BPFunctionNode> &Nodes);
+ std::vector<BPFunctionNode> &Nodes,
+ bool RemoveOutlierUNs = true);
};
inline std::error_code make_error_code(instrprof_error E) {
diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp
index 220d7e01f7ccd..de287ee18fb43 100644
--- a/llvm/lib/ProfileData/InstrProf.cpp
+++ b/llvm/lib/ProfileData/InstrProf.cpp
@@ -1003,7 +1003,8 @@ void InstrProfRecord::addValueData(uint32_t ValueKind, uint32_t Site,
}
void TemporalProfTraceTy::createBPFunctionNodes(
- ArrayRef<TemporalProfTraceTy> Traces, std::vector<BPFunctionNode> &Nodes) {
+ ArrayRef<TemporalProfTraceTy> Traces, std::vector<BPFunctionNode> &Nodes,
+ bool RemoveOutlierUNs) {
using IDT = BPFunctionNode::IDT;
using UtilityNodeT = BPFunctionNode::UtilityNodeT;
UtilityNodeT MaxUN = 0;
@@ -1033,16 +1034,18 @@ void TemporalProfTraceTy::createBPFunctionNodes(
IdToFirstUN.clear();
}
- DenseMap<UtilityNodeT, unsigned> UNFrequency;
- for (auto &[Id, UNs] : IdToUNs)
- for (auto &UN : UNs)
- ++UNFrequency[UN];
- // Filter out rare and common utility nodes to make BalancedPartitioning more
- // effective.
- for (auto &[Id, UNs] : IdToUNs)
- llvm::erase_if(UNs, [&](auto &UN) {
- return UNFrequency[UN] <= 1 || 2 * UNFrequency[UN] > IdToUNs.size();
- });
+ if (RemoveOutlierUNs) {
+ DenseMap<UtilityNodeT, unsigned> UNFrequency;
+ for (auto &[Id, UNs] : IdToUNs)
+ for (auto &UN : UNs)
+ ++UNFrequency[UN];
+ // Filter out utility nodes that are too infrequent or too prevalent to make
+ // BalancedPartitioning more effective.
+ for (auto &[Id, UNs] : IdToUNs)
+ llvm::erase_if(UNs, [&](auto &UN) {
+ return UNFrequency[UN] <= 1 || 2 * UNFrequency[UN] > IdToUNs.size();
+ });
+ }
for (auto &[Id, UNs] : IdToUNs)
Nodes.emplace_back(Id, UNs);
diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp
index 70294c10d698c..0796dc8f1567e 100644
--- a/llvm/tools/llvm-profdata/llvm-profdata.cpp
+++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp
@@ -440,11 +440,12 @@ cl::opt<bool> ShowProfileVersion("profile-version", cl::init(false),
cl::sub(ShowSubcommand));
// Options specific to order subcommand.
-cl::opt<unsigned> NumTestTraces(
- "num-test-traces", cl::init(0),
- cl::desc("Keep aside <num-test-traces> traces when computing the function "
- "order and instead use them to evaluate that order"),
- cl::sub(OrderSubcommand));
+cl::opt<unsigned>
+ NumTestTraces("num-test-traces", cl::init(0),
+ cl::desc("Keep aside the last <num-test-traces> traces in "
+ "the profile when computing the function order and "
+ "instead use them to evaluate that order"),
+ cl::sub(OrderSubcommand));
// We use this string to indicate that there are
// multiple static functions map to the same name.
diff --git a/llvm/unittests/ProfileData/BPFunctionNodeTest.cpp b/llvm/unittests/ProfileData/BPFunctionNodeTest.cpp
index af0104da3341f..24586b5aa31af 100644
--- a/llvm/unittests/ProfileData/BPFunctionNodeTest.cpp
+++ b/llvm/unittests/ProfileData/BPFunctionNodeTest.cpp
@@ -32,14 +32,26 @@ TEST(BPFunctionNodeTest, Basic) {
std::vector<BPFunctionNode> Nodes;
TemporalProfTraceTy::createBPFunctionNodes(
- {TemporalProfTraceTy({0, 1, 2, 3})}, Nodes);
- EXPECT_THAT(Nodes, UnorderedElementsAre(NodeIs(0, {1}), NodeIs(1, {1}),
- NodeIs(2, {}), NodeIs(3, {})));
+ {TemporalProfTraceTy({0, 1, 2, 3})}, Nodes, /*RemoveOutlierUNs=*/false);
+ // Utility nodes that are too infrequent or too prevalent are filtered out.
+ EXPECT_THAT(Nodes,
+ UnorderedElementsAre(NodeIs(0, {0, 1, 2}), NodeIs(1, {1, 2}),
+ NodeIs(2, {2}), NodeIs(3, {2})));
+
+ Nodes.clear();
+ TemporalProfTraceTy::createBPFunctionNodes(
+ {TemporalProfTraceTy({0, 1, 2, 3, 4}), TemporalProfTraceTy({4, 2})},
+ Nodes, /*RemoveOutlierUNs=*/false);
+
+ EXPECT_THAT(Nodes,
+ UnorderedElementsAre(NodeIs(0, {0, 1, 2, 3}),
+ NodeIs(1, {1, 2, 3}), NodeIs(2, {2, 3, 5}),
+ NodeIs(3, {2, 3}), NodeIs(4, {3, 4, 5})));
Nodes.clear();
TemporalProfTraceTy::createBPFunctionNodes(
{TemporalProfTraceTy({0, 1, 2, 3, 4}), TemporalProfTraceTy({4, 2})},
- Nodes);
+ Nodes, /*RemoveOutlierUNs=*/true);
EXPECT_THAT(Nodes, UnorderedElementsAre(NodeIs(0, {1}), NodeIs(1, {1}),
NodeIs(2, {5}), NodeIs(3, {}),
More information about the llvm-commits
mailing list