[llvm] [llvm-profgen] Add --sample-period to estimate absolute counts (PR #99826)

Sun Jul 21 14:41:46 PDT 2024

https://github.com/tcreech-intel created https://github.com/llvm/llvm-project/pull/99826

Without `--sample-period`, no assumptions are made about perf profile sample frequencies. This is useful for comparing relative hotness of different program locations within the same profile.

With `--sample-period`, LBR- and IP-based profile hit counts are adjusted to estimate the absolute total event count for each program location. This makes it reasonable to compare hit counts between different profiles, e.g., between two LBR-based execution frequency profiles with different sampling periods or between LBR-based execution frequency profiles and IP-based branch mispredict profiles.

This functionality is in support of HWPGO[^1], which aims to enable feedback from a wider range of hardware events.

[^1]: https://llvm.org/devmtg/2024-04/slides/TechnicalTalks/Xiao-EnablingHW-BasedPGO.pdf

>From 5684a43d229628e1d618189c8389aff47f086274 Mon Sep 17 00:00:00 2001
From: Tim Creech <timothy.m.creech at intel.com>
Date: Sun, 21 Jul 2024 17:32:02 -0400
Subject: [PATCH] [llvm-profgen] Add --sample-period to estimate absolute
 counts

Without `--sample-period`, no assumptions are made about perf profile
sample frequencies. This is useful for comparing relative hotness
of different program locations within the same profile.

With `--sample-period`, LBR- and IP-based profile hit counts are
adjusted to estimate the absolute total event count for each program
location. This makes it reasonable to compare hit counts between
different profiles, e.g., between LBR-based execution frequency
profiles and IP-based branch mispredict profiles.
---
 .../tools/llvm-profgen/period-scaling.test    | 84 +++++++++++++++++++
 llvm/tools/llvm-profgen/PerfReader.cpp        | 14 ++++
 2 files changed, 98 insertions(+)
 create mode 100644 llvm/test/tools/llvm-profgen/period-scaling.test

diff --git a/llvm/test/tools/llvm-profgen/period-scaling.test b/llvm/test/tools/llvm-profgen/period-scaling.test
new file mode 100644
index 0000000000000..a44c9a78caeaf
--- /dev/null
+++ b/llvm/test/tools/llvm-profgen/period-scaling.test
@@ -0,0 +1,84 @@
+// RUN: llvm-profgen --format=text --perfscript=%S/Inputs/cmov_3.perfscript --binary=%S/Inputs/cmov_3.perfbin --output=%t --skip-symbolization --perf-event=br_inst_retired.near_taken:upp --sample-period=1000003
+// RUN: FileCheck %s --input-file %t --check-prefix=CHECK-RAW-PROFILE
+// RUN: llvm-profgen --format=text --perfscript=%S/Inputs/cmov_3.perfscript --binary=%S/Inputs/cmov_3.perfbin --output=%t --perf-event=br_inst_retired.near_taken:upp --sample-period=1000003
+// RUN: FileCheck %s --input-file %t --check-prefix=CHECK
+
+// RUN: llvm-profgen --format=text --perfscript=%S/Inputs/cmov_3.perfscript --binary=%S/Inputs/cmov_3.perfbin --output=%t --skip-symbolization --perf-event=br_misp_retired.all_branches:upp --leading-ip-only --sample-period=1000003
+// RUN: FileCheck %s --input-file %t --check-prefix=UNPRED-RAW-PROFILE
+// RUN: llvm-profgen --format=text --perfscript=%S/Inputs/cmov_3.perfscript --binary=%S/Inputs/cmov_3.perfbin --output=%t --perf-event=br_misp_retired.all_branches:upp --leading-ip-only --sample-period=1000003
+// RUN: FileCheck %s --input-file %t --check-prefix=UNPRED
+
+// Check that we can use perf event filtering to generate multiple types of
+// source-level profiles from a single perf profile. In this case, we generate
+// a typical execution frequency profile using br_inst_retired.near_taken LBRs,
+// and a branch mispredict profile using br_misp_retired.all_branches sample
+// IPs.
+
+// Check that we can use --sample-period to compute LBR and IP-based profiles
+// which have comparable and absolute magnitudes. For example, in this case the
+// branch of interest (at source line offset 4) is in a loop body which is
+// executed ~20M times in total, and it's mispredicted about 9M times, yielding
+// a mispredict rate of roughly 0.45.
+
+// The source example below is based on perfKernelCpp/cmov_3, except a
+// misleading builtin is used to persuade the compiler not to use cmov, which
+// induces branch mispredicts.
+
+// CHECK: sel_arr:652547082:0
+// CHECK:  3.1: 20225766
+// CHECK:  3.2: 20225766
+// CHECK:  4: 19838670
+// CHECK:  5: 20225766
+
+// UNPRED: sel_arr:18000054:0
+// UNPRED:  3.1: 0
+// UNPRED:  3.2: 0
+// UNPRED:  4: 9000027
+// UNPRED:  5: 0
+
+// CHECK-RAW-PROFILE:      3
+// CHECK-RAW-PROFILE-NEXT: 2f0-2fa:9774174
+// CHECK-RAW-PROFILE-NEXT: 2f0-310:10064496
+// CHECK-RAW-PROFILE-NEXT: 2ff-310:10161270
+
+// UNPRED-RAW-PROFILE:      1
+// UNPRED-RAW-PROFILE-NEXT: 2fa-2fa:9000027
+
+// original code:
+// icx -fprofile-sample-generate lit.c
+#include <stdlib.h>
+
+#define N 20000
+#define ITERS 10000
+
+static int *m_s1, *m_s2, *m_s3, *m_dst;
+
+void init(void) {
+    m_s1 = malloc(sizeof(int)*N);
+    m_s2 = malloc(sizeof(int)*N);
+    m_s3 = malloc(sizeof(int)*N);
+    m_dst = malloc(sizeof(int)*N);
+    srand(42);
+
+    for (int i = 0; i < N; i++) {
+        m_s1[i] = rand() % N;
+        m_s2[i] = 0;
+        m_s3[i] = 1;
+    }
+}
+
+void __attribute__((noinline)) sel_arr(int *dst, int *s1, int *s2, int *s3) {
+#pragma nounroll
+#pragma clang loop vectorize(disable) interleave(disable)
+    for (int i = 0; i < N; i++) {
+        int *p = __builtin_expect((s1[i] < 10035), 0) ? &s2[i] : &s3[i];
+        dst[i] = *p;
+    }
+}
+
+int main(void) {
+  init();
+  for(int i=0; i<ITERS; ++i)
+    sel_arr(m_dst, m_s1, m_s2, m_s3);
+  return 0;
+}
diff --git a/llvm/tools/llvm-profgen/PerfReader.cpp b/llvm/tools/llvm-profgen/PerfReader.cpp
index b4e4911fb8912..4041271cc0a82 100644
--- a/llvm/tools/llvm-profgen/PerfReader.cpp
+++ b/llvm/tools/llvm-profgen/PerfReader.cpp
@@ -53,6 +53,10 @@ static cl::alias
                           cl::desc("Comma-delimited version of -perf-event"),
                           cl::aliasopt(PerfEventFilter));
 
+static cl::opt<uint64_t>
+    SamplePeriod("sample-period", cl::init(1),
+                 cl::desc("The sampling period (-c) used for perf data"));
+
 extern cl::opt<std::string> PerfTraceFilename;
 extern cl::opt<bool> ShowDisassemblyOnly;
 extern cl::opt<bool> ShowSourceLocations;
@@ -1000,6 +1004,16 @@ void LBRPerfReader::parseSample(TraceStream &TraceIt, uint64_t Count) {
   if (extractLBRStack(TraceIt, Sample->LBRStack)) {
     warnIfMissingMMap();
     // Record LBR only samples by aggregation
+    // If a sampling period is given we can adjust the magnitude of sample
+    // counts to estimate the absolute magnitute.
+    if (SamplePeriod.getNumOccurrences()) {
+      Count *= SamplePeriod;
+      // If counts are LBR-based, as opposed to IP-based, then the magnitude is
+      // now amplified by roughly the LBR stack size. By adjusting this down, we
+      // can produce LBR-based and IP-based profiles with comparable magnitudes.
+      if (!LeadingIPOnly && Sample->LBRStack.size() > 1)
+        Count /= (Sample->LBRStack.size() - 1);
+    }
     AggregatedSamples[Hashable<PerfSample>(Sample)] += Count;
   }
 }