[libc-commits] [libc] [libc] Improve GPU benchmarking (PR #153512)
Leandro Lacerda via libc-commits
libc-commits at lists.llvm.org
Thu Aug 14 20:21:37 PDT 2025
https://github.com/leandrolcampos updated https://github.com/llvm/llvm-project/pull/153512
>From 6ccc76e04e234f3c85d74c46357b93006bc4f5be Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Sun, 10 Aug 2025 17:54:10 -0300
Subject: [PATCH 01/10] Replace `rand` and `srand` with per-thread RNG for
reproducibility and fairness
---
libc/benchmarks/gpu/CMakeLists.txt | 4 --
libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 5 +--
libc/benchmarks/gpu/LibcGpuBenchmark.h | 51 +++++++++++++++++++-----
3 files changed, 42 insertions(+), 18 deletions(-)
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index 6ec64bf270b53..ce3b0228c2076 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -22,8 +22,6 @@ function(add_benchmark benchmark_name)
${BENCHMARK_LINK_LIBRARIES}
DEPENDS
libc.src.stdio.printf
- libc.src.stdlib.srand
- libc.src.stdlib.rand
${BENCHMARK_DEPENDS}
${BENCHMARK_UNPARSED_ARGUMENTS}
COMPILE_OPTIONS
@@ -64,8 +62,6 @@ add_unittest_framework_library(
libc.src.__support.FPUtil.sqrt
libc.src.__support.fixedvector
libc.src.time.clock
- libc.src.stdlib.rand
- libc.src.stdlib.srand
libc.benchmarks.gpu.timing.timing
libc.src.stdio.printf
)
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index 57ff5b9fdb846..4009a18c475cb 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -9,7 +9,6 @@
#include "src/__support/macros/config.h"
#include "src/__support/time/gpu/time_utils.h"
#include "src/stdio/printf.h"
-#include "src/stdlib/srand.h"
namespace LIBC_NAMESPACE_DECL {
namespace benchmarks {
@@ -139,10 +138,8 @@ void print_header() {
void Benchmark::run_benchmarks() {
uint64_t id = gpu::get_thread_id();
- if (id == 0) {
+ if (id == 0)
print_header();
- LIBC_NAMESPACE::srand(gpu::processor_clock());
- }
gpu::sync_threads();
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index a6cf62dd30ce5..52c5bb1cabe34 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -11,7 +11,6 @@
#include "src/__support/CPP/type_traits.h"
#include "src/__support/FPUtil/FPBits.h"
#include "src/__support/macros/config.h"
-#include "src/stdlib/rand.h"
#include "src/time/clock.h"
namespace LIBC_NAMESPACE_DECL {
@@ -109,6 +108,35 @@ class Benchmark {
}
};
+class RandomGenerator {
+ uint64_t state;
+
+ static inline uint64_t splitmix64(uint64_t x) noexcept {
+ x += 0x9E3779B97F4A7C15ULL;
+ x = (x ^ (x >> 30)) * 0xBF58476D1CE4E5B9ULL;
+ x = (x ^ (x >> 27)) * 0x94D049BB133111EBULL;
+ x = (x ^ (x >> 31));
+ return x ? x : 0x9E3779B97F4A7C15ULL;
+ }
+
+public:
+ explicit inline RandomGenerator(uint64_t seed) noexcept
+ : state(splitmix64(seed)) {}
+
+ inline uint64_t next64() noexcept {
+ uint64_t x = state;
+ x ^= x >> 12;
+ x ^= x << 25;
+ x ^= x >> 27;
+ state = x;
+ return x * 0x2545F4914F6CDD1DULL;
+ }
+
+ inline uint32_t next32() noexcept {
+ return static_cast<uint32_t>(next64() >> 32);
+ }
+};
+
// We want our random values to be approximately
// Output: a random number with the exponent field between min_exp and max_exp,
// i.e. 2^min_exp <= |real_value| < 2^(max_exp + 1),
@@ -117,7 +145,8 @@ class Benchmark {
// EXP_BIAS + 1 corresponding to inf or nan.
template <typename T>
static T
-get_rand_input(int max_exp = LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS,
+get_rand_input(RandomGenerator &rng,
+ int max_exp = LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS,
int min_exp = -LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS) {
using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
@@ -126,10 +155,9 @@ get_rand_input(int max_exp = LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS,
uint64_t, uint32_t>;
RandType bits;
if constexpr (cpp::is_same_v<T, uint64_t>)
- bits = (static_cast<uint64_t>(LIBC_NAMESPACE::rand()) << 32) |
- static_cast<uint64_t>(LIBC_NAMESPACE::rand());
+ bits = rng.next64();
else
- bits = LIBC_NAMESPACE::rand();
+ bits = rng.next32();
double scale =
static_cast<double>(max_exp - min_exp + 1) / (2 * FPBits::EXP_BIAS + 1);
FPBits fp(bits);
@@ -146,10 +174,12 @@ template <typename T> class MathPerf {
public:
template <size_t N = 1>
- static uint64_t run_throughput_in_range(T f(T), int min_exp, int max_exp) {
+ static uint64_t run_throughput_in_range(T f(T), int min_exp, int max_exp,
+ uint64_t seed = N) {
cpp::array<T, N> inputs;
+ RandomGenerator rng((seed << 32) ^ gpu::get_thread_id());
for (size_t i = 0; i < N; ++i)
- inputs[i] = get_rand_input<T>(min_exp, max_exp);
+ inputs[i] = get_rand_input<T>(rng, min_exp, max_exp);
uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs);
@@ -160,12 +190,13 @@ template <typename T> class MathPerf {
template <size_t N = 1>
static uint64_t run_throughput_in_range(T f(T, T), int arg1_min_exp,
int arg1_max_exp, int arg2_min_exp,
- int arg2_max_exp) {
+ int arg2_max_exp, uint64_t seed = N) {
cpp::array<T, N> inputs1;
cpp::array<T, N> inputs2;
+ RandomGenerator rng((seed << 32) ^ gpu::get_thread_id());
for (size_t i = 0; i < N; ++i) {
- inputs1[i] = get_rand_input<T>(arg1_min_exp, arg1_max_exp);
- inputs2[i] = get_rand_input<T>(arg2_min_exp, arg2_max_exp);
+ inputs1[i] = get_rand_input<T>(rng, arg1_min_exp, arg1_max_exp);
+ inputs2[i] = get_rand_input<T>(rng, arg2_min_exp, arg2_max_exp);
}
uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs1, inputs2);
>From f36f86fd807b1f6efba6b32f7d8fec3f3c5fc4dd Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Sun, 10 Aug 2025 23:25:03 -0300
Subject: [PATCH 02/10] Fix random input generation
---
libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 12 +-
libc/benchmarks/gpu/LibcGpuBenchmark.h | 123 ++++++++++++------
libc/benchmarks/gpu/src/ctype/CMakeLists.txt | 2 +
.../gpu/src/ctype/isalnum_benchmark.cpp | 7 +-
.../gpu/src/ctype/isalpha_benchmark.cpp | 3 +-
5 files changed, 98 insertions(+), 49 deletions(-)
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index 4009a18c475cb..c42de2ada8704 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -1,4 +1,5 @@
#include "LibcGpuBenchmark.h"
+#include "hdr/stdint_proxy.h"
#include "src/__support/CPP/algorithm.h"
#include "src/__support/CPP/array.h"
#include "src/__support/CPP/atomic.h"
@@ -160,8 +161,9 @@ void Benchmark::run_benchmarks() {
gpu::sync_threads();
}
-BenchmarkResult benchmark(const BenchmarkOptions &options,
- cpp::function<uint64_t(void)> wrapper_func) {
+BenchmarkResult
+benchmark(const BenchmarkOptions &options,
+ const cpp::function<uint64_t(uint32_t)> &wrapper_func) {
BenchmarkResult result;
RuntimeEstimationProgression rep;
uint32_t total_iterations = 0;
@@ -181,11 +183,13 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
for (int i = 0; i < overhead_iterations; i++)
overhead = cpp::min(overhead, LIBC_NAMESPACE::overhead());
+ uint32_t call_index = 0;
+
for (int64_t time_budget = options.max_duration; time_budget >= 0;) {
uint64_t sample_cycles = 0;
const clock_t start = static_cast<double>(clock());
for (uint32_t i = 0; i < iterations; i++) {
- auto wrapper_intermediate = wrapper_func();
+ auto wrapper_intermediate = wrapper_func(call_index++);
uint64_t current_result = wrapper_intermediate - overhead;
max = cpp::max(max, current_result);
min = cpp::min(min, current_result);
@@ -223,7 +227,7 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
result.total_iterations = total_iterations;
result.total_time = total_time / total_iterations;
return result;
-};
+}
} // namespace benchmarks
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 52c5bb1cabe34..2a8cef6eff190 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -4,6 +4,7 @@
#include "benchmarks/gpu/BenchmarkLogger.h"
#include "benchmarks/gpu/timing/timing.h"
#include "hdr/stdint_proxy.h"
+#include "src/__support/CPP/algorithm.h"
#include "src/__support/CPP/array.h"
#include "src/__support/CPP/functional.h"
#include "src/__support/CPP/limits.h"
@@ -77,17 +78,18 @@ struct BenchmarkResult {
clock_t total_time = 0;
};
-BenchmarkResult benchmark(const BenchmarkOptions &options,
- cpp::function<uint64_t(void)> wrapper_func);
+BenchmarkResult
+benchmark(const BenchmarkOptions &options,
+ const cpp::function<uint64_t(uint32_t)> &wrapper_func);
class Benchmark {
- const cpp::function<uint64_t(void)> func;
+ const cpp::function<uint64_t(uint32_t)> func;
const cpp::string_view suite_name;
const cpp::string_view test_name;
const uint32_t num_threads;
public:
- Benchmark(cpp::function<uint64_t(void)> func, char const *suite_name,
+ Benchmark(cpp::function<uint64_t(uint32_t)> func, char const *suite_name,
char const *test_name, uint32_t num_threads)
: func(func), suite_name(suite_name), test_name(test_name),
num_threads(num_threads) {
@@ -111,7 +113,7 @@ class Benchmark {
class RandomGenerator {
uint64_t state;
- static inline uint64_t splitmix64(uint64_t x) noexcept {
+ static LIBC_INLINE uint64_t splitmix64(uint64_t x) noexcept {
x += 0x9E3779B97F4A7C15ULL;
x = (x ^ (x >> 30)) * 0xBF58476D1CE4E5B9ULL;
x = (x ^ (x >> 27)) * 0x94D049BB133111EBULL;
@@ -120,10 +122,10 @@ class RandomGenerator {
}
public:
- explicit inline RandomGenerator(uint64_t seed) noexcept
+ explicit LIBC_INLINE RandomGenerator(uint64_t seed) noexcept
: state(splitmix64(seed)) {}
- inline uint64_t next64() noexcept {
+ LIBC_INLINE uint64_t next64() noexcept {
uint64_t x = state;
x ^= x >> 12;
x ^= x << 25;
@@ -132,52 +134,86 @@ class RandomGenerator {
return x * 0x2545F4914F6CDD1DULL;
}
- inline uint32_t next32() noexcept {
+ LIBC_INLINE uint32_t next32() noexcept {
return static_cast<uint32_t>(next64() >> 32);
}
};
-// We want our random values to be approximately
-// Output: a random number with the exponent field between min_exp and max_exp,
-// i.e. 2^min_exp <= |real_value| < 2^(max_exp + 1),
-// Caveats:
-// -EXP_BIAS corresponding to denormal values,
-// EXP_BIAS + 1 corresponding to inf or nan.
+// We want random floating-point values whose *unbiased* exponent e is
+// approximately uniform in [min_exp, max_exp]. That is,
+// 2^min_exp <= |value| < 2^(max_exp + 1).
+// Caveats / boundaries:
+// - e = -EXP_BIAS ==> subnormal range (biased exponent = 0). We ensure a
+// non-zero mantissa so we don't accidentally produce 0.
+// - e in [1 - EXP_BIAS, EXP_BIAS] ==> normal numbers.
+// - e = EXP_BIAS + 1 ==> Inf/NaN. It is never produced: max_exp is clamped
+// to EXP_BIAS (which is also its default).
template <typename T>
static T
get_rand_input(RandomGenerator &rng,
- int max_exp = LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS,
- int min_exp = -LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS) {
+ int min_exp = -LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS,
+ int max_exp = LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS) {
using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
-
- // Required to correctly instantiate FPBits for floats and doubles.
- using RandType = typename cpp::conditional_t<(cpp::is_same_v<T, double>),
- uint64_t, uint32_t>;
- RandType bits;
- if constexpr (cpp::is_same_v<T, uint64_t>)
- bits = rng.next64();
- else
- bits = rng.next32();
- double scale =
- static_cast<double>(max_exp - min_exp + 1) / (2 * FPBits::EXP_BIAS + 1);
- FPBits fp(bits);
- fp.set_biased_exponent(
- static_cast<uint32_t>(fp.get_biased_exponent() * scale + min_exp));
- return fp.get_val();
+ using Storage = typename FPBits::StorageType;
+
+ // Sanitize and clamp requested range to what the format supports
+ if (min_exp > max_exp) {
+ auto tmp = min_exp;
+ min_exp = max_exp;
+ max_exp = tmp;
+ };
+ min_exp = cpp::max(min_exp, -FPBits::EXP_BIAS);
+ max_exp = cpp::min(max_exp, FPBits::EXP_BIAS);
+
+ // Sample unbiased exponent e uniformly in [min_exp, max_exp] without modulo
+ // bias
+ auto sample_in_range = [&](uint64_t r) -> int32_t {
+ const uint64_t range = static_cast<uint64_t>(
+ static_cast<int64_t>(max_exp) - static_cast<int64_t>(min_exp) + 1);
+ const uint64_t threshold = (-range) % range;
+ while (r < threshold)
+ r = rng.next64();
+ return static_cast<int32_t>(min_exp + static_cast<int64_t>(r % range));
+ };
+ const int32_t e = sample_in_range(rng.next64());
+
+ // Start from random bits to get random sign and mantissa
+ FPBits xbits([&] {
+ if constexpr (cpp::is_same_v<T, double>)
+ return FPBits(rng.next64());
+ else
+ return FPBits(rng.next32());
+ }());
+
+ if (e == -FPBits::EXP_BIAS) {
+ // Subnormal: biased exponent must be 0; ensure mantissa != 0 to avoid 0
+ xbits.set_biased_exponent(Storage(0));
+ if (xbits.get_mantissa() == Storage(0))
+ xbits.set_mantissa(Storage(1));
+ } else {
+ // Normal: biased exponent in [1, 2 * FPBits::EXP_BIAS]
+ const int32_t biased = e + FPBits::EXP_BIAS;
+ xbits.set_biased_exponent(static_cast<Storage>(biased));
+ }
+ return xbits.get_val();
}
template <typename T> class MathPerf {
- using FPBits = fputil::FPBits<T>;
- using StorageType = typename FPBits::StorageType;
- static constexpr StorageType UIntMax =
- cpp::numeric_limits<StorageType>::max();
+ static LIBC_INLINE uint64_t make_seed(uint64_t base_seed, uint64_t salt) {
+ const uint64_t tid = gpu::get_thread_id();
+ return base_seed ^ (salt << 32) ^ (tid * 0x9E3779B97F4A7C15ULL);
+ }
public:
template <size_t N = 1>
static uint64_t run_throughput_in_range(T f(T), int min_exp, int max_exp,
- uint64_t seed = N) {
+ uint32_t call_index) {
cpp::array<T, N> inputs;
- RandomGenerator rng((seed << 32) ^ gpu::get_thread_id());
+
+ uint64_t base_seed = static_cast<uint64_t>(call_index);
+ uint64_t salt = static_cast<uint64_t>(N);
+ RandomGenerator rng(make_seed(base_seed, salt));
+
for (size_t i = 0; i < N; ++i)
inputs[i] = get_rand_input<T>(rng, min_exp, max_exp);
@@ -186,14 +222,18 @@ template <typename T> class MathPerf {
return total_time / N;
}
- // Throughput benchmarking for functions that take 2 inputs.
template <size_t N = 1>
static uint64_t run_throughput_in_range(T f(T, T), int arg1_min_exp,
int arg1_max_exp, int arg2_min_exp,
- int arg2_max_exp, uint64_t seed = N) {
+ int arg2_max_exp,
+ uint32_t call_index) {
cpp::array<T, N> inputs1;
cpp::array<T, N> inputs2;
- RandomGenerator rng((seed << 32) ^ gpu::get_thread_id());
+
+ uint64_t base_seed = static_cast<uint64_t>(call_index);
+ uint64_t salt = static_cast<uint64_t>(N);
+ RandomGenerator rng(make_seed(base_seed, salt));
+
for (size_t i = 0; i < N; ++i) {
inputs1[i] = get_rand_input<T>(rng, arg1_min_exp, arg1_max_exp);
inputs2[i] = get_rand_input<T>(rng, arg2_min_exp, arg2_max_exp);
@@ -224,4 +264,5 @@ template <typename T> class MathPerf {
#define SINGLE_WAVE_BENCHMARK(SuiteName, TestName, Func) \
BENCHMARK_N_THREADS(SuiteName, TestName, Func, \
LIBC_NAMESPACE::gpu::get_lane_size())
-#endif
+
+#endif // LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
diff --git a/libc/benchmarks/gpu/src/ctype/CMakeLists.txt b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt
index f277624dbb901..77e2bbe538b1f 100644
--- a/libc/benchmarks/gpu/src/ctype/CMakeLists.txt
+++ b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt
@@ -7,6 +7,7 @@ add_benchmark(
SRCS
isalnum_benchmark.cpp
DEPENDS
+ libc.hdr.stdint_proxy
libc.src.ctype.isalnum
LOADER_ARGS
--threads 64
@@ -19,5 +20,6 @@ add_benchmark(
SRCS
isalpha_benchmark.cpp
DEPENDS
+ libc.hdr.stdint_proxy
libc.src.ctype.isalpha
)
diff --git a/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
index ffa5a99860bfc..28b1ee52c8dfa 100644
--- a/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
@@ -1,8 +1,9 @@
#include "benchmarks/gpu/LibcGpuBenchmark.h"
+#include "hdr/stdint_proxy.h"
#include "src/ctype/isalnum.h"
-uint64_t BM_IsAlnum() {
+uint64_t BM_IsAlnum(uint32_t /*call_index*/) {
char x = 'c';
return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x);
}
@@ -12,13 +13,13 @@ SINGLE_THREADED_BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumSingleThread,
SINGLE_WAVE_BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumSingleWave,
BM_IsAlnum);
-uint64_t BM_IsAlnumCapital() {
+uint64_t BM_IsAlnumCapital(uint32_t /*call_index*/) {
char x = 'A';
return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x);
}
BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumCapital, BM_IsAlnumCapital);
-uint64_t BM_IsAlnumNotAlnum() {
+uint64_t BM_IsAlnumNotAlnum(uint32_t /*call_index*/) {
char x = '{';
return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x);
}
diff --git a/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp b/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp
index 2038eb89bc77b..bff4edea8b690 100644
--- a/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp
@@ -1,8 +1,9 @@
#include "benchmarks/gpu/LibcGpuBenchmark.h"
+#include "hdr/stdint_proxy.h"
#include "src/ctype/isalpha.h"
-uint64_t BM_IsAlpha() {
+uint64_t BM_IsAlpha(uint32_t /*call_index*/) {
char x = 'c';
return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalpha, x);
}
>From aa6a03d928923a85c79cb76683d8c022e27c0261 Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Tue, 12 Aug 2025 23:10:36 -0300
Subject: [PATCH 03/10] Fix standard deviation
---
libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 47 ++++++++------
libc/benchmarks/gpu/LibcGpuBenchmark.h | 83 ++++++++++++++++++------
2 files changed, 90 insertions(+), 40 deletions(-)
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index c42de2ada8704..3c1eef22414a5 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -166,15 +166,13 @@ benchmark(const BenchmarkOptions &options,
const cpp::function<uint64_t(uint32_t)> &wrapper_func) {
BenchmarkResult result;
RuntimeEstimationProgression rep;
- uint32_t total_iterations = 0;
uint32_t iterations = options.initial_iterations;
+
if (iterations < 1u)
iterations = 1;
uint32_t samples = 0;
uint64_t total_time = 0;
- uint64_t best_guess = 0;
- uint64_t cycles_squared = 0;
uint64_t min = UINT64_MAX;
uint64_t max = 0;
@@ -186,46 +184,55 @@ benchmark(const BenchmarkOptions &options,
uint32_t call_index = 0;
for (int64_t time_budget = options.max_duration; time_budget >= 0;) {
- uint64_t sample_cycles = 0;
- const clock_t start = static_cast<double>(clock());
- for (uint32_t i = 0; i < iterations; i++) {
+ RefinableRuntimeEstimator sample_estimator;
+
+ const clock_t start = clock();
+ while (sample_estimator.get_iterations() < iterations) {
auto wrapper_intermediate = wrapper_func(call_index++);
- uint64_t current_result = wrapper_intermediate - overhead;
+ uint64_t current_result =
+ wrapper_intermediate < overhead ? 0 : wrapper_intermediate - overhead;
max = cpp::max(max, current_result);
min = cpp::min(min, current_result);
- sample_cycles += current_result;
+ sample_estimator.update(current_result);
}
const clock_t end = clock();
+
const clock_t duration_ns =
((end - start) * 1000 * 1000 * 1000) / CLOCKS_PER_SEC;
total_time += duration_ns;
time_budget -= duration_ns;
samples++;
- cycles_squared += sample_cycles * sample_cycles;
- total_iterations += iterations;
- const double change_ratio =
- rep.compute_improvement({iterations, sample_cycles});
- best_guess = rep.current_estimation;
+ const double change_ratio = rep.compute_improvement(sample_estimator);
if (samples >= options.max_samples || iterations >= options.max_iterations)
break;
+
+ const auto total_iterations = rep.get_estimator().get_iterations();
+
if (total_time >= options.min_duration && samples >= options.min_samples &&
total_iterations >= options.min_iterations &&
change_ratio < options.epsilon)
break;
- iterations *= options.scaling_factor;
+ iterations = static_cast<uint32_t>(iterations * options.scaling_factor);
}
- result.cycles = best_guess;
- result.standard_deviation = fputil::sqrt<double>(
- static_cast<double>(cycles_squared) / total_iterations -
- static_cast<double>(best_guess * best_guess));
+
+ const auto &estimator = rep.get_estimator();
+ result.cycles = static_cast<uint64_t>(estimator.get_mean());
+ result.standard_deviation = estimator.get_stddev();
+
result.min = min;
result.max = max;
result.samples = samples;
- result.total_iterations = total_iterations;
- result.total_time = total_time / total_iterations;
+
+ result.total_iterations = estimator.get_iterations();
+ if (result.total_iterations > 0) {
+ result.total_time = total_time / result.total_iterations;
+ } else {
+ result.total_time = 0;
+ }
+
return result;
}
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 2a8cef6eff190..96f3433fae77c 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -11,6 +11,7 @@
#include "src/__support/CPP/string_view.h"
#include "src/__support/CPP/type_traits.h"
#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/sqrt.h"
#include "src/__support/macros/config.h"
#include "src/time/clock.h"
@@ -30,40 +31,82 @@ struct BenchmarkOptions {
double scaling_factor = 1.4;
};
-struct Measurement {
+class RefinableRuntimeEstimator {
uint32_t iterations = 0;
- uint64_t elapsed_cycles = 0;
-};
-
-class RefinableRuntimeEstimation {
- uint64_t total_cycles = 0;
- uint32_t total_iterations = 0;
+ uint64_t sum_of_cycles = 0;
+ uint64_t sum_of_squared_cycles = 0;
public:
- uint64_t update(const Measurement &M) {
- total_cycles += M.elapsed_cycles;
- total_iterations += M.iterations;
- return total_cycles / total_iterations;
+ void update(uint64_t cycles) noexcept {
+ iterations += 1;
+ sum_of_cycles += cycles;
+ sum_of_squared_cycles += cycles * cycles;
+ }
+
+ double get_mean() const noexcept {
+ if (iterations == 0)
+ return 0.0;
+
+ return static_cast<double>(sum_of_cycles) / iterations;
+ }
+
+ void update(const RefinableRuntimeEstimator &other) noexcept {
+ iterations += other.iterations;
+ sum_of_cycles += other.sum_of_cycles;
+ sum_of_squared_cycles += other.sum_of_squared_cycles;
+ }
+
+ double get_variance() const noexcept {
+ if (iterations == 0)
+ return 0.0;
+
+ const double num = static_cast<double>(iterations);
+ const double sum_x = static_cast<double>(sum_of_cycles);
+ const double sum_x2 = static_cast<double>(sum_of_squared_cycles);
+
+ const double mean_of_squares = sum_x2 / num;
+ const double mean = sum_x / num;
+ const double mean_squared = mean * mean;
+ const double variance = mean_of_squares - mean_squared;
+
+ return variance < 0.0 ? 0.0 : variance;
+ }
+
+ double get_stddev() const noexcept {
+ return fputil::sqrt<double>(get_variance());
}
+
+ uint32_t get_iterations() const noexcept { return iterations; }
};
// Tracks the progression of the runtime estimation
class RuntimeEstimationProgression {
- RefinableRuntimeEstimation rre;
+ RefinableRuntimeEstimator estimator;
+ double current_mean = 0.0;
public:
- uint64_t current_estimation = 0;
+ const RefinableRuntimeEstimator &get_estimator() const noexcept {
+ return estimator;
+ }
+
+ double
+ compute_improvement(const RefinableRuntimeEstimator &sample_estimator) {
+ if (sample_estimator.get_iterations() == 0)
+ return 1.0;
- double compute_improvement(const Measurement &M) {
- const uint64_t new_estimation = rre.update(M);
- double ratio =
- (static_cast<double>(current_estimation) / new_estimation) - 1.0;
+ estimator.update(sample_estimator);
+
+ const double new_mean = estimator.get_mean();
+ if (current_mean == 0.0 || new_mean == 0.0) {
+ current_mean = new_mean;
+ return 1.0;
+ }
- // Get absolute value
+ double ratio = (current_mean / new_mean) - 1.0;
if (ratio < 0)
- ratio *= -1;
+ ratio = -ratio;
- current_estimation = new_estimation;
+ current_mean = new_mean;
return ratio;
}
};
>From 6e60a3d946f25946162e1d974bb345fe4c59821d Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Wed, 13 Aug 2025 15:38:44 -0300
Subject: [PATCH 04/10] Fix throughput overhead
---
libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 9 +-
libc/benchmarks/gpu/LibcGpuBenchmark.h | 2 +
.../gpu/timing/amdgpu/CMakeLists.txt | 3 +-
libc/benchmarks/gpu/timing/amdgpu/timing.h | 106 +++++++++++++++---
.../gpu/timing/nvptx/CMakeLists.txt | 3 +-
libc/benchmarks/gpu/timing/nvptx/timing.h | 100 +++++++++++++++--
6 files changed, 187 insertions(+), 36 deletions(-)
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index 3c1eef22414a5..13769d063e1b9 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -176,11 +176,6 @@ benchmark(const BenchmarkOptions &options,
uint64_t min = UINT64_MAX;
uint64_t max = 0;
- uint64_t overhead = UINT64_MAX;
- int overhead_iterations = 10;
- for (int i = 0; i < overhead_iterations; i++)
- overhead = cpp::min(overhead, LIBC_NAMESPACE::overhead());
-
uint32_t call_index = 0;
for (int64_t time_budget = options.max_duration; time_budget >= 0;) {
@@ -188,9 +183,7 @@ benchmark(const BenchmarkOptions &options,
const clock_t start = clock();
while (sample_estimator.get_iterations() < iterations) {
- auto wrapper_intermediate = wrapper_func(call_index++);
- uint64_t current_result =
- wrapper_intermediate < overhead ? 0 : wrapper_intermediate - overhead;
+ auto current_result = wrapper_func(call_index++);
max = cpp::max(max, current_result);
min = cpp::min(min, current_result);
sample_estimator.update(current_result);
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 96f3433fae77c..21a074467c268 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -248,6 +248,7 @@ template <typename T> class MathPerf {
}
public:
+ // Returns cycles-per-call (lower is better)
template <size_t N = 1>
static uint64_t run_throughput_in_range(T f(T), int min_exp, int max_exp,
uint32_t call_index) {
@@ -265,6 +266,7 @@ template <typename T> class MathPerf {
return total_time / N;
}
+ // Returns cycles-per-call (lower is better)
template <size_t N = 1>
static uint64_t run_throughput_in_range(T f(T, T), int arg1_min_exp,
int arg1_max_exp, int arg2_min_exp,
diff --git a/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt b/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
index dd7c2d342f70f..d6a89d04dab97 100644
--- a/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
@@ -7,6 +7,7 @@ add_header_library(
libc.src.__support.common
libc.src.__support.macros.config
libc.src.__support.macros.attributes
- libc.src.__support.CPP.type_traits
+ libc.src.__support.CPP.algorithm
libc.src.__support.CPP.array
+ libc.src.__support.CPP.type_traits
)
diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h
index 37dbb9af5976b..90ff9e33c08bb 100644
--- a/libc/benchmarks/gpu/timing/amdgpu/timing.h
+++ b/libc/benchmarks/gpu/timing/amdgpu/timing.h
@@ -10,6 +10,7 @@
#define LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
#include "hdr/stdint_proxy.h"
+#include "src/__support/CPP/algorithm.h"
#include "src/__support/CPP/array.h"
#include "src/__support/CPP/atomic.h"
#include "src/__support/CPP/type_traits.h"
@@ -105,10 +106,11 @@ template <typename F, typename T1, typename T2>
return stop - start;
}
-// Provides throughput benchmarking.
-template <typename F, typename T, size_t N>
-[[gnu::noinline]] static LIBC_INLINE uint64_t
-throughput(F f, const cpp::array<T, N> &inputs) {
+// Provides the *baseline* for throughput: measures loop and measurement costs
+// without calling the f function
+template <typename T, size_t N>
+static LIBC_INLINE uint64_t
+throughput_baseline(const cpp::array<T, N> &inputs) {
asm("" ::"v"(&inputs));
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
@@ -116,24 +118,94 @@ throughput(F f, const cpp::array<T, N> &inputs) {
asm("" ::"s"(start));
+ T result{};
for (auto input : inputs) {
- auto result = f(input);
+ asm("" ::"v"(input));
+ result = input;
+ asm("" ::"v"(result));
+ }
+
+ uint64_t stop = gpu::processor_clock();
+ cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
+ asm("" ::"s"(stop));
+
+ volatile auto output = result;
+ (void)output;
+
+ return stop - start;
+}
+
+// Provides throughput benchmarking
+template <typename F, typename T, size_t N>
+static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs) {
+ uint64_t baseline = UINT64_MAX;
+ for (int i = 0; i < 5; ++i)
+ baseline = cpp::min(baseline, throughput_baseline<T, N>(inputs));
+
+ asm("" ::"v"(&inputs));
+
+ cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
+ uint64_t start = gpu::processor_clock();
+ asm("" ::"s"(start));
+
+ T result{};
+ for (auto input : inputs) {
+ asm("" ::"v"(input));
+ result = f(input);
asm("" ::"v"(result));
}
uint64_t stop = gpu::processor_clock();
+ cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
asm("" ::"s"(stop));
+
+ volatile auto output = result;
+ (void)output;
+
+ const uint64_t measured = stop - start;
+ return measured > baseline ? (measured - baseline) : 0;
+}
+
+// Provides the *baseline* for throughput with 2 arguments: measures loop and
+// measurement costs without calling the f function
+template <typename T, size_t N>
+static LIBC_INLINE uint64_t throughput_baseline(
+ const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
+ asm("" ::"v"(&inputs1), "v"(&inputs2));
+
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
+ uint64_t start = gpu::processor_clock();
+
+ asm("" ::"s"(start));
+
+ T result{};
+ for (size_t i = 0; i < N; i++) {
+ T x = inputs1[i];
+ T y = inputs2[i];
+ asm("" ::"v"(x), "v"(y));
+ result = x;
+ asm("" ::"v"(result));
+ }
+
+ uint64_t stop = gpu::processor_clock();
+ cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
+ asm("" ::"s"(stop));
+
+ volatile auto output = result;
+ (void)output;
- // Return the time elapsed.
return stop - start;
}
// Provides throughput benchmarking for 2 arguments (e.g. atan2())
template <typename F, typename T, size_t N>
-[[gnu::noinline]] static LIBC_INLINE uint64_t throughput(
- F f, const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
+static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs1,
+ const cpp::array<T, N> &inputs2) {
+ uint64_t baseline = UINT64_MAX;
+ for (int i = 0; i < 5; ++i)
+ baseline = cpp::min(baseline, throughput_baseline<T, N>(inputs1, inputs2));
+
asm("" ::"v"(&inputs1), "v"(&inputs2));
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
@@ -141,18 +213,24 @@ template <typename F, typename T, size_t N>
asm("" ::"s"(start));
- for (size_t i = 0; i < inputs1.size(); i++) {
- auto result = f(inputs1[i], inputs2[i]);
-
+ T result{};
+ for (size_t i = 0; i < N; i++) {
+ T x = inputs1[i];
+ T y = inputs2[i];
+ asm("" ::"v"(x), "v"(y));
+ result = f(x, y);
asm("" ::"v"(result));
}
uint64_t stop = gpu::processor_clock();
- asm("" ::"s"(stop));
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
+ asm("" ::"s"(stop));
- // Return the time elapsed.
- return stop - start;
+ volatile auto output = result;
+ (void)output;
+
+ const uint64_t measured = stop - start;
+ return measured > baseline ? (measured - baseline) : 0;
}
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
index a19c16ee4e44d..801080e7a6e98 100644
--- a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
+++ b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
@@ -7,6 +7,7 @@ add_header_library(
libc.src.__support.common
libc.src.__support.macros.config
libc.src.__support.macros.attributes
- libc.src.__support.CPP.type_traits
+ libc.src.__support.CPP.algorithm
libc.src.__support.CPP.array
+ libc.src.__support.CPP.type_traits
)
diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h
index 3c729636367aa..e0a069c6c5454 100644
--- a/libc/benchmarks/gpu/timing/nvptx/timing.h
+++ b/libc/benchmarks/gpu/timing/nvptx/timing.h
@@ -10,6 +10,7 @@
#define LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
#include "hdr/stdint_proxy.h"
+#include "src/__support/CPP/algorithm.h"
#include "src/__support/CPP/array.h"
#include "src/__support/CPP/atomic.h"
#include "src/__support/CPP/type_traits.h"
@@ -95,10 +96,42 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
return stop - start;
}
-// Provides throughput benchmarking.
+// Provides the *baseline* for throughput: measures loop and measurement costs
+// without calling the f function
+template <typename T, size_t N>
+static LIBC_INLINE uint64_t
+throughput_baseline(const cpp::array<T, N> &inputs) {
+ asm("" ::"r"(&inputs));
+
+ cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
+ uint64_t start = gpu::processor_clock();
+
+ asm("" ::"llr"(start));
+
+ T result{};
+ for (auto input : inputs) {
+ asm("" ::"r"(input));
+ result = input;
+ asm("" ::"r"(result));
+ }
+
+ uint64_t stop = gpu::processor_clock();
+ cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
+ asm("" ::"r"(stop));
+
+ volatile auto output = result;
+ (void)output;
+
+ return stop - start;
+}
+
+// Provides throughput benchmarking
template <typename F, typename T, size_t N>
-[[gnu::noinline]] static LIBC_INLINE uint64_t
-throughput(F f, const cpp::array<T, N> &inputs) {
+static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs) {
+ uint64_t baseline = UINT64_MAX;
+ for (int i = 0; i < 5; ++i)
+ baseline = cpp::min(baseline, throughput_baseline<T, N>(inputs));
+
asm("" ::"r"(&inputs));
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
@@ -106,7 +139,7 @@ throughput(F f, const cpp::array<T, N> &inputs) {
asm("" ::"llr"(start));
- uint64_t result;
+ T result{};
for (auto input : inputs) {
asm("" ::"r"(input));
result = f(input);
@@ -116,16 +149,53 @@ throughput(F f, const cpp::array<T, N> &inputs) {
uint64_t stop = gpu::processor_clock();
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
asm("" ::"r"(stop));
+
volatile auto output = result;
+ (void)output;
+
+ const uint64_t measured = stop - start;
+ return measured > baseline ? (measured - baseline) : 0;
+}
+
+// Provides the *baseline* for throughput with 2 arguments: measures loop and
+// measurement costs without calling the f function
+template <typename T, size_t N>
+static LIBC_INLINE uint64_t throughput_baseline(
+ const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
+ asm("" ::"r"(&inputs1), "r"(&inputs2));
+
+ cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
+ uint64_t start = gpu::processor_clock();
+
+ asm("" ::"llr"(start));
+
+ T result{};
+ for (size_t i = 0; i < N; i++) {
+ T x = inputs1[i];
+ T y = inputs2[i];
+ asm("" ::"r"(x), "r"(y));
+ result = x;
+ asm("" ::"r"(result));
+ }
+
+ uint64_t stop = gpu::processor_clock();
+ cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
+ asm("" ::"r"(stop));
+
+ volatile auto output = result;
+ (void)output;
- // Return the time elapsed.
return stop - start;
}
// Provides throughput benchmarking for 2 arguments (e.g. atan2())
template <typename F, typename T, size_t N>
-[[gnu::noinline]] static LIBC_INLINE uint64_t throughput(
- F f, const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
+static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs1,
+ const cpp::array<T, N> &inputs2) {
+ uint64_t baseline = UINT64_MAX;
+ for (int i = 0; i < 5; ++i)
+ baseline = cpp::min(baseline, throughput_baseline<T, N>(inputs1, inputs2));
+
asm("" ::"r"(&inputs1), "r"(&inputs2));
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
@@ -133,20 +203,26 @@ template <typename F, typename T, size_t N>
asm("" ::"llr"(start));
- uint64_t result;
- for (size_t i = 0; i < inputs1.size(); i++) {
- result = f(inputs1[i], inputs2[i]);
+ T result{};
+ for (size_t i = 0; i < N; i++) {
+ T x = inputs1[i];
+ T y = inputs2[i];
+ asm("" ::"r"(x), "r"(y));
+ result = f(x, y);
asm("" ::"r"(result));
}
uint64_t stop = gpu::processor_clock();
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
asm("" ::"r"(stop));
+
volatile auto output = result;
+ (void)output;
- // Return the time elapsed.
- return stop - start;
+ const uint64_t measured = stop - start;
+ return measured > baseline ? (measured - baseline) : 0;
}
+
} // namespace LIBC_NAMESPACE_DECL
#endif // LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
>From dc7436f9842a42d7e6cc2328681a57cc3c16320f Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Wed, 13 Aug 2025 20:38:58 -0300
Subject: [PATCH 05/10] Adapt math benchmarks
---
libc/benchmarks/gpu/src/math/CMakeLists.txt | 10 -------
.../gpu/src/math/atan2_benchmark.cpp | 18 ++++++-------
.../benchmarks/gpu/src/math/sin_benchmark.cpp | 26 +++++++------------
3 files changed, 18 insertions(+), 36 deletions(-)
diff --git a/libc/benchmarks/gpu/src/math/CMakeLists.txt b/libc/benchmarks/gpu/src/math/CMakeLists.txt
index 7a12ce4e61c9e..8417f23c124a0 100644
--- a/libc/benchmarks/gpu/src/math/CMakeLists.txt
+++ b/libc/benchmarks/gpu/src/math/CMakeLists.txt
@@ -34,11 +34,6 @@ add_benchmark(
libc.hdr.stdint_proxy
libc.src.math.sin
libc.src.math.sinf
- libc.src.stdlib.srand
- libc.src.stdlib.rand
- libc.src.__support.FPUtil.fp_bits
- libc.src.__support.CPP.bit
- libc.src.__support.CPP.array
COMPILE_OPTIONS
${math_benchmark_flags}
LOADER_ARGS
@@ -54,11 +49,6 @@ add_benchmark(
DEPENDS
libc.hdr.stdint_proxy
libc.src.math.atan2
- libc.src.stdlib.srand
- libc.src.stdlib.rand
- libc.src.__support.FPUtil.fp_bits
- libc.src.__support.CPP.bit
- libc.src.__support.CPP.array
COMPILE_OPTIONS
${math_benchmark_flags}
LOADER_ARGS
diff --git a/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp b/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp
index 1f91a9a35c373..82bb0c5d7de49 100644
--- a/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp
@@ -1,27 +1,27 @@
#include "benchmarks/gpu/LibcGpuBenchmark.h"
+#include "hdr/stdint_proxy.h"
#include "src/math/atan2.h"
-#include "src/stdlib/rand.h"
#if defined(NVPTX_MATH_FOUND) || defined(AMDGPU_MATH_FOUND)
#include "platform.h"
#endif
-#define BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, N) \
- []() { \
+#define BM_RANDOM_INPUTS(T, Func, MinExp, MaxExp, N) \
+ [](uint32_t call_index) { \
return LIBC_NAMESPACE::benchmarks::MathPerf<T>::run_throughput_in_range< \
- N>(Func, MIN_EXP, MAX_EXP, MIN_EXP, MAX_EXP); \
+ N>(Func, MinExp, MaxExp, MinExp, MaxExp, call_index); \
}
-#define BENCH(T, Name, Func, MIN_EXP, MAX_EXP) \
+#define BENCH(T, Name, Func, MinExp, MaxExp) \
SINGLE_WAVE_BENCHMARK(LlvmLibcAtan2GpuBenchmark, Name##_1, \
- BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 1)); \
+ BM_RANDOM_INPUTS(T, Func, MinExp, MaxExp, 1)); \
SINGLE_WAVE_BENCHMARK(LlvmLibcAtan2GpuBenchmark, Name##_128, \
- BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 128)); \
+ BM_RANDOM_INPUTS(T, Func, MinExp, MaxExp, 128)); \
SINGLE_WAVE_BENCHMARK(LlvmLibcAtan2GpuBenchmark, Name##_1024, \
- BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 1024)); \
+ BM_RANDOM_INPUTS(T, Func, MinExp, MaxExp, 1024)); \
SINGLE_WAVE_BENCHMARK(LlvmLibcAtan2GpuBenchmark, Name##_4096, \
- BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 4096))
+ BM_RANDOM_INPUTS(T, Func, MinExp, MaxExp, 4096))
BENCH(double, Atan2, LIBC_NAMESPACE::atan2, -1023, 1023);
BENCH(double, Atan2TwoPi, LIBC_NAMESPACE::atan2, -10, 3);
diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
index a759db2e9d33f..5fe95c3f3b268 100644
--- a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
@@ -1,36 +1,28 @@
#include "benchmarks/gpu/LibcGpuBenchmark.h"
-#include "src/__support/CPP/array.h"
-#include "src/__support/CPP/bit.h"
-#include "src/__support/CPP/functional.h"
-#include "src/__support/FPUtil/FPBits.h"
+#include "hdr/stdint_proxy.h"
#include "src/math/sin.h"
#include "src/math/sinf.h"
-#include "src/stdlib/rand.h"
#if defined(NVPTX_MATH_FOUND) || defined(AMDGPU_MATH_FOUND)
#include "platform.h"
#endif
-// BENCHMARK() expects a function that with no parameters that returns a
-// uint64_t representing the latency. Defining each benchmark using macro that
-// expands to a lambda to allow us to switch the implementation of `sin()` to
-// easily register NVPTX benchmarks.
-#define BM_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, N) \
- []() { \
+#define BM_RANDOM_INPUT(T, Func, MinExp, MaxExp, N) \
+ [](uint32_t call_index) { \
return LIBC_NAMESPACE::benchmarks::MathPerf<T>::run_throughput_in_range< \
- N>(Func, MIN_EXP, MAX_EXP); \
+ N>(Func, MinExp, MaxExp, call_index); \
}
-#define BENCH(T, Name, Func, MIN_EXP, MAX_EXP) \
+#define BENCH(T, Name, Func, MinExp, MaxExp) \
SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_1, \
- BM_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 1)); \
+ BM_RANDOM_INPUT(T, Func, MinExp, MaxExp, 1)); \
SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_128, \
- BM_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 128)); \
+ BM_RANDOM_INPUT(T, Func, MinExp, MaxExp, 128)); \
SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_1024, \
- BM_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 1024)); \
+ BM_RANDOM_INPUT(T, Func, MinExp, MaxExp, 1024)); \
SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_4096, \
- BM_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 4096))
+ BM_RANDOM_INPUT(T, Func, MinExp, MaxExp, 4096))
BENCH(double, Sin, LIBC_NAMESPACE::sin, -1023, 1023);
BENCH(double, SinTwoPi, LIBC_NAMESPACE::sin, -10, 3);
>From fc49b8ebf6e4fb8975161c8dd80d7c2ff5f50ad0 Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Wed, 13 Aug 2025 21:03:27 -0300
Subject: [PATCH 06/10] Conform to LLVM style
---
libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index 13769d063e1b9..28a4ebfc6df19 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -220,11 +220,10 @@ benchmark(const BenchmarkOptions &options,
result.samples = samples;
result.total_iterations = estimator.get_iterations();
- if (result.total_iterations > 0) {
+ if (result.total_iterations > 0)
result.total_time = total_time / result.total_iterations;
- } else {
+ else
result.total_time = 0;
- }
return result;
}
>From cfa98380a8cf5be8a3a52a2d0af2ac915255a644 Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Wed, 13 Aug 2025 21:25:32 -0300
Subject: [PATCH 07/10] Reorder methods in `RefinableRuntimeEstimator`
---
libc/benchmarks/gpu/LibcGpuBenchmark.h | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 21a074467c268..c4088d90f80fa 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -43,6 +43,12 @@ class RefinableRuntimeEstimator {
sum_of_squared_cycles += cycles * cycles;
}
+ void update(const RefinableRuntimeEstimator &other) noexcept {
+ iterations += other.iterations;
+ sum_of_cycles += other.sum_of_cycles;
+ sum_of_squared_cycles += other.sum_of_squared_cycles;
+ }
+
double get_mean() const noexcept {
if (iterations == 0)
return 0.0;
@@ -50,12 +56,6 @@ class RefinableRuntimeEstimator {
return static_cast<double>(sum_of_cycles) / iterations;
}
- void update(const RefinableRuntimeEstimator &other) noexcept {
- iterations += other.iterations;
- sum_of_cycles += other.sum_of_cycles;
- sum_of_squared_cycles += other.sum_of_squared_cycles;
- }
-
double get_variance() const noexcept {
if (iterations == 0)
return 0.0;
>From 96e0bae3602f3abaf399b04779c0b32a79a70057 Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Thu, 14 Aug 2025 14:52:46 -0300
Subject: [PATCH 08/10] Remove redundant `(void)output;`
---
libc/benchmarks/gpu/timing/amdgpu/timing.h | 16 ++++------------
libc/benchmarks/gpu/timing/nvptx/timing.h | 16 ++++------------
2 files changed, 8 insertions(+), 24 deletions(-)
diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h
index 90ff9e33c08bb..de721a2d6ce6b 100644
--- a/libc/benchmarks/gpu/timing/amdgpu/timing.h
+++ b/libc/benchmarks/gpu/timing/amdgpu/timing.h
@@ -115,7 +115,6 @@ throughput_baseline(const cpp::array<T, N> &inputs) {
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
uint64_t start = gpu::processor_clock();
-
asm("" ::"s"(start));
T result{};
@@ -126,11 +125,10 @@ throughput_baseline(const cpp::array<T, N> &inputs) {
}
uint64_t stop = gpu::processor_clock();
- cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
asm("" ::"s"(stop));
+ cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
volatile auto output = result;
- (void)output;
return stop - start;
}
@@ -146,7 +144,6 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs) {
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
uint64_t start = gpu::processor_clock();
-
asm("" ::"s"(start));
T result{};
@@ -157,11 +154,10 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs) {
}
uint64_t stop = gpu::processor_clock();
- cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
asm("" ::"s"(stop));
+ cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
volatile auto output = result;
- (void)output;
const uint64_t measured = stop - start;
return measured > baseline ? (measured - baseline) : 0;
@@ -176,7 +172,6 @@ static LIBC_INLINE uint64_t throughput_baseline(
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
uint64_t start = gpu::processor_clock();
-
asm("" ::"s"(start));
T result{};
@@ -189,11 +184,10 @@ static LIBC_INLINE uint64_t throughput_baseline(
}
uint64_t stop = gpu::processor_clock();
- cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
asm("" ::"s"(stop));
+ cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
volatile auto output = result;
- (void)output;
return stop - start;
}
@@ -210,7 +204,6 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs1,
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
uint64_t start = gpu::processor_clock();
-
asm("" ::"s"(start));
T result{};
@@ -223,11 +216,10 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs1,
}
uint64_t stop = gpu::processor_clock();
- cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
asm("" ::"s"(stop));
+ cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
volatile auto output = result;
- (void)output;
const uint64_t measured = stop - start;
return measured > baseline ? (measured - baseline) : 0;
diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h
index e0a069c6c5454..133032ca08423 100644
--- a/libc/benchmarks/gpu/timing/nvptx/timing.h
+++ b/libc/benchmarks/gpu/timing/nvptx/timing.h
@@ -105,7 +105,6 @@ throughput_baseline(const cpp::array<T, N> &inputs) {
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
uint64_t start = gpu::processor_clock();
-
asm("" ::"llr"(start));
T result{};
@@ -116,11 +115,10 @@ throughput_baseline(const cpp::array<T, N> &inputs) {
}
uint64_t stop = gpu::processor_clock();
- cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
asm("" ::"r"(stop));
+ cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
volatile auto output = result;
- (void)output;
return stop - start;
}
@@ -136,7 +134,6 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs) {
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
uint64_t start = gpu::processor_clock();
-
asm("" ::"llr"(start));
T result{};
@@ -147,11 +144,10 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs) {
}
uint64_t stop = gpu::processor_clock();
- cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
asm("" ::"r"(stop));
+ cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
volatile auto output = result;
- (void)output;
const uint64_t measured = stop - start;
return measured > baseline ? (measured - baseline) : 0;
@@ -166,7 +162,6 @@ static LIBC_INLINE uint64_t throughput_baseline(
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
uint64_t start = gpu::processor_clock();
-
asm("" ::"llr"(start));
T result{};
@@ -179,11 +174,10 @@ static LIBC_INLINE uint64_t throughput_baseline(
}
uint64_t stop = gpu::processor_clock();
- cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
asm("" ::"r"(stop));
+ cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
volatile auto output = result;
- (void)output;
return stop - start;
}
@@ -200,7 +194,6 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs1,
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
uint64_t start = gpu::processor_clock();
-
asm("" ::"llr"(start));
T result{};
@@ -213,11 +206,10 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs1,
}
uint64_t stop = gpu::processor_clock();
- cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
asm("" ::"r"(stop));
+ cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
volatile auto output = result;
- (void)output;
const uint64_t measured = stop - start;
return measured > baseline ? (measured - baseline) : 0;
>From a11e7754c5fd7cac1f0b36252845a5bada11353b Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Thu, 14 Aug 2025 20:08:53 -0300
Subject: [PATCH 09/10] Allow index-less benchmarks via `BenchmarkTarget`
wrapper
---
libc/benchmarks/gpu/CMakeLists.txt | 1 -
libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 8 ++--
libc/benchmarks/gpu/LibcGpuBenchmark.h | 44 +++++++++++++++----
libc/benchmarks/gpu/src/ctype/CMakeLists.txt | 2 -
.../gpu/src/ctype/isalnum_benchmark.cpp | 7 ++-
.../gpu/src/ctype/isalpha_benchmark.cpp | 3 +-
6 files changed, 43 insertions(+), 22 deletions(-)
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index ce3b0228c2076..beedac78d4826 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -49,7 +49,6 @@ add_unittest_framework_library(
libc.src.__support.CPP.string
libc.src.__support.CPP.string_view
libc.src.__support.CPP.type_traits
- libc.src.__support.CPP.functional
libc.src.__support.CPP.limits
libc.src.__support.CPP.algorithm
libc.src.__support.CPP.atomic
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index 28a4ebfc6df19..93eededd6295a 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -1,4 +1,5 @@
#include "LibcGpuBenchmark.h"
+
#include "hdr/stdint_proxy.h"
#include "src/__support/CPP/algorithm.h"
#include "src/__support/CPP/array.h"
@@ -161,9 +162,8 @@ void Benchmark::run_benchmarks() {
gpu::sync_threads();
}
-BenchmarkResult
-benchmark(const BenchmarkOptions &options,
- const cpp::function<uint64_t(uint32_t)> &wrapper_func) {
+BenchmarkResult benchmark(const BenchmarkOptions &options,
+ const BenchmarkTarget &target) {
BenchmarkResult result;
RuntimeEstimationProgression rep;
uint32_t iterations = options.initial_iterations;
@@ -183,7 +183,7 @@ benchmark(const BenchmarkOptions &options,
const clock_t start = clock();
while (sample_estimator.get_iterations() < iterations) {
- auto current_result = wrapper_func(call_index++);
+ auto current_result = target(call_index++);
max = cpp::max(max, current_result);
min = cpp::min(min, current_result);
sample_estimator.update(current_result);
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index c4088d90f80fa..2ba441c6d1c95 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -6,7 +6,6 @@
#include "hdr/stdint_proxy.h"
#include "src/__support/CPP/algorithm.h"
#include "src/__support/CPP/array.h"
-#include "src/__support/CPP/functional.h"
#include "src/__support/CPP/limits.h"
#include "src/__support/CPP/string_view.h"
#include "src/__support/CPP/type_traits.h"
@@ -121,21 +120,48 @@ struct BenchmarkResult {
clock_t total_time = 0;
};
-BenchmarkResult
-benchmark(const BenchmarkOptions &options,
- const cpp::function<uint64_t(uint32_t)> &wrapper_func);
+struct BenchmarkTarget {
+ using IndexedFnPtr = uint64_t (*)(uint32_t);
+ using IndexlessFnPtr = uint64_t (*)();
+
+ enum class Kind : uint8_t { Indexed, Indexless } kind;
+ union {
+ IndexedFnPtr indexed_fn_ptr;
+ IndexlessFnPtr indexless_fn_ptr;
+ };
+
+ LIBC_INLINE BenchmarkTarget(IndexedFnPtr func)
+ : kind(Kind::Indexed), indexed_fn_ptr(func) {}
+ LIBC_INLINE BenchmarkTarget(IndexlessFnPtr func)
+ : kind(Kind::Indexless), indexless_fn_ptr(func) {}
+
+ LIBC_INLINE uint64_t operator()([[maybe_unused]] uint32_t call_index) const {
+ return kind == Kind::Indexed ? indexed_fn_ptr(call_index)
+ : indexless_fn_ptr();
+ }
+};
+
+BenchmarkResult benchmark(const BenchmarkOptions &options,
+ const BenchmarkTarget &target);
class Benchmark {
- const cpp::function<uint64_t(uint32_t)> func;
+ const BenchmarkTarget target;
const cpp::string_view suite_name;
const cpp::string_view test_name;
const uint32_t num_threads;
public:
- Benchmark(cpp::function<uint64_t(uint32_t)> func, char const *suite_name,
+ Benchmark(uint64_t (*f)(), const char *suite, const char *test,
+ uint32_t threads)
+ : target(BenchmarkTarget(f)), suite_name(suite), test_name(test),
+ num_threads(threads) {
+ add_benchmark(this);
+ }
+
+ Benchmark(uint64_t (*f)(uint32_t), char const *suite_name,
char const *test_name, uint32_t num_threads)
- : func(func), suite_name(suite_name), test_name(test_name),
- num_threads(num_threads) {
+ : target(BenchmarkTarget(f)), suite_name(suite_name),
+ test_name(test_name), num_threads(num_threads) {
add_benchmark(this);
}
@@ -149,7 +175,7 @@ class Benchmark {
private:
BenchmarkResult run() {
BenchmarkOptions options;
- return benchmark(options, func);
+ return benchmark(options, target);
}
};
diff --git a/libc/benchmarks/gpu/src/ctype/CMakeLists.txt b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt
index 77e2bbe538b1f..f277624dbb901 100644
--- a/libc/benchmarks/gpu/src/ctype/CMakeLists.txt
+++ b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt
@@ -7,7 +7,6 @@ add_benchmark(
SRCS
isalnum_benchmark.cpp
DEPENDS
- libc.hdr.stdint_proxy
libc.src.ctype.isalnum
LOADER_ARGS
--threads 64
@@ -20,6 +19,5 @@ add_benchmark(
SRCS
isalpha_benchmark.cpp
DEPENDS
- libc.hdr.stdint_proxy
libc.src.ctype.isalpha
)
diff --git a/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
index 28b1ee52c8dfa..ffa5a99860bfc 100644
--- a/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
@@ -1,9 +1,8 @@
#include "benchmarks/gpu/LibcGpuBenchmark.h"
-#include "hdr/stdint_proxy.h"
#include "src/ctype/isalnum.h"
-uint64_t BM_IsAlnum(uint32_t /*call_index*/) {
+uint64_t BM_IsAlnum() {
char x = 'c';
return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x);
}
@@ -13,13 +12,13 @@ SINGLE_THREADED_BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumSingleThread,
SINGLE_WAVE_BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumSingleWave,
BM_IsAlnum);
-uint64_t BM_IsAlnumCapital(uint32_t /*call_index*/) {
+uint64_t BM_IsAlnumCapital() {
char x = 'A';
return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x);
}
BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumCapital, BM_IsAlnumCapital);
-uint64_t BM_IsAlnumNotAlnum(uint32_t /*call_index*/) {
+uint64_t BM_IsAlnumNotAlnum() {
char x = '{';
return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x);
}
diff --git a/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp b/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp
index bff4edea8b690..2038eb89bc77b 100644
--- a/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp
@@ -1,9 +1,8 @@
#include "benchmarks/gpu/LibcGpuBenchmark.h"
-#include "hdr/stdint_proxy.h"
#include "src/ctype/isalpha.h"
-uint64_t BM_IsAlpha(uint32_t /*call_index*/) {
+uint64_t BM_IsAlpha() {
char x = 'c';
return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalpha, x);
}
>From b6860825e3851226fe7958bfd1239c4f11e03db0 Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Fri, 15 Aug 2025 00:21:14 -0300
Subject: [PATCH 10/10] Correct statistics aggregation and reporting
---
libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 141 ++++++++++++-----------
libc/benchmarks/gpu/LibcGpuBenchmark.h | 6 +-
2 files changed, 78 insertions(+), 69 deletions(-)
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index 93eededd6295a..ef816c51a87d7 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -5,6 +5,7 @@
#include "src/__support/CPP/array.h"
#include "src/__support/CPP/atomic.h"
#include "src/__support/CPP/string.h"
+#include "src/__support/FPUtil/FPBits.h"
#include "src/__support/FPUtil/sqrt.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/fixedvector.h"
@@ -21,37 +22,56 @@ void Benchmark::add_benchmark(Benchmark *benchmark) {
benchmarks.push_back(benchmark);
}
+static void atomic_add_double(cpp::Atomic<uint64_t> &atomic_bits,
+ double value) {
+ using FPBits = LIBC_NAMESPACE::fputil::FPBits<double>;
+
+ uint64_t expected_bits = atomic_bits.load(cpp::MemoryOrder::RELAXED);
+
+ while (true) {
+ double current_value = FPBits(expected_bits).get_val();
+ double next_value = current_value + value;
+
+ uint64_t desired_bits = FPBits(next_value).uintval();
+ if (atomic_bits.compare_exchange_strong(expected_bits, desired_bits,
+ cpp::MemoryOrder::ACQUIRE,
+ cpp::MemoryOrder::RELAXED))
+ break;
+ }
+}
+
struct AtomicBenchmarkSums {
- cpp::Atomic<uint64_t> cycles_sum = 0;
- cpp::Atomic<uint64_t> standard_deviation_sum = 0;
+ cpp::Atomic<uint32_t> active_threads = 0;
+ cpp::Atomic<uint64_t> iterations_sum = 0;
+ cpp::Atomic<uint64_t> weighted_cycles_sum_bits = 0;
+ cpp::Atomic<uint64_t> weighted_squared_cycles_sum_bits = 0;
cpp::Atomic<uint64_t> min = UINT64_MAX;
cpp::Atomic<uint64_t> max = 0;
- cpp::Atomic<uint32_t> samples_sum = 0;
- cpp::Atomic<uint32_t> iterations_sum = 0;
- cpp::Atomic<clock_t> time_sum = 0;
- cpp::Atomic<uint64_t> active_threads = 0;
void reset() {
cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
active_threads.store(0, cpp::MemoryOrder::RELAXED);
- cycles_sum.store(0, cpp::MemoryOrder::RELAXED);
- standard_deviation_sum.store(0, cpp::MemoryOrder::RELAXED);
+ iterations_sum.store(0, cpp::MemoryOrder::RELAXED);
+ weighted_cycles_sum_bits.store(0, cpp::MemoryOrder::RELAXED);
+ weighted_squared_cycles_sum_bits.store(0, cpp::MemoryOrder::RELAXED);
min.store(UINT64_MAX, cpp::MemoryOrder::RELAXED);
max.store(0, cpp::MemoryOrder::RELAXED);
- samples_sum.store(0, cpp::MemoryOrder::RELAXED);
- iterations_sum.store(0, cpp::MemoryOrder::RELAXED);
- time_sum.store(0, cpp::MemoryOrder::RELAXED);
cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
}
void update(const BenchmarkResult &result) {
cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
active_threads.fetch_add(1, cpp::MemoryOrder::RELAXED);
+ iterations_sum.fetch_add(result.total_iterations,
+ cpp::MemoryOrder::RELAXED);
- cycles_sum.fetch_add(result.cycles, cpp::MemoryOrder::RELAXED);
- standard_deviation_sum.fetch_add(
- static_cast<uint64_t>(result.standard_deviation),
- cpp::MemoryOrder::RELAXED);
+ const double n_i = static_cast<double>(result.total_iterations);
+ const double mean_i = result.cycles;
+ const double stddev_i = result.standard_deviation;
+ const double variance_i = stddev_i * stddev_i;
+ atomic_add_double(weighted_cycles_sum_bits, n_i * mean_i);
+ atomic_add_double(weighted_squared_cycles_sum_bits,
+ n_i * (variance_i + mean_i * mean_i));
// Perform a CAS loop to atomically update the min
uint64_t orig_min = min.load(cpp::MemoryOrder::RELAXED);
@@ -67,10 +87,6 @@ struct AtomicBenchmarkSums {
cpp::MemoryOrder::RELAXED))
;
- samples_sum.fetch_add(result.samples, cpp::MemoryOrder::RELAXED);
- iterations_sum.fetch_add(result.total_iterations,
- cpp::MemoryOrder::RELAXED);
- time_sum.fetch_add(result.total_time, cpp::MemoryOrder::RELAXED);
cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
}
};
@@ -80,46 +96,49 @@ constexpr auto GREEN = "\033[32m";
constexpr auto RESET = "\033[0m";
void print_results(Benchmark *b) {
- BenchmarkResult result;
+ using FPBits = LIBC_NAMESPACE::fputil::FPBits<double>;
+
+ BenchmarkResult final_result;
cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
- int num_threads = all_results.active_threads.load(cpp::MemoryOrder::RELAXED);
- result.cycles =
- all_results.cycles_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
- result.standard_deviation =
- all_results.standard_deviation_sum.load(cpp::MemoryOrder::RELAXED) /
- num_threads;
- result.min = all_results.min.load(cpp::MemoryOrder::RELAXED);
- result.max = all_results.max.load(cpp::MemoryOrder::RELAXED);
- result.samples =
- all_results.samples_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
- result.total_iterations =
- all_results.iterations_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
- const uint64_t duration_ns =
- all_results.time_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
- const uint64_t duration_us = duration_ns / 1000;
- const uint64_t duration_ms = duration_ns / (1000 * 1000);
- uint64_t converted_duration = duration_ns;
- const char *time_unit;
- if (duration_ms != 0) {
- converted_duration = duration_ms;
- time_unit = "ms";
- } else if (duration_us != 0) {
- converted_duration = duration_us;
- time_unit = "us";
+
+ const uint32_t num_threads =
+ all_results.active_threads.load(cpp::MemoryOrder::RELAXED);
+ final_result.total_iterations =
+ all_results.iterations_sum.load(cpp::MemoryOrder::RELAXED);
+
+ if (final_result.total_iterations > 0) {
+ const uint64_t s1_bits =
+ all_results.weighted_cycles_sum_bits.load(cpp::MemoryOrder::RELAXED);
+ const uint64_t s2_bits = all_results.weighted_squared_cycles_sum_bits.load(
+ cpp::MemoryOrder::RELAXED);
+
+ const double S1 = FPBits(s1_bits).get_val();
+ const double S2 = FPBits(s2_bits).get_val();
+ const double N = static_cast<double>(final_result.total_iterations);
+
+ const double global_mean = S1 / N;
+ const double global_mean_of_squares = S2 / N;
+ const double global_variance =
+ global_mean_of_squares - (global_mean * global_mean);
+
+ final_result.cycles = global_mean;
+ final_result.standard_deviation =
+ fputil::sqrt<double>(global_variance < 0.0 ? 0.0 : global_variance);
} else {
- converted_duration = duration_ns;
- time_unit = "ns";
+ final_result.cycles = 0.0;
+ final_result.standard_deviation = 0.0;
}
- result.total_time = converted_duration;
- // result.total_time =
- // all_results.time_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
+
+ final_result.min = all_results.min.load(cpp::MemoryOrder::RELAXED);
+ final_result.max = all_results.max.load(cpp::MemoryOrder::RELAXED);
cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
LIBC_NAMESPACE::printf(
- "%-24s |%8ld |%8ld |%8ld |%11d |%14ld %2s |%9ld |%9d |\n",
- b->get_test_name().data(), result.cycles, result.min, result.max,
- result.total_iterations, result.total_time, time_unit,
- static_cast<uint64_t>(result.standard_deviation), num_threads);
+ "%-24s |%15.0f |%9.0f |%8llu |%8llu |%11llu |%9u |\n",
+ b->get_test_name().data(), final_result.cycles,
+ final_result.standard_deviation, (unsigned long long)final_result.min,
+ (unsigned long long)final_result.max,
+ (unsigned long long)final_result.total_iterations, (unsigned)num_threads);
}
void print_header() {
@@ -127,9 +146,8 @@ void print_header() {
LIBC_NAMESPACE::printf("Running Suite: %-10s\n",
benchmarks[0]->get_suite_name().data());
LIBC_NAMESPACE::printf("%s", RESET);
- cpp::string titles =
- "Benchmark | Cycles | Min | Max | "
- "Iterations | Time / Iteration | Stddev | Threads |\n";
+ cpp::string titles = "Benchmark | Cycles (Mean) | Stddev | "
+ " Min | Max | Iterations | Threads |\n";
LIBC_NAMESPACE::printf(titles.data());
cpp::string separator(titles.size(), '-');
@@ -212,18 +230,11 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
}
const auto &estimator = rep.get_estimator();
- result.cycles = static_cast<uint64_t>(estimator.get_mean());
+ result.total_iterations = estimator.get_iterations();
+ result.cycles = estimator.get_mean();
result.standard_deviation = estimator.get_stddev();
-
result.min = min;
result.max = max;
- result.samples = samples;
-
- result.total_iterations = estimator.get_iterations();
- if (result.total_iterations > 0)
- result.total_time = total_time / result.total_iterations;
- else
- result.total_time = 0;
return result;
}
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 2ba441c6d1c95..60f69edf86556 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -111,13 +111,11 @@ class RuntimeEstimationProgression {
};
struct BenchmarkResult {
- uint64_t cycles = 0;
+ uint64_t total_iterations = 0;
+ double cycles = 0;
double standard_deviation = 0;
uint64_t min = UINT64_MAX;
uint64_t max = 0;
- uint32_t samples = 0;
- uint32_t total_iterations = 0;
- clock_t total_time = 0;
};
struct BenchmarkTarget {
More information about the libc-commits
mailing list