[libc-commits] [libc] [libc] Improve GPU benchmarking (PR #153512)

Thu Aug 14 20:21:37 PDT 2025

https://github.com/leandrolcampos updated https://github.com/llvm/llvm-project/pull/153512

From 6ccc76e04e234f3c85d74c46357b93006bc4f5be Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Sun, 10 Aug 2025 17:54:10 -0300
Subject: [PATCH 01/10] Replace `rand` and `srand` with per-thread RNG for
 reproducibility and fairness

---
 libc/benchmarks/gpu/CMakeLists.txt       |  4 --
 libc/benchmarks/gpu/LibcGpuBenchmark.cpp |  5 +--
 libc/benchmarks/gpu/LibcGpuBenchmark.h   | 51 +++++++++++++++++++-----
 3 files changed, 42 insertions(+), 18 deletions(-)

diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index 6ec64bf270b53..ce3b0228c2076 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -22,8 +22,6 @@ function(add_benchmark benchmark_name)
       ${BENCHMARK_LINK_LIBRARIES}
     DEPENDS
       libc.src.stdio.printf
-      libc.src.stdlib.srand
-      libc.src.stdlib.rand
       ${BENCHMARK_DEPENDS}
     ${BENCHMARK_UNPARSED_ARGUMENTS}
     COMPILE_OPTIONS
@@ -64,8 +62,6 @@ add_unittest_framework_library(
     libc.src.__support.FPUtil.sqrt
     libc.src.__support.fixedvector
     libc.src.time.clock
-    libc.src.stdlib.rand
-    libc.src.stdlib.srand
     libc.benchmarks.gpu.timing.timing
     libc.src.stdio.printf
 )
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index 57ff5b9fdb846..4009a18c475cb 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -9,7 +9,6 @@
 #include "src/__support/macros/config.h"
 #include "src/__support/time/gpu/time_utils.h"
 #include "src/stdio/printf.h"
-#include "src/stdlib/srand.h"
 
 namespace LIBC_NAMESPACE_DECL {
 namespace benchmarks {
@@ -139,10 +138,8 @@ void print_header() {
 void Benchmark::run_benchmarks() {
   uint64_t id = gpu::get_thread_id();
 
-  if (id == 0) {
+  if (id == 0)
     print_header();
-    LIBC_NAMESPACE::srand(gpu::processor_clock());
-  }
 
   gpu::sync_threads();
 
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index a6cf62dd30ce5..52c5bb1cabe34 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -11,7 +11,6 @@
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/macros/config.h"
-#include "src/stdlib/rand.h"
 #include "src/time/clock.h"
 
 namespace LIBC_NAMESPACE_DECL {
@@ -109,6 +108,35 @@ class Benchmark {
   }
 };
 
+class RandomGenerator {
+  uint64_t state;
+
+  static inline uint64_t splitmix64(uint64_t x) noexcept {
+    x += 0x9E3779B97F4A7C15ULL;
+    x = (x ^ (x >> 30)) * 0xBF58476D1CE4E5B9ULL;
+    x = (x ^ (x >> 27)) * 0x94D049BB133111EBULL;
+    x = (x ^ (x >> 31));
+    return x ? x : 0x9E3779B97F4A7C15ULL;
+  }
+
+public:
+  explicit inline RandomGenerator(uint64_t seed) noexcept
+      : state(splitmix64(seed)) {}
+
+  inline uint64_t next64() noexcept {
+    uint64_t x = state;
+    x ^= x >> 12;
+    x ^= x << 25;
+    x ^= x >> 27;
+    state = x;
+    return x * 0x2545F4914F6CDD1DULL;
+  }
+
+  inline uint32_t next32() noexcept {
+    return static_cast<uint32_t>(next64() >> 32);
+  }
+};
+
 // We want our random values to be approximately
 // Output: a random number with the exponent field between min_exp and max_exp,
 // i.e. 2^min_exp <= |real_value| < 2^(max_exp + 1),
@@ -117,7 +145,8 @@ class Benchmark {
 //   EXP_BIAS + 1 corresponding to inf or nan.
 template <typename T>
 static T
-get_rand_input(int max_exp = LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS,
+get_rand_input(RandomGenerator &rng,
+               int max_exp = LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS,
                int min_exp = -LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS) {
   using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
 
@@ -126,10 +155,9 @@ get_rand_input(int max_exp = LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS,
                                                uint64_t, uint32_t>;
   RandType bits;
   if constexpr (cpp::is_same_v<T, uint64_t>)
-    bits = (static_cast<uint64_t>(LIBC_NAMESPACE::rand()) << 32) |
-           static_cast<uint64_t>(LIBC_NAMESPACE::rand());
+    bits = rng.next64();
   else
-    bits = LIBC_NAMESPACE::rand();
+    bits = rng.next32();
   double scale =
       static_cast<double>(max_exp - min_exp + 1) / (2 * FPBits::EXP_BIAS + 1);
   FPBits fp(bits);
@@ -146,10 +174,12 @@ template <typename T> class MathPerf {
 
 public:
   template <size_t N = 1>
-  static uint64_t run_throughput_in_range(T f(T), int min_exp, int max_exp) {
+  static uint64_t run_throughput_in_range(T f(T), int min_exp, int max_exp,
+                                          uint64_t seed = N) {
     cpp::array<T, N> inputs;
+    RandomGenerator rng((seed << 32) ^ gpu::get_thread_id());
     for (size_t i = 0; i < N; ++i)
-      inputs[i] = get_rand_input<T>(min_exp, max_exp);
+      inputs[i] = get_rand_input<T>(rng, min_exp, max_exp);
 
     uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs);
 
@@ -160,12 +190,13 @@ template <typename T> class MathPerf {
   template <size_t N = 1>
   static uint64_t run_throughput_in_range(T f(T, T), int arg1_min_exp,
                                           int arg1_max_exp, int arg2_min_exp,
-                                          int arg2_max_exp) {
+                                          int arg2_max_exp, uint64_t seed = N) {
     cpp::array<T, N> inputs1;
     cpp::array<T, N> inputs2;
+    RandomGenerator rng((seed << 32) ^ gpu::get_thread_id());
     for (size_t i = 0; i < N; ++i) {
-      inputs1[i] = get_rand_input<T>(arg1_min_exp, arg1_max_exp);
-      inputs2[i] = get_rand_input<T>(arg2_min_exp, arg2_max_exp);
+      inputs1[i] = get_rand_input<T>(rng, arg1_min_exp, arg1_max_exp);
+      inputs2[i] = get_rand_input<T>(rng, arg2_min_exp, arg2_max_exp);
     }
 
     uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs1, inputs2);

From f36f86fd807b1f6efba6b32f7d8fec3f3c5fc4dd Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Sun, 10 Aug 2025 23:25:03 -0300
Subject: [PATCH 02/10] Fix random input generation

---
 libc/benchmarks/gpu/LibcGpuBenchmark.cpp      |  12 +-
 libc/benchmarks/gpu/LibcGpuBenchmark.h        | 123 ++++++++++++------
 libc/benchmarks/gpu/src/ctype/CMakeLists.txt  |   2 +
 .../gpu/src/ctype/isalnum_benchmark.cpp       |   7 +-
 .../gpu/src/ctype/isalpha_benchmark.cpp       |   3 +-
 5 files changed, 98 insertions(+), 49 deletions(-)

diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index 4009a18c475cb..c42de2ada8704 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -1,4 +1,5 @@
 #include "LibcGpuBenchmark.h"
+#include "hdr/stdint_proxy.h"
 #include "src/__support/CPP/algorithm.h"
 #include "src/__support/CPP/array.h"
 #include "src/__support/CPP/atomic.h"
@@ -160,8 +161,9 @@ void Benchmark::run_benchmarks() {
   gpu::sync_threads();
 }
 
-BenchmarkResult benchmark(const BenchmarkOptions &options,
-                          cpp::function<uint64_t(void)> wrapper_func) {
+BenchmarkResult
+benchmark(const BenchmarkOptions &options,
+          const cpp::function<uint64_t(uint32_t)> &wrapper_func) {
   BenchmarkResult result;
   RuntimeEstimationProgression rep;
   uint32_t total_iterations = 0;
@@ -181,11 +183,13 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
   for (int i = 0; i < overhead_iterations; i++)
     overhead = cpp::min(overhead, LIBC_NAMESPACE::overhead());
 
+  uint32_t call_index = 0;
+
   for (int64_t time_budget = options.max_duration; time_budget >= 0;) {
     uint64_t sample_cycles = 0;
     const clock_t start = static_cast<double>(clock());
     for (uint32_t i = 0; i < iterations; i++) {
-      auto wrapper_intermediate = wrapper_func();
+      auto wrapper_intermediate = wrapper_func(call_index++);
       uint64_t current_result = wrapper_intermediate - overhead;
       max = cpp::max(max, current_result);
       min = cpp::min(min, current_result);
@@ -223,7 +227,7 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
   result.total_iterations = total_iterations;
   result.total_time = total_time / total_iterations;
   return result;
-};
+}
 
 } // namespace benchmarks
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 52c5bb1cabe34..2a8cef6eff190 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -4,6 +4,7 @@
 #include "benchmarks/gpu/BenchmarkLogger.h"
 #include "benchmarks/gpu/timing/timing.h"
 #include "hdr/stdint_proxy.h"
+#include "src/__support/CPP/algorithm.h"
 #include "src/__support/CPP/array.h"
 #include "src/__support/CPP/functional.h"
 #include "src/__support/CPP/limits.h"
@@ -77,17 +78,18 @@ struct BenchmarkResult {
   clock_t total_time = 0;
 };
 
-BenchmarkResult benchmark(const BenchmarkOptions &options,
-                          cpp::function<uint64_t(void)> wrapper_func);
+BenchmarkResult
+benchmark(const BenchmarkOptions &options,
+          const cpp::function<uint64_t(uint32_t)> &wrapper_func);
 
 class Benchmark {
-  const cpp::function<uint64_t(void)> func;
+  const cpp::function<uint64_t(uint32_t)> func;
   const cpp::string_view suite_name;
   const cpp::string_view test_name;
   const uint32_t num_threads;
 
 public:
-  Benchmark(cpp::function<uint64_t(void)> func, char const *suite_name,
+  Benchmark(cpp::function<uint64_t(uint32_t)> func, char const *suite_name,
             char const *test_name, uint32_t num_threads)
       : func(func), suite_name(suite_name), test_name(test_name),
         num_threads(num_threads) {
@@ -111,7 +113,7 @@ class Benchmark {
 class RandomGenerator {
   uint64_t state;
 
-  static inline uint64_t splitmix64(uint64_t x) noexcept {
+  static LIBC_INLINE uint64_t splitmix64(uint64_t x) noexcept {
     x += 0x9E3779B97F4A7C15ULL;
     x = (x ^ (x >> 30)) * 0xBF58476D1CE4E5B9ULL;
     x = (x ^ (x >> 27)) * 0x94D049BB133111EBULL;
@@ -120,10 +122,10 @@ class RandomGenerator {
   }
 
 public:
-  explicit inline RandomGenerator(uint64_t seed) noexcept
+  explicit LIBC_INLINE RandomGenerator(uint64_t seed) noexcept
       : state(splitmix64(seed)) {}
 
-  inline uint64_t next64() noexcept {
+  LIBC_INLINE uint64_t next64() noexcept {
     uint64_t x = state;
     x ^= x >> 12;
     x ^= x << 25;
@@ -132,52 +134,86 @@ class RandomGenerator {
     return x * 0x2545F4914F6CDD1DULL;
   }
 
-  inline uint32_t next32() noexcept {
+  LIBC_INLINE uint32_t next32() noexcept {
     return static_cast<uint32_t>(next64() >> 32);
   }
 };
 
-// We want our random values to be approximately
-// Output: a random number with the exponent field between min_exp and max_exp,
-// i.e. 2^min_exp <= |real_value| < 2^(max_exp + 1),
-// Caveats:
-//   -EXP_BIAS corresponding to denormal values,
-//   EXP_BIAS + 1 corresponding to inf or nan.
+// We want random floating-point values whose *unbiased* exponent e is
+// approximately uniform in [min_exp, max_exp]. That is,
+//   2^min_exp <= |value| < 2^(max_exp + 1).
+// Caveats / boundaries:
+// - e = -EXP_BIAS  ==> subnormal range (biased exponent = 0). We ensure a
+//                      non-zero mantissa so we don't accidentally produce 0.
+// - e in [1 - EXP_BIAS, EXP_BIAS] ==> normal numbers.
+// - e = EXP_BIAS + 1 ==> Inf/NaN. We do not include it by default; max_exp
+//                        defaults to EXP_BIAS.
 template <typename T>
 static T
 get_rand_input(RandomGenerator &rng,
-               int max_exp = LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS,
-               int min_exp = -LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS) {
+               int min_exp = -LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS,
+               int max_exp = LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS) {
   using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
-
-  // Required to correctly instantiate FPBits for floats and doubles.
-  using RandType = typename cpp::conditional_t<(cpp::is_same_v<T, double>),
-                                               uint64_t, uint32_t>;
-  RandType bits;
-  if constexpr (cpp::is_same_v<T, uint64_t>)
-    bits = rng.next64();
-  else
-    bits = rng.next32();
-  double scale =
-      static_cast<double>(max_exp - min_exp + 1) / (2 * FPBits::EXP_BIAS + 1);
-  FPBits fp(bits);
-  fp.set_biased_exponent(
-      static_cast<uint32_t>(fp.get_biased_exponent() * scale + min_exp));
-  return fp.get_val();
+  using Storage = typename FPBits::StorageType;
+
+  // Sanitize and clamp requested range to what the format supports
+  if (min_exp > max_exp) {
+    auto tmp = min_exp;
+    min_exp = max_exp;
+    max_exp = tmp;
+  };
+  min_exp = cpp::max(min_exp, -FPBits::EXP_BIAS);
+  max_exp = cpp::min(max_exp, FPBits::EXP_BIAS);
+
+  // Sample unbiased exponent e uniformly in [min_exp, max_exp] without modulo
+  // bias
+  auto sample_in_range = [&](uint64_t r) -> int32_t {
+    const uint64_t range = static_cast<uint64_t>(
+        static_cast<int64_t>(max_exp) - static_cast<int64_t>(min_exp) + 1);
+    const uint64_t threshold = (-range) % range;
+    while (r < threshold)
+      r = rng.next64();
+    return static_cast<int32_t>(min_exp + static_cast<int64_t>(r % range));
+  };
+  const int32_t e = sample_in_range(rng.next64());
+
+  // Start from random bits to get random sign and mantissa
+  FPBits xbits([&] {
+    if constexpr (cpp::is_same_v<T, double>)
+      return FPBits(rng.next64());
+    else
+      return FPBits(rng.next32());
+  }());
+
+  if (e == -FPBits::EXP_BIAS) {
+    // Subnormal: biased exponent must be 0; ensure mantissa != 0 to avoid 0
+    xbits.set_biased_exponent(Storage(0));
+    if (xbits.get_mantissa() == Storage(0))
+      xbits.set_mantissa(Storage(1));
+  } else {
+    // Normal: biased exponent in [1, 2 * FPBits::EXP_BIAS]
+    const int32_t biased = e + FPBits::EXP_BIAS;
+    xbits.set_biased_exponent(static_cast<Storage>(biased));
+  }
+  return xbits.get_val();
 }
 
 template <typename T> class MathPerf {
-  using FPBits = fputil::FPBits<T>;
-  using StorageType = typename FPBits::StorageType;
-  static constexpr StorageType UIntMax =
-      cpp::numeric_limits<StorageType>::max();
+  static LIBC_INLINE uint64_t make_seed(uint64_t base_seed, uint64_t salt) {
+    const uint64_t tid = gpu::get_thread_id();
+    return base_seed ^ (salt << 32) ^ (tid * 0x9E3779B97F4A7C15ULL);
+  }
 
 public:
   template <size_t N = 1>
   static uint64_t run_throughput_in_range(T f(T), int min_exp, int max_exp,
-                                          uint64_t seed = N) {
+                                          uint32_t call_index) {
     cpp::array<T, N> inputs;
-    RandomGenerator rng((seed << 32) ^ gpu::get_thread_id());
+
+    uint64_t base_seed = static_cast<uint64_t>(call_index);
+    uint64_t salt = static_cast<uint64_t>(N);
+    RandomGenerator rng(make_seed(base_seed, salt));
+
     for (size_t i = 0; i < N; ++i)
       inputs[i] = get_rand_input<T>(rng, min_exp, max_exp);
 
@@ -186,14 +222,18 @@ template <typename T> class MathPerf {
     return total_time / N;
   }
 
-  // Throughput benchmarking for functions that take 2 inputs.
   template <size_t N = 1>
   static uint64_t run_throughput_in_range(T f(T, T), int arg1_min_exp,
                                           int arg1_max_exp, int arg2_min_exp,
-                                          int arg2_max_exp, uint64_t seed = N) {
+                                          int arg2_max_exp,
+                                          uint32_t call_index) {
     cpp::array<T, N> inputs1;
     cpp::array<T, N> inputs2;
-    RandomGenerator rng((seed << 32) ^ gpu::get_thread_id());
+
+    uint64_t base_seed = static_cast<uint64_t>(call_index);
+    uint64_t salt = static_cast<uint64_t>(N);
+    RandomGenerator rng(make_seed(base_seed, salt));
+
     for (size_t i = 0; i < N; ++i) {
       inputs1[i] = get_rand_input<T>(rng, arg1_min_exp, arg1_max_exp);
       inputs2[i] = get_rand_input<T>(rng, arg2_min_exp, arg2_max_exp);
@@ -224,4 +264,5 @@ template <typename T> class MathPerf {
 #define SINGLE_WAVE_BENCHMARK(SuiteName, TestName, Func)                       \
   BENCHMARK_N_THREADS(SuiteName, TestName, Func,                               \
                       LIBC_NAMESPACE::gpu::get_lane_size())
-#endif
+
+#endif // LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
diff --git a/libc/benchmarks/gpu/src/ctype/CMakeLists.txt b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt
index f277624dbb901..77e2bbe538b1f 100644
--- a/libc/benchmarks/gpu/src/ctype/CMakeLists.txt
+++ b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt
@@ -7,6 +7,7 @@ add_benchmark(
   SRCS
     isalnum_benchmark.cpp
   DEPENDS
+    libc.hdr.stdint_proxy
     libc.src.ctype.isalnum
   LOADER_ARGS
     --threads 64
@@ -19,5 +20,6 @@ add_benchmark(
   SRCS
     isalpha_benchmark.cpp
   DEPENDS
+    libc.hdr.stdint_proxy
     libc.src.ctype.isalpha
 )
diff --git a/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
index ffa5a99860bfc..28b1ee52c8dfa 100644
--- a/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
@@ -1,8 +1,9 @@
 #include "benchmarks/gpu/LibcGpuBenchmark.h"
 
+#include "hdr/stdint_proxy.h"
 #include "src/ctype/isalnum.h"
 
-uint64_t BM_IsAlnum() {
+uint64_t BM_IsAlnum(uint32_t /*call_index*/) {
   char x = 'c';
   return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x);
 }
@@ -12,13 +13,13 @@ SINGLE_THREADED_BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumSingleThread,
 SINGLE_WAVE_BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumSingleWave,
                       BM_IsAlnum);
 
-uint64_t BM_IsAlnumCapital() {
+uint64_t BM_IsAlnumCapital(uint32_t /*call_index*/) {
   char x = 'A';
   return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x);
 }
 BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumCapital, BM_IsAlnumCapital);
 
-uint64_t BM_IsAlnumNotAlnum() {
+uint64_t BM_IsAlnumNotAlnum(uint32_t /*call_index*/) {
   char x = '{';
   return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x);
 }
diff --git a/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp b/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp
index 2038eb89bc77b..bff4edea8b690 100644
--- a/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp
@@ -1,8 +1,9 @@
 #include "benchmarks/gpu/LibcGpuBenchmark.h"
 
+#include "hdr/stdint_proxy.h"
 #include "src/ctype/isalpha.h"
 
-uint64_t BM_IsAlpha() {
+uint64_t BM_IsAlpha(uint32_t /*call_index*/) {
   char x = 'c';
   return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalpha, x);
 }

From aa6a03d928923a85c79cb76683d8c022e27c0261 Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Tue, 12 Aug 2025 23:10:36 -0300
Subject: [PATCH 03/10] Fix standard deviation

---
 libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 47 ++++++++------
 libc/benchmarks/gpu/LibcGpuBenchmark.h   | 83 ++++++++++++++++++------
 2 files changed, 90 insertions(+), 40 deletions(-)

diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index c42de2ada8704..3c1eef22414a5 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -166,15 +166,13 @@ benchmark(const BenchmarkOptions &options,
           const cpp::function<uint64_t(uint32_t)> &wrapper_func) {
   BenchmarkResult result;
   RuntimeEstimationProgression rep;
-  uint32_t total_iterations = 0;
   uint32_t iterations = options.initial_iterations;
+
   if (iterations < 1u)
     iterations = 1;
 
   uint32_t samples = 0;
   uint64_t total_time = 0;
-  uint64_t best_guess = 0;
-  uint64_t cycles_squared = 0;
   uint64_t min = UINT64_MAX;
   uint64_t max = 0;
 
@@ -186,46 +184,55 @@ benchmark(const BenchmarkOptions &options,
   uint32_t call_index = 0;
 
   for (int64_t time_budget = options.max_duration; time_budget >= 0;) {
-    uint64_t sample_cycles = 0;
-    const clock_t start = static_cast<double>(clock());
-    for (uint32_t i = 0; i < iterations; i++) {
+    RefinableRuntimeEstimator sample_estimator;
+
+    const clock_t start = clock();
+    while (sample_estimator.get_iterations() < iterations) {
       auto wrapper_intermediate = wrapper_func(call_index++);
-      uint64_t current_result = wrapper_intermediate - overhead;
+      uint64_t current_result =
+          wrapper_intermediate < overhead ? 0 : wrapper_intermediate - overhead;
       max = cpp::max(max, current_result);
       min = cpp::min(min, current_result);
-      sample_cycles += current_result;
+      sample_estimator.update(current_result);
     }
     const clock_t end = clock();
+
     const clock_t duration_ns =
         ((end - start) * 1000 * 1000 * 1000) / CLOCKS_PER_SEC;
     total_time += duration_ns;
     time_budget -= duration_ns;
     samples++;
-    cycles_squared += sample_cycles * sample_cycles;
 
-    total_iterations += iterations;
-    const double change_ratio =
-        rep.compute_improvement({iterations, sample_cycles});
-    best_guess = rep.current_estimation;
+    const double change_ratio = rep.compute_improvement(sample_estimator);
 
     if (samples >= options.max_samples || iterations >= options.max_iterations)
       break;
+
+    const auto total_iterations = rep.get_estimator().get_iterations();
+
     if (total_time >= options.min_duration && samples >= options.min_samples &&
         total_iterations >= options.min_iterations &&
         change_ratio < options.epsilon)
       break;
 
-    iterations *= options.scaling_factor;
+    iterations = static_cast<uint32_t>(iterations * options.scaling_factor);
   }
-  result.cycles = best_guess;
-  result.standard_deviation = fputil::sqrt<double>(
-      static_cast<double>(cycles_squared) / total_iterations -
-      static_cast<double>(best_guess * best_guess));
+
+  const auto &estimator = rep.get_estimator();
+  result.cycles = static_cast<uint64_t>(estimator.get_mean());
+  result.standard_deviation = estimator.get_stddev();
+
   result.min = min;
   result.max = max;
   result.samples = samples;
-  result.total_iterations = total_iterations;
-  result.total_time = total_time / total_iterations;
+
+  result.total_iterations = estimator.get_iterations();
+  if (result.total_iterations > 0) {
+    result.total_time = total_time / result.total_iterations;
+  } else {
+    result.total_time = 0;
+  }
+
   return result;
 }
 
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 2a8cef6eff190..96f3433fae77c 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -11,6 +11,7 @@
 #include "src/__support/CPP/string_view.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/sqrt.h"
 #include "src/__support/macros/config.h"
 #include "src/time/clock.h"
 
@@ -30,40 +31,82 @@ struct BenchmarkOptions {
   double scaling_factor = 1.4;
 };
 
-struct Measurement {
+class RefinableRuntimeEstimator {
   uint32_t iterations = 0;
-  uint64_t elapsed_cycles = 0;
-};
-
-class RefinableRuntimeEstimation {
-  uint64_t total_cycles = 0;
-  uint32_t total_iterations = 0;
+  uint64_t sum_of_cycles = 0;
+  uint64_t sum_of_squared_cycles = 0;
 
 public:
-  uint64_t update(const Measurement &M) {
-    total_cycles += M.elapsed_cycles;
-    total_iterations += M.iterations;
-    return total_cycles / total_iterations;
+  void update(uint64_t cycles) noexcept {
+    iterations += 1;
+    sum_of_cycles += cycles;
+    sum_of_squared_cycles += cycles * cycles;
+  }
+
+  double get_mean() const noexcept {
+    if (iterations == 0)
+      return 0.0;
+
+    return static_cast<double>(sum_of_cycles) / iterations;
+  }
+
+  void update(const RefinableRuntimeEstimator &other) noexcept {
+    iterations += other.iterations;
+    sum_of_cycles += other.sum_of_cycles;
+    sum_of_squared_cycles += other.sum_of_squared_cycles;
+  }
+
+  double get_variance() const noexcept {
+    if (iterations == 0)
+      return 0.0;
+
+    const double num = static_cast<double>(iterations);
+    const double sum_x = static_cast<double>(sum_of_cycles);
+    const double sum_x2 = static_cast<double>(sum_of_squared_cycles);
+
+    const double mean_of_squares = sum_x2 / num;
+    const double mean = sum_x / num;
+    const double mean_squared = mean * mean;
+    const double variance = mean_of_squares - mean_squared;
+
+    return variance < 0.0 ? 0.0 : variance;
+  }
+
+  double get_stddev() const noexcept {
+    return fputil::sqrt<double>(get_variance());
   }
+
+  uint32_t get_iterations() const noexcept { return iterations; }
 };
 
 // Tracks the progression of the runtime estimation
 class RuntimeEstimationProgression {
-  RefinableRuntimeEstimation rre;
+  RefinableRuntimeEstimator estimator;
+  double current_mean = 0.0;
 
 public:
-  uint64_t current_estimation = 0;
+  const RefinableRuntimeEstimator &get_estimator() const noexcept {
+    return estimator;
+  }
+
+  double
+  compute_improvement(const RefinableRuntimeEstimator &sample_estimator) {
+    if (sample_estimator.get_iterations() == 0)
+      return 1.0;
 
-  double compute_improvement(const Measurement &M) {
-    const uint64_t new_estimation = rre.update(M);
-    double ratio =
-        (static_cast<double>(current_estimation) / new_estimation) - 1.0;
+    estimator.update(sample_estimator);
+
+    const double new_mean = estimator.get_mean();
+    if (current_mean == 0.0 || new_mean == 0.0) {
+      current_mean = new_mean;
+      return 1.0;
+    }
 
-    // Get absolute value
+    double ratio = (current_mean / new_mean) - 1.0;
     if (ratio < 0)
-      ratio *= -1;
+      ratio = -ratio;
 
-    current_estimation = new_estimation;
+    current_mean = new_mean;
     return ratio;
   }
 };

From 6e60a3d946f25946162e1d974bb345fe4c59821d Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Wed, 13 Aug 2025 15:38:44 -0300
Subject: [PATCH 04/10] Fix throughput overhead

---
 libc/benchmarks/gpu/LibcGpuBenchmark.cpp      |   9 +-
 libc/benchmarks/gpu/LibcGpuBenchmark.h        |   2 +
 .../gpu/timing/amdgpu/CMakeLists.txt          |   3 +-
 libc/benchmarks/gpu/timing/amdgpu/timing.h    | 106 +++++++++++++++---
 .../gpu/timing/nvptx/CMakeLists.txt           |   3 +-
 libc/benchmarks/gpu/timing/nvptx/timing.h     | 100 +++++++++++++++--
 6 files changed, 187 insertions(+), 36 deletions(-)

diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index 3c1eef22414a5..13769d063e1b9 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -176,11 +176,6 @@ benchmark(const BenchmarkOptions &options,
   uint64_t min = UINT64_MAX;
   uint64_t max = 0;
 
-  uint64_t overhead = UINT64_MAX;
-  int overhead_iterations = 10;
-  for (int i = 0; i < overhead_iterations; i++)
-    overhead = cpp::min(overhead, LIBC_NAMESPACE::overhead());
-
   uint32_t call_index = 0;
 
   for (int64_t time_budget = options.max_duration; time_budget >= 0;) {
@@ -188,9 +183,7 @@ benchmark(const BenchmarkOptions &options,
 
     const clock_t start = clock();
     while (sample_estimator.get_iterations() < iterations) {
-      auto wrapper_intermediate = wrapper_func(call_index++);
-      uint64_t current_result =
-          wrapper_intermediate < overhead ? 0 : wrapper_intermediate - overhead;
+      auto current_result = wrapper_func(call_index++);
       max = cpp::max(max, current_result);
       min = cpp::min(min, current_result);
       sample_estimator.update(current_result);
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 96f3433fae77c..21a074467c268 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -248,6 +248,7 @@ template <typename T> class MathPerf {
   }
 
 public:
+  // Returns cycles-per-call (lower is better)
   template <size_t N = 1>
   static uint64_t run_throughput_in_range(T f(T), int min_exp, int max_exp,
                                           uint32_t call_index) {
@@ -265,6 +266,7 @@ template <typename T> class MathPerf {
     return total_time / N;
   }
 
+  // Returns cycles-per-call (lower is better)
   template <size_t N = 1>
   static uint64_t run_throughput_in_range(T f(T, T), int arg1_min_exp,
                                           int arg1_max_exp, int arg2_min_exp,
diff --git a/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt b/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
index dd7c2d342f70f..d6a89d04dab97 100644
--- a/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
@@ -7,6 +7,7 @@ add_header_library(
     libc.src.__support.common
     libc.src.__support.macros.config
     libc.src.__support.macros.attributes
-    libc.src.__support.CPP.type_traits
+    libc.src.__support.CPP.algorithm
     libc.src.__support.CPP.array
+    libc.src.__support.CPP.type_traits
 )
diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h
index 37dbb9af5976b..90ff9e33c08bb 100644
--- a/libc/benchmarks/gpu/timing/amdgpu/timing.h
+++ b/libc/benchmarks/gpu/timing/amdgpu/timing.h
@@ -10,6 +10,7 @@
 #define LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
 
 #include "hdr/stdint_proxy.h"
+#include "src/__support/CPP/algorithm.h"
 #include "src/__support/CPP/array.h"
 #include "src/__support/CPP/atomic.h"
 #include "src/__support/CPP/type_traits.h"
@@ -105,10 +106,11 @@ template <typename F, typename T1, typename T2>
   return stop - start;
 }
 
-// Provides throughput benchmarking.
-template <typename F, typename T, size_t N>
-[[gnu::noinline]] static LIBC_INLINE uint64_t
-throughput(F f, const cpp::array<T, N> &inputs) {
+// Provides the *baseline* for throughput: measures loop and measurement costs
+// without calling the f function
+template <typename T, size_t N>
+static LIBC_INLINE uint64_t
+throughput_baseline(const cpp::array<T, N> &inputs) {
   asm("" ::"v"(&inputs));
 
   cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
@@ -116,24 +118,94 @@ throughput(F f, const cpp::array<T, N> &inputs) {
 
   asm("" ::"s"(start));
 
+  T result{};
   for (auto input : inputs) {
-    auto result = f(input);
+    asm("" ::"v"(input));
+    result = input;
+    asm("" ::"v"(result));
+  }
+
+  uint64_t stop = gpu::processor_clock();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
+  asm("" ::"s"(stop));
+
+  volatile auto output = result;
+  (void)output;
+
+  return stop - start;
+}
+
+// Provides throughput benchmarking
+template <typename F, typename T, size_t N>
+static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs) {
+  uint64_t baseline = UINT64_MAX;
+  for (int i = 0; i < 5; ++i)
+    baseline = cpp::min(baseline, throughput_baseline<T, N>(inputs));
+
+  asm("" ::"v"(&inputs));
+
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
+  uint64_t start = gpu::processor_clock();
 
+  asm("" ::"s"(start));
+
+  T result{};
+  for (auto input : inputs) {
+    asm("" ::"v"(input));
+    result = f(input);
     asm("" ::"v"(result));
   }
 
   uint64_t stop = gpu::processor_clock();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   asm("" ::"s"(stop));
+
+  volatile auto output = result;
+  (void)output;
+
+  const uint64_t measured = stop - start;
+  return measured > baseline ? (measured - baseline) : 0;
+}
+
+// Provides the *baseline* for throughput with 2 arguments: measures loop and
+// measurement costs without calling the f function
+template <typename T, size_t N>
+static LIBC_INLINE uint64_t throughput_baseline(
+    const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
+  asm("" ::"v"(&inputs1), "v"(&inputs2));
+
   cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
+  uint64_t start = gpu::processor_clock();
+
+  asm("" ::"s"(start));
+
+  T result{};
+  for (size_t i = 0; i < N; i++) {
+    T x = inputs1[i];
+    T y = inputs2[i];
+    asm("" ::"v"(x), "v"(y));
+    result = x;
+    asm("" ::"v"(result));
+  }
+
+  uint64_t stop = gpu::processor_clock();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
+  asm("" ::"s"(stop));
+
+  volatile auto output = result;
+  (void)output;
 
-  // Return the time elapsed.
   return stop - start;
 }
 
 // Provides throughput benchmarking for 2 arguments (e.g. atan2())
 template <typename F, typename T, size_t N>
-[[gnu::noinline]] static LIBC_INLINE uint64_t throughput(
-    F f, const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
+static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs1,
+                                       const cpp::array<T, N> &inputs2) {
+  uint64_t baseline = UINT64_MAX;
+  for (int i = 0; i < 5; ++i)
+    baseline = cpp::min(baseline, throughput_baseline<T, N>(inputs1, inputs2));
+
   asm("" ::"v"(&inputs1), "v"(&inputs2));
 
   cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
@@ -141,18 +213,24 @@ template <typename F, typename T, size_t N>
 
   asm("" ::"s"(start));
 
-  for (size_t i = 0; i < inputs1.size(); i++) {
-    auto result = f(inputs1[i], inputs2[i]);
-
+  T result{};
+  for (size_t i = 0; i < N; i++) {
+    T x = inputs1[i];
+    T y = inputs2[i];
+    asm("" ::"v"(x), "v"(y));
+    result = f(x, y);
     asm("" ::"v"(result));
   }
 
   uint64_t stop = gpu::processor_clock();
-  asm("" ::"s"(stop));
   cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
+  asm("" ::"s"(stop));
 
-  // Return the time elapsed.
-  return stop - start;
+  volatile auto output = result;
+  (void)output;
+
+  const uint64_t measured = stop - start;
+  return measured > baseline ? (measured - baseline) : 0;
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
index a19c16ee4e44d..801080e7a6e98 100644
--- a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
+++ b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
@@ -7,6 +7,7 @@ add_header_library(
     libc.src.__support.common
     libc.src.__support.macros.config
     libc.src.__support.macros.attributes
-    libc.src.__support.CPP.type_traits
+    libc.src.__support.CPP.algorithm
     libc.src.__support.CPP.array
+    libc.src.__support.CPP.type_traits
 )
diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h
index 3c729636367aa..e0a069c6c5454 100644
--- a/libc/benchmarks/gpu/timing/nvptx/timing.h
+++ b/libc/benchmarks/gpu/timing/nvptx/timing.h
@@ -10,6 +10,7 @@
 #define LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
 
 #include "hdr/stdint_proxy.h"
+#include "src/__support/CPP/algorithm.h"
 #include "src/__support/CPP/array.h"
 #include "src/__support/CPP/atomic.h"
 #include "src/__support/CPP/type_traits.h"
@@ -95,10 +96,42 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
   return stop - start;
 }
 
-// Provides throughput benchmarking.
+// Provides the *baseline* for throughput: measures loop and measurement costs
+// without calling the f function
+template <typename T, size_t N>
+static LIBC_INLINE uint64_t
+throughput_baseline(const cpp::array<T, N> &inputs) {
+  asm("" ::"r"(&inputs));
+
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
+  uint64_t start = gpu::processor_clock();
+
+  asm("" ::"llr"(start));
+
+  T result{};
+  for (auto input : inputs) {
+    asm("" ::"r"(input));
+    result = input;
+    asm("" ::"r"(result));
+  }
+
+  uint64_t stop = gpu::processor_clock();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
+  asm("" ::"r"(stop));
+
+  volatile auto output = result;
+  (void)output;
+
+  return stop - start;
+}
+
+// Provides throughput benchmarking
 template <typename F, typename T, size_t N>
-[[gnu::noinline]] static LIBC_INLINE uint64_t
-throughput(F f, const cpp::array<T, N> &inputs) {
+static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs) {
+  uint64_t baseline = UINT64_MAX;
+  for (int i = 0; i < 5; ++i)
+    baseline = cpp::min(baseline, throughput_baseline<T, N>(inputs));
+
   asm("" ::"r"(&inputs));
 
   cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
@@ -106,7 +139,7 @@ throughput(F f, const cpp::array<T, N> &inputs) {
 
   asm("" ::"llr"(start));
 
-  uint64_t result;
+  T result{};
   for (auto input : inputs) {
     asm("" ::"r"(input));
     result = f(input);
@@ -116,16 +149,53 @@ throughput(F f, const cpp::array<T, N> &inputs) {
   uint64_t stop = gpu::processor_clock();
   cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   asm("" ::"r"(stop));
+
   volatile auto output = result;
+  (void)output;
+
+  const uint64_t measured = stop - start;
+  return measured > baseline ? (measured - baseline) : 0;
+}
+
+// Provides the *baseline* for throughput with 2 arguments: measures loop and
+// measurement costs without calling the f function
+template <typename T, size_t N>
+static LIBC_INLINE uint64_t throughput_baseline(
+    const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
+  asm("" ::"r"(&inputs1), "r"(&inputs2));
+
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
+  uint64_t start = gpu::processor_clock();
+
+  asm("" ::"llr"(start));
+
+  T result{};
+  for (size_t i = 0; i < N; i++) {
+    T x = inputs1[i];
+    T y = inputs2[i];
+    asm("" ::"r"(x), "r"(y));
+    result = x;
+    asm("" ::"r"(result));
+  }
+
+  uint64_t stop = gpu::processor_clock();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
+  asm("" ::"r"(stop));
+
+  volatile auto output = result;
+  (void)output;
 
-  // Return the time elapsed.
   return stop - start;
 }
 
 // Provides throughput benchmarking for 2 arguments (e.g. atan2())
 template <typename F, typename T, size_t N>
-[[gnu::noinline]] static LIBC_INLINE uint64_t throughput(
-    F f, const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
+static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs1,
+                                       const cpp::array<T, N> &inputs2) {
+  uint64_t baseline = UINT64_MAX;
+  for (int i = 0; i < 5; ++i)
+    baseline = cpp::min(baseline, throughput_baseline<T, N>(inputs1, inputs2));
+
   asm("" ::"r"(&inputs1), "r"(&inputs2));
 
   cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
@@ -133,20 +203,26 @@ template <typename F, typename T, size_t N>
 
   asm("" ::"llr"(start));
 
-  uint64_t result;
-  for (size_t i = 0; i < inputs1.size(); i++) {
-    result = f(inputs1[i], inputs2[i]);
+  T result{};
+  for (size_t i = 0; i < N; i++) {
+    T x = inputs1[i];
+    T y = inputs2[i];
+    asm("" ::"r"(x), "r"(y));
+    result = f(x, y);
     asm("" ::"r"(result));
   }
 
   uint64_t stop = gpu::processor_clock();
   cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   asm("" ::"r"(stop));
+
   volatile auto output = result;
+  (void)output;
 
-  // Return the time elapsed.
-  return stop - start;
+  const uint64_t measured = stop - start;
+  return measured > baseline ? (measured - baseline) : 0;
 }
+
 } // namespace LIBC_NAMESPACE_DECL
 
 #endif // LLVM_LIBC_UTILS_GPU_TIMING_NVPTX

>From dc7436f9842a42d7e6cc2328681a57cc3c16320f Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Wed, 13 Aug 2025 20:38:58 -0300
Subject: [PATCH 05/10] Adapt math benchmarks

---
 libc/benchmarks/gpu/src/math/CMakeLists.txt   | 10 -------
 .../gpu/src/math/atan2_benchmark.cpp          | 18 ++++++-------
 .../benchmarks/gpu/src/math/sin_benchmark.cpp | 26 +++++++------------
 3 files changed, 18 insertions(+), 36 deletions(-)

diff --git a/libc/benchmarks/gpu/src/math/CMakeLists.txt b/libc/benchmarks/gpu/src/math/CMakeLists.txt
index 7a12ce4e61c9e..8417f23c124a0 100644
--- a/libc/benchmarks/gpu/src/math/CMakeLists.txt
+++ b/libc/benchmarks/gpu/src/math/CMakeLists.txt
@@ -34,11 +34,6 @@ add_benchmark(
     libc.hdr.stdint_proxy
     libc.src.math.sin
     libc.src.math.sinf
-    libc.src.stdlib.srand
-    libc.src.stdlib.rand
-    libc.src.__support.FPUtil.fp_bits
-    libc.src.__support.CPP.bit
-    libc.src.__support.CPP.array
   COMPILE_OPTIONS
     ${math_benchmark_flags}
   LOADER_ARGS
@@ -54,11 +49,6 @@ add_benchmark(
   DEPENDS
     libc.hdr.stdint_proxy
     libc.src.math.atan2
-    libc.src.stdlib.srand
-    libc.src.stdlib.rand
-    libc.src.__support.FPUtil.fp_bits
-    libc.src.__support.CPP.bit
-    libc.src.__support.CPP.array
   COMPILE_OPTIONS
     ${math_benchmark_flags}
   LOADER_ARGS
diff --git a/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp b/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp
index 1f91a9a35c373..82bb0c5d7de49 100644
--- a/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp
@@ -1,27 +1,27 @@
 #include "benchmarks/gpu/LibcGpuBenchmark.h"
 
+#include "hdr/stdint_proxy.h"
 #include "src/math/atan2.h"
-#include "src/stdlib/rand.h"
 
 #if defined(NVPTX_MATH_FOUND) || defined(AMDGPU_MATH_FOUND)
 #include "platform.h"
 #endif
 
-#define BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, N)                      \
-  []() {                                                                       \
+#define BM_RANDOM_INPUTS(T, Func, MinExp, MaxExp, N)                           \
+  [](uint32_t call_index) {                                                    \
     return LIBC_NAMESPACE::benchmarks::MathPerf<T>::run_throughput_in_range<   \
-        N>(Func, MIN_EXP, MAX_EXP, MIN_EXP, MAX_EXP);                          \
+        N>(Func, MinExp, MaxExp, MinExp, MaxExp, call_index);                  \
   }
 
-#define BENCH(T, Name, Func, MIN_EXP, MAX_EXP)                                 \
+#define BENCH(T, Name, Func, MinExp, MaxExp)                                   \
   SINGLE_WAVE_BENCHMARK(LlvmLibcAtan2GpuBenchmark, Name##_1,                   \
-                        BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 1));    \
+                        BM_RANDOM_INPUTS(T, Func, MinExp, MaxExp, 1));         \
   SINGLE_WAVE_BENCHMARK(LlvmLibcAtan2GpuBenchmark, Name##_128,                 \
-                        BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 128));  \
+                        BM_RANDOM_INPUTS(T, Func, MinExp, MaxExp, 128));       \
   SINGLE_WAVE_BENCHMARK(LlvmLibcAtan2GpuBenchmark, Name##_1024,                \
-                        BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 1024)); \
+                        BM_RANDOM_INPUTS(T, Func, MinExp, MaxExp, 1024));      \
   SINGLE_WAVE_BENCHMARK(LlvmLibcAtan2GpuBenchmark, Name##_4096,                \
-                        BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 4096))
+                        BM_RANDOM_INPUTS(T, Func, MinExp, MaxExp, 4096))
 
 BENCH(double, Atan2, LIBC_NAMESPACE::atan2, -1023, 1023);
 BENCH(double, Atan2TwoPi, LIBC_NAMESPACE::atan2, -10, 3);
diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
index a759db2e9d33f..5fe95c3f3b268 100644
--- a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
@@ -1,36 +1,28 @@
 #include "benchmarks/gpu/LibcGpuBenchmark.h"
 
-#include "src/__support/CPP/array.h"
-#include "src/__support/CPP/bit.h"
-#include "src/__support/CPP/functional.h"
-#include "src/__support/FPUtil/FPBits.h"
+#include "hdr/stdint_proxy.h"
 #include "src/math/sin.h"
 #include "src/math/sinf.h"
-#include "src/stdlib/rand.h"
 
 #if defined(NVPTX_MATH_FOUND) || defined(AMDGPU_MATH_FOUND)
 #include "platform.h"
 #endif
 
-// BENCHMARK() expects a function that with no parameters that returns a
-// uint64_t representing the latency. Defining each benchmark using macro that
-// expands to a lambda to allow us to switch the implementation of `sin()` to
-// easily register NVPTX benchmarks.
-#define BM_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, N)                          \
-  []() {                                                                       \
+#define BM_RANDOM_INPUT(T, Func, MinExp, MaxExp, N)                            \
+  [](uint32_t call_index) {                                                    \
     return LIBC_NAMESPACE::benchmarks::MathPerf<T>::run_throughput_in_range<   \
-        N>(Func, MIN_EXP, MAX_EXP);                                            \
+        N>(Func, MinExp, MaxExp, call_index);                                  \
   }
 
-#define BENCH(T, Name, Func, MIN_EXP, MAX_EXP)                                 \
+#define BENCH(T, Name, Func, MinExp, MaxExp)                                   \
   SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_1,                     \
-                        BM_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 1));        \
+                        BM_RANDOM_INPUT(T, Func, MinExp, MaxExp, 1));          \
   SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_128,                   \
-                        BM_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 128));      \
+                        BM_RANDOM_INPUT(T, Func, MinExp, MaxExp, 128));        \
   SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_1024,                  \
-                        BM_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 1024));     \
+                        BM_RANDOM_INPUT(T, Func, MinExp, MaxExp, 1024));       \
   SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_4096,                  \
-                        BM_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 4096))
+                        BM_RANDOM_INPUT(T, Func, MinExp, MaxExp, 4096))
 
 BENCH(double, Sin, LIBC_NAMESPACE::sin, -1023, 1023);
 BENCH(double, SinTwoPi, LIBC_NAMESPACE::sin, -10, 3);

>From fc49b8ebf6e4fb8975161c8dd80d7c2ff5f50ad0 Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Wed, 13 Aug 2025 21:03:27 -0300
Subject: [PATCH 06/10] Conform to LLVM style

---
 libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index 13769d063e1b9..28a4ebfc6df19 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -220,11 +220,10 @@ benchmark(const BenchmarkOptions &options,
   result.samples = samples;
 
   result.total_iterations = estimator.get_iterations();
-  if (result.total_iterations > 0) {
+  if (result.total_iterations > 0)
     result.total_time = total_time / result.total_iterations;
-  } else {
+  else
     result.total_time = 0;
-  }
 
   return result;
 }

>From cfa98380a8cf5be8a3a52a2d0af2ac915255a644 Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Wed, 13 Aug 2025 21:25:32 -0300
Subject: [PATCH 07/10] Reorder methods in `RefinableRuntimeEstimator`

---
 libc/benchmarks/gpu/LibcGpuBenchmark.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 21a074467c268..c4088d90f80fa 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -43,6 +43,12 @@ class RefinableRuntimeEstimator {
     sum_of_squared_cycles += cycles * cycles;
   }
 
+  void update(const RefinableRuntimeEstimator &other) noexcept {
+    iterations += other.iterations;
+    sum_of_cycles += other.sum_of_cycles;
+    sum_of_squared_cycles += other.sum_of_squared_cycles;
+  }
+
   double get_mean() const noexcept {
     if (iterations == 0)
       return 0.0;
@@ -50,12 +56,6 @@ class RefinableRuntimeEstimator {
     return static_cast<double>(sum_of_cycles) / iterations;
   }
 
-  void update(const RefinableRuntimeEstimator &other) noexcept {
-    iterations += other.iterations;
-    sum_of_cycles += other.sum_of_cycles;
-    sum_of_squared_cycles += other.sum_of_squared_cycles;
-  }
-
   double get_variance() const noexcept {
     if (iterations == 0)
       return 0.0;

>From 96e0bae3602f3abaf399b04779c0b32a79a70057 Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Thu, 14 Aug 2025 14:52:46 -0300
Subject: [PATCH 08/10] Remove redundant `(void)output;`

---
 libc/benchmarks/gpu/timing/amdgpu/timing.h | 16 ++++------------
 libc/benchmarks/gpu/timing/nvptx/timing.h  | 16 ++++------------
 2 files changed, 8 insertions(+), 24 deletions(-)

diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h
index 90ff9e33c08bb..de721a2d6ce6b 100644
--- a/libc/benchmarks/gpu/timing/amdgpu/timing.h
+++ b/libc/benchmarks/gpu/timing/amdgpu/timing.h
@@ -115,7 +115,6 @@ throughput_baseline(const cpp::array<T, N> &inputs) {
 
   cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   uint64_t start = gpu::processor_clock();
-
   asm("" ::"s"(start));
 
   T result{};
@@ -126,11 +125,10 @@ throughput_baseline(const cpp::array<T, N> &inputs) {
   }
 
   uint64_t stop = gpu::processor_clock();
-  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   asm("" ::"s"(stop));
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
 
   volatile auto output = result;
-  (void)output;
 
   return stop - start;
 }
@@ -146,7 +144,6 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs) {
 
   cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   uint64_t start = gpu::processor_clock();
-
   asm("" ::"s"(start));
 
   T result{};
@@ -157,11 +154,10 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs) {
   }
 
   uint64_t stop = gpu::processor_clock();
-  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   asm("" ::"s"(stop));
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
 
   volatile auto output = result;
-  (void)output;
 
   const uint64_t measured = stop - start;
   return measured > baseline ? (measured - baseline) : 0;
@@ -176,7 +172,6 @@ static LIBC_INLINE uint64_t throughput_baseline(
 
   cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   uint64_t start = gpu::processor_clock();
-
   asm("" ::"s"(start));
 
   T result{};
@@ -189,11 +184,10 @@ static LIBC_INLINE uint64_t throughput_baseline(
   }
 
   uint64_t stop = gpu::processor_clock();
-  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   asm("" ::"s"(stop));
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
 
   volatile auto output = result;
-  (void)output;
 
   return stop - start;
 }
@@ -210,7 +204,6 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs1,
 
   cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   uint64_t start = gpu::processor_clock();
-
   asm("" ::"s"(start));
 
   T result{};
@@ -223,11 +216,10 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs1,
   }
 
   uint64_t stop = gpu::processor_clock();
-  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   asm("" ::"s"(stop));
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
 
   volatile auto output = result;
-  (void)output;
 
   const uint64_t measured = stop - start;
   return measured > baseline ? (measured - baseline) : 0;
diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h
index e0a069c6c5454..133032ca08423 100644
--- a/libc/benchmarks/gpu/timing/nvptx/timing.h
+++ b/libc/benchmarks/gpu/timing/nvptx/timing.h
@@ -105,7 +105,6 @@ throughput_baseline(const cpp::array<T, N> &inputs) {
 
   cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   uint64_t start = gpu::processor_clock();
-
   asm("" ::"llr"(start));
 
   T result{};
@@ -116,11 +115,10 @@ throughput_baseline(const cpp::array<T, N> &inputs) {
   }
 
   uint64_t stop = gpu::processor_clock();
-  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   asm("" ::"r"(stop));
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
 
   volatile auto output = result;
-  (void)output;
 
   return stop - start;
 }
@@ -136,7 +134,6 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs) {
 
   cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   uint64_t start = gpu::processor_clock();
-
   asm("" ::"llr"(start));
 
   T result{};
@@ -147,11 +144,10 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs) {
   }
 
   uint64_t stop = gpu::processor_clock();
-  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   asm("" ::"r"(stop));
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
 
   volatile auto output = result;
-  (void)output;
 
   const uint64_t measured = stop - start;
   return measured > baseline ? (measured - baseline) : 0;
@@ -166,7 +162,6 @@ static LIBC_INLINE uint64_t throughput_baseline(
 
   cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   uint64_t start = gpu::processor_clock();
-
   asm("" ::"llr"(start));
 
   T result{};
@@ -179,11 +174,10 @@ static LIBC_INLINE uint64_t throughput_baseline(
   }
 
   uint64_t stop = gpu::processor_clock();
-  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   asm("" ::"r"(stop));
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
 
   volatile auto output = result;
-  (void)output;
 
   return stop - start;
 }
@@ -200,7 +194,6 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs1,
 
   cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   uint64_t start = gpu::processor_clock();
-
   asm("" ::"llr"(start));
 
   T result{};
@@ -213,11 +206,10 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs1,
   }
 
   uint64_t stop = gpu::processor_clock();
-  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   asm("" ::"r"(stop));
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
 
   volatile auto output = result;
-  (void)output;
 
   const uint64_t measured = stop - start;
   return measured > baseline ? (measured - baseline) : 0;

>From a11e7754c5fd7cac1f0b36252845a5bada11353b Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Thu, 14 Aug 2025 20:08:53 -0300
Subject: [PATCH 09/10] Allow index-less benchmarks via `BenchmarkTarget`
 wrapper

---
 libc/benchmarks/gpu/CMakeLists.txt            |  1 -
 libc/benchmarks/gpu/LibcGpuBenchmark.cpp      |  8 ++--
 libc/benchmarks/gpu/LibcGpuBenchmark.h        | 44 +++++++++++++++----
 libc/benchmarks/gpu/src/ctype/CMakeLists.txt  |  2 -
 .../gpu/src/ctype/isalnum_benchmark.cpp       |  7 ++-
 .../gpu/src/ctype/isalpha_benchmark.cpp       |  3 +-
 6 files changed, 43 insertions(+), 22 deletions(-)

diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index ce3b0228c2076..beedac78d4826 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -49,7 +49,6 @@ add_unittest_framework_library(
     libc.src.__support.CPP.string
     libc.src.__support.CPP.string_view
     libc.src.__support.CPP.type_traits
-    libc.src.__support.CPP.functional
     libc.src.__support.CPP.limits
     libc.src.__support.CPP.algorithm
     libc.src.__support.CPP.atomic
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index 28a4ebfc6df19..93eededd6295a 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -1,4 +1,5 @@
 #include "LibcGpuBenchmark.h"
+
 #include "hdr/stdint_proxy.h"
 #include "src/__support/CPP/algorithm.h"
 #include "src/__support/CPP/array.h"
@@ -161,9 +162,8 @@ void Benchmark::run_benchmarks() {
   gpu::sync_threads();
 }
 
-BenchmarkResult
-benchmark(const BenchmarkOptions &options,
-          const cpp::function<uint64_t(uint32_t)> &wrapper_func) {
+BenchmarkResult benchmark(const BenchmarkOptions &options,
+                          const BenchmarkTarget &target) {
   BenchmarkResult result;
   RuntimeEstimationProgression rep;
   uint32_t iterations = options.initial_iterations;
@@ -183,7 +183,7 @@ benchmark(const BenchmarkOptions &options,
 
     const clock_t start = clock();
     while (sample_estimator.get_iterations() < iterations) {
-      auto current_result = wrapper_func(call_index++);
+      auto current_result = target(call_index++);
       max = cpp::max(max, current_result);
       min = cpp::min(min, current_result);
       sample_estimator.update(current_result);
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index c4088d90f80fa..2ba441c6d1c95 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -6,7 +6,6 @@
 #include "hdr/stdint_proxy.h"
 #include "src/__support/CPP/algorithm.h"
 #include "src/__support/CPP/array.h"
-#include "src/__support/CPP/functional.h"
 #include "src/__support/CPP/limits.h"
 #include "src/__support/CPP/string_view.h"
 #include "src/__support/CPP/type_traits.h"
@@ -121,21 +120,48 @@ struct BenchmarkResult {
   clock_t total_time = 0;
 };
 
-BenchmarkResult
-benchmark(const BenchmarkOptions &options,
-          const cpp::function<uint64_t(uint32_t)> &wrapper_func);
+struct BenchmarkTarget {
+  using IndexedFnPtr = uint64_t (*)(uint32_t);
+  using IndexlessFnPtr = uint64_t (*)();
+
+  enum class Kind : uint8_t { Indexed, Indexless } kind;
+  union {
+    IndexedFnPtr indexed_fn_ptr;
+    IndexlessFnPtr indexless_fn_ptr;
+  };
+
+  LIBC_INLINE BenchmarkTarget(IndexedFnPtr func)
+      : kind(Kind::Indexed), indexed_fn_ptr(func) {}
+  LIBC_INLINE BenchmarkTarget(IndexlessFnPtr func)
+      : kind(Kind::Indexless), indexless_fn_ptr(func) {}
+
+  LIBC_INLINE uint64_t operator()([[maybe_unused]] uint32_t call_index) const {
+    return kind == Kind::Indexed ? indexed_fn_ptr(call_index)
+                                 : indexless_fn_ptr();
+  }
+};
+
+BenchmarkResult benchmark(const BenchmarkOptions &options,
+                          const BenchmarkTarget &target);
 
 class Benchmark {
-  const cpp::function<uint64_t(uint32_t)> func;
+  const BenchmarkTarget target;
   const cpp::string_view suite_name;
   const cpp::string_view test_name;
   const uint32_t num_threads;
 
 public:
-  Benchmark(cpp::function<uint64_t(uint32_t)> func, char const *suite_name,
+  Benchmark(uint64_t (*f)(), const char *suite, const char *test,
+            uint32_t threads)
+      : target(BenchmarkTarget(f)), suite_name(suite), test_name(test),
+        num_threads(threads) {
+    add_benchmark(this);
+  }
+
+  Benchmark(uint64_t (*f)(uint32_t), char const *suite_name,
             char const *test_name, uint32_t num_threads)
-      : func(func), suite_name(suite_name), test_name(test_name),
-        num_threads(num_threads) {
+      : target(BenchmarkTarget(f)), suite_name(suite_name),
+        test_name(test_name), num_threads(num_threads) {
     add_benchmark(this);
   }
 
@@ -149,7 +175,7 @@ class Benchmark {
 private:
   BenchmarkResult run() {
     BenchmarkOptions options;
-    return benchmark(options, func);
+    return benchmark(options, target);
   }
 };
 
diff --git a/libc/benchmarks/gpu/src/ctype/CMakeLists.txt b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt
index 77e2bbe538b1f..f277624dbb901 100644
--- a/libc/benchmarks/gpu/src/ctype/CMakeLists.txt
+++ b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt
@@ -7,7 +7,6 @@ add_benchmark(
   SRCS
     isalnum_benchmark.cpp
   DEPENDS
-    libc.hdr.stdint_proxy
     libc.src.ctype.isalnum
   LOADER_ARGS
     --threads 64
@@ -20,6 +19,5 @@ add_benchmark(
   SRCS
     isalpha_benchmark.cpp
   DEPENDS
-    libc.hdr.stdint_proxy
     libc.src.ctype.isalpha
 )
diff --git a/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
index 28b1ee52c8dfa..ffa5a99860bfc 100644
--- a/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
@@ -1,9 +1,8 @@
 #include "benchmarks/gpu/LibcGpuBenchmark.h"
 
-#include "hdr/stdint_proxy.h"
 #include "src/ctype/isalnum.h"
 
-uint64_t BM_IsAlnum(uint32_t /*call_index*/) {
+uint64_t BM_IsAlnum() {
   char x = 'c';
   return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x);
 }
@@ -13,13 +12,13 @@ SINGLE_THREADED_BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumSingleThread,
 SINGLE_WAVE_BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumSingleWave,
                       BM_IsAlnum);
 
-uint64_t BM_IsAlnumCapital(uint32_t /*call_index*/) {
+uint64_t BM_IsAlnumCapital() {
   char x = 'A';
   return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x);
 }
 BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumCapital, BM_IsAlnumCapital);
 
-uint64_t BM_IsAlnumNotAlnum(uint32_t /*call_index*/) {
+uint64_t BM_IsAlnumNotAlnum() {
   char x = '{';
   return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x);
 }
diff --git a/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp b/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp
index bff4edea8b690..2038eb89bc77b 100644
--- a/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp
@@ -1,9 +1,8 @@
 #include "benchmarks/gpu/LibcGpuBenchmark.h"
 
-#include "hdr/stdint_proxy.h"
 #include "src/ctype/isalpha.h"
 
-uint64_t BM_IsAlpha(uint32_t /*call_index*/) {
+uint64_t BM_IsAlpha() {
   char x = 'c';
   return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalpha, x);
 }

>From b6860825e3851226fe7958bfd1239c4f11e03db0 Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Fri, 15 Aug 2025 00:21:14 -0300
Subject: [PATCH 10/10] Correct statistics aggregation and reporting

---
 libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 141 ++++++++++++-----------
 libc/benchmarks/gpu/LibcGpuBenchmark.h   |   6 +-
 2 files changed, 78 insertions(+), 69 deletions(-)

diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index 93eededd6295a..ef816c51a87d7 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -5,6 +5,7 @@
 #include "src/__support/CPP/array.h"
 #include "src/__support/CPP/atomic.h"
 #include "src/__support/CPP/string.h"
+#include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/FPUtil/sqrt.h"
 #include "src/__support/GPU/utils.h"
 #include "src/__support/fixedvector.h"
@@ -21,37 +22,56 @@ void Benchmark::add_benchmark(Benchmark *benchmark) {
   benchmarks.push_back(benchmark);
 }
 
+static void atomic_add_double(cpp::Atomic<uint64_t> &atomic_bits,
+                              double value) {
+  using FPBits = LIBC_NAMESPACE::fputil::FPBits<double>;
+
+  uint64_t expected_bits = atomic_bits.load(cpp::MemoryOrder::RELAXED);
+
+  while (true) {
+    double current_value = FPBits(expected_bits).get_val();
+    double next_value = current_value + value;
+
+    uint64_t desired_bits = FPBits(next_value).uintval();
+    if (atomic_bits.compare_exchange_strong(expected_bits, desired_bits,
+                                            cpp::MemoryOrder::ACQUIRE,
+                                            cpp::MemoryOrder::RELAXED))
+      break;
+  }
+}
+
 struct AtomicBenchmarkSums {
-  cpp::Atomic<uint64_t> cycles_sum = 0;
-  cpp::Atomic<uint64_t> standard_deviation_sum = 0;
+  cpp::Atomic<uint32_t> active_threads = 0;
+  cpp::Atomic<uint64_t> iterations_sum = 0;
+  cpp::Atomic<uint64_t> weighted_cycles_sum_bits = 0;
+  cpp::Atomic<uint64_t> weighted_squared_cycles_sum_bits = 0;
   cpp::Atomic<uint64_t> min = UINT64_MAX;
   cpp::Atomic<uint64_t> max = 0;
-  cpp::Atomic<uint32_t> samples_sum = 0;
-  cpp::Atomic<uint32_t> iterations_sum = 0;
-  cpp::Atomic<clock_t> time_sum = 0;
-  cpp::Atomic<uint64_t> active_threads = 0;
 
   void reset() {
     cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
     active_threads.store(0, cpp::MemoryOrder::RELAXED);
-    cycles_sum.store(0, cpp::MemoryOrder::RELAXED);
-    standard_deviation_sum.store(0, cpp::MemoryOrder::RELAXED);
+    iterations_sum.store(0, cpp::MemoryOrder::RELAXED);
+    weighted_cycles_sum_bits.store(0, cpp::MemoryOrder::RELAXED);
+    weighted_squared_cycles_sum_bits.store(0, cpp::MemoryOrder::RELAXED);
     min.store(UINT64_MAX, cpp::MemoryOrder::RELAXED);
     max.store(0, cpp::MemoryOrder::RELAXED);
-    samples_sum.store(0, cpp::MemoryOrder::RELAXED);
-    iterations_sum.store(0, cpp::MemoryOrder::RELAXED);
-    time_sum.store(0, cpp::MemoryOrder::RELAXED);
     cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
   }
 
   void update(const BenchmarkResult &result) {
     cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
     active_threads.fetch_add(1, cpp::MemoryOrder::RELAXED);
+    iterations_sum.fetch_add(result.total_iterations,
+                             cpp::MemoryOrder::RELAXED);
 
-    cycles_sum.fetch_add(result.cycles, cpp::MemoryOrder::RELAXED);
-    standard_deviation_sum.fetch_add(
-        static_cast<uint64_t>(result.standard_deviation),
-        cpp::MemoryOrder::RELAXED);
+    const double n_i = static_cast<double>(result.total_iterations);
+    const double mean_i = result.cycles;
+    const double stddev_i = result.standard_deviation;
+    const double variance_i = stddev_i * stddev_i;
+    atomic_add_double(weighted_cycles_sum_bits, n_i * mean_i);
+    atomic_add_double(weighted_squared_cycles_sum_bits,
+                      n_i * (variance_i + mean_i * mean_i));
 
     // Perform a CAS loop to atomically update the min
     uint64_t orig_min = min.load(cpp::MemoryOrder::RELAXED);
@@ -67,10 +87,6 @@ struct AtomicBenchmarkSums {
         cpp::MemoryOrder::RELAXED))
       ;
 
-    samples_sum.fetch_add(result.samples, cpp::MemoryOrder::RELAXED);
-    iterations_sum.fetch_add(result.total_iterations,
-                             cpp::MemoryOrder::RELAXED);
-    time_sum.fetch_add(result.total_time, cpp::MemoryOrder::RELAXED);
     cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
   }
 };
@@ -80,46 +96,49 @@ constexpr auto GREEN = "\033[32m";
 constexpr auto RESET = "\033[0m";
 
 void print_results(Benchmark *b) {
-  BenchmarkResult result;
+  using FPBits = LIBC_NAMESPACE::fputil::FPBits<double>;
+
+  BenchmarkResult final_result;
   cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
-  int num_threads = all_results.active_threads.load(cpp::MemoryOrder::RELAXED);
-  result.cycles =
-      all_results.cycles_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
-  result.standard_deviation =
-      all_results.standard_deviation_sum.load(cpp::MemoryOrder::RELAXED) /
-      num_threads;
-  result.min = all_results.min.load(cpp::MemoryOrder::RELAXED);
-  result.max = all_results.max.load(cpp::MemoryOrder::RELAXED);
-  result.samples =
-      all_results.samples_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
-  result.total_iterations =
-      all_results.iterations_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
-  const uint64_t duration_ns =
-      all_results.time_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
-  const uint64_t duration_us = duration_ns / 1000;
-  const uint64_t duration_ms = duration_ns / (1000 * 1000);
-  uint64_t converted_duration = duration_ns;
-  const char *time_unit;
-  if (duration_ms != 0) {
-    converted_duration = duration_ms;
-    time_unit = "ms";
-  } else if (duration_us != 0) {
-    converted_duration = duration_us;
-    time_unit = "us";
+
+  const uint32_t num_threads =
+      all_results.active_threads.load(cpp::MemoryOrder::RELAXED);
+  final_result.total_iterations =
+      all_results.iterations_sum.load(cpp::MemoryOrder::RELAXED);
+
+  if (final_result.total_iterations > 0) {
+    const uint64_t s1_bits =
+        all_results.weighted_cycles_sum_bits.load(cpp::MemoryOrder::RELAXED);
+    const uint64_t s2_bits = all_results.weighted_squared_cycles_sum_bits.load(
+        cpp::MemoryOrder::RELAXED);
+
+    const double S1 = FPBits(s1_bits).get_val();
+    const double S2 = FPBits(s2_bits).get_val();
+    const double N = static_cast<double>(final_result.total_iterations);
+
+    const double global_mean = S1 / N;
+    const double global_mean_of_squares = S2 / N;
+    const double global_variance =
+        global_mean_of_squares - (global_mean * global_mean);
+
+    final_result.cycles = global_mean;
+    final_result.standard_deviation =
+        fputil::sqrt<double>(global_variance < 0.0 ? 0.0 : global_variance);
   } else {
-    converted_duration = duration_ns;
-    time_unit = "ns";
+    final_result.cycles = 0.0;
+    final_result.standard_deviation = 0.0;
   }
-  result.total_time = converted_duration;
-  // result.total_time =
-  //     all_results.time_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
+
+  final_result.min = all_results.min.load(cpp::MemoryOrder::RELAXED);
+  final_result.max = all_results.max.load(cpp::MemoryOrder::RELAXED);
   cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
 
   LIBC_NAMESPACE::printf(
-      "%-24s |%8ld |%8ld |%8ld |%11d |%14ld %2s |%9ld |%9d |\n",
-      b->get_test_name().data(), result.cycles, result.min, result.max,
-      result.total_iterations, result.total_time, time_unit,
-      static_cast<uint64_t>(result.standard_deviation), num_threads);
+      "%-24s |%15.0f |%9.0f |%8llu |%8llu |%11llu |%9u |\n",
+      b->get_test_name().data(), final_result.cycles,
+      final_result.standard_deviation, (unsigned long long)final_result.min,
+      (unsigned long long)final_result.max,
+      (unsigned long long)final_result.total_iterations, (unsigned)num_threads);
 }
 
 void print_header() {
@@ -127,9 +146,8 @@ void print_header() {
   LIBC_NAMESPACE::printf("Running Suite: %-10s\n",
                          benchmarks[0]->get_suite_name().data());
   LIBC_NAMESPACE::printf("%s", RESET);
-  cpp::string titles =
-      "Benchmark                |  Cycles |     Min |     Max | "
-      "Iterations | Time / Iteration |   Stddev |  Threads |\n";
+  cpp::string titles = "Benchmark                |  Cycles (Mean) |   Stddev | "
+                       "    Min |     Max | Iterations |  Threads |\n";
   LIBC_NAMESPACE::printf(titles.data());
 
   cpp::string separator(titles.size(), '-');
@@ -212,18 +230,11 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
   }
 
   const auto &estimator = rep.get_estimator();
-  result.cycles = static_cast<uint64_t>(estimator.get_mean());
+  result.total_iterations = estimator.get_iterations();
+  result.cycles = estimator.get_mean();
   result.standard_deviation = estimator.get_stddev();
-
   result.min = min;
   result.max = max;
-  result.samples = samples;
-
-  result.total_iterations = estimator.get_iterations();
-  if (result.total_iterations > 0)
-    result.total_time = total_time / result.total_iterations;
-  else
-    result.total_time = 0;
 
   return result;
 }
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 2ba441c6d1c95..60f69edf86556 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -111,13 +111,11 @@ class RuntimeEstimationProgression {
 };
 
 struct BenchmarkResult {
-  uint64_t cycles = 0;
+  uint64_t total_iterations = 0;
+  double cycles = 0;
   double standard_deviation = 0;
   uint64_t min = UINT64_MAX;
   uint64_t max = 0;
-  uint32_t samples = 0;
-  uint32_t total_iterations = 0;
-  clock_t total_time = 0;
 };
 
 struct BenchmarkTarget {