[libc-commits] [libc] 08ff017 - [libc] Improve GPU benchmarking (#153512)

Fri Aug 15 09:00:20 PDT 2025

Author: Leandro Lacerda
Date: 2025-08-15T11:00:17-05:00
New Revision: 08ff017fb0c9c7c3c91858023ea45149449fbbfc

URL: https://github.com/llvm/llvm-project/commit/08ff017fb0c9c7c3c91858023ea45149449fbbfc
DIFF: https://github.com/llvm/llvm-project/commit/08ff017fb0c9c7c3c91858023ea45149449fbbfc.diff

LOG: [libc] Improve GPU benchmarking (#153512)

This patch improves the GPU benchmarking in this way:

* Replace `rand`/`srand` with a deterministic per-thread RNG seeded by
`call_index`: reproducible, apples-to-apples libc vs vendor comparisons.
* Fix input generation: sample the unbiased exponent uniformly in
`[min_exp, max_exp]`, clamp bounds, and skip `Inf`, `NaN`, `-0.0`, and
`+0.0`.
* Fix standard deviation: use an explicit estimator from sums and
sums-of-squares (`sqrt(E[x^2] − E[x]^2)`) across samples.
* Fix throughput overhead: subtract a loop-only baseline inside
NVPTX/AMDGPU timing backends so `benchmark()` gets cycles-per-call
already corrected (no `overhead()` call).
* Adapt existing math benchmarks to the new RNG/timing plumbing (plumb
`call_index`, drop `rand/srand`, clean includes).
* Correct inter-thread aggregation: use iteration-weighted pooling to
compute the global mean/variance, ensuring statistically sound `Cycles
(Mean)` and `Stddev`.
* Remove `Time / Iteration` column from the results table: it reported
per-thread convergence time (not per-call latency) and was
redundant/misleading next to `Cycles (Mean)`.
* Remove unused `BenchmarkLogger` files: dead code that added
maintenance and cognitive overhead without providing functionality.

---

## TODO (before merge)

* [ ] Investigate compiler warnings and address their root causes.
* [x] Review how per-thread results are aggregated into the overall
result.

## Follow-ups (future PRs)

* Add support to run throughput benchmarks with uniform (linear) input
distributions, alongside the current log2-uniform scheme.
* Review/adjust the configuration and coverage of existing math
benchmarks.
* Add more math benchmarks (e.g., `exp`/`expf`, others).

Added: 
    

Modified: 
    libc/benchmarks/gpu/CMakeLists.txt
    libc/benchmarks/gpu/LibcGpuBenchmark.cpp
    libc/benchmarks/gpu/LibcGpuBenchmark.h
    libc/benchmarks/gpu/src/math/CMakeLists.txt
    libc/benchmarks/gpu/src/math/atan2_benchmark.cpp
    libc/benchmarks/gpu/src/math/sin_benchmark.cpp
    libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
    libc/benchmarks/gpu/timing/amdgpu/timing.h
    libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
    libc/benchmarks/gpu/timing/nvptx/timing.h

Removed: 
    


################################################################################
diff  --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index 6ec64bf270b53..beedac78d4826 100644

--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -22,8 +22,6 @@ function(add_benchmark benchmark_name)
       ${BENCHMARK_LINK_LIBRARIES}
     DEPENDS
       libc.src.stdio.printf
-      libc.src.stdlib.srand
-      libc.src.stdlib.rand
       ${BENCHMARK_DEPENDS}
     ${BENCHMARK_UNPARSED_ARGUMENTS}
     COMPILE_OPTIONS
@@ -51,7 +49,6 @@ add_unittest_framework_library(
     libc.src.__support.CPP.string
     libc.src.__support.CPP.string_view
     libc.src.__support.CPP.type_traits
-    libc.src.__support.CPP.functional
     libc.src.__support.CPP.limits
     libc.src.__support.CPP.algorithm
     libc.src.__support.CPP.atomic
@@ -64,8 +61,6 @@ add_unittest_framework_library(
     libc.src.__support.FPUtil.sqrt
     libc.src.__support.fixedvector
     libc.src.time.clock
-    libc.src.stdlib.rand
-    libc.src.stdlib.srand
     libc.benchmarks.gpu.timing.timing
     libc.src.stdio.printf
 )

diff  --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index 57ff5b9fdb846..ef816c51a87d7 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -1,15 +1,17 @@
 #include "LibcGpuBenchmark.h"
+
+#include "hdr/stdint_proxy.h"
 #include "src/__support/CPP/algorithm.h"
 #include "src/__support/CPP/array.h"
 #include "src/__support/CPP/atomic.h"
 #include "src/__support/CPP/string.h"
+#include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/FPUtil/sqrt.h"
 #include "src/__support/GPU/utils.h"
 #include "src/__support/fixedvector.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/time/gpu/time_utils.h"
 #include "src/stdio/printf.h"
-#include "src/stdlib/srand.h"
 
 namespace LIBC_NAMESPACE_DECL {
 namespace benchmarks {
@@ -20,37 +22,56 @@ void Benchmark::add_benchmark(Benchmark *benchmark) {
   benchmarks.push_back(benchmark);
 }
 
+static void atomic_add_double(cpp::Atomic<uint64_t> &atomic_bits,
+                              double value) {
+  using FPBits = LIBC_NAMESPACE::fputil::FPBits<double>;
+
+  uint64_t expected_bits = atomic_bits.load(cpp::MemoryOrder::RELAXED);
+
+  while (true) {
+    double current_value = FPBits(expected_bits).get_val();
+    double next_value = current_value + value;
+
+    uint64_t desired_bits = FPBits(next_value).uintval();
+    if (atomic_bits.compare_exchange_strong(expected_bits, desired_bits,
+                                            cpp::MemoryOrder::ACQUIRE,
+                                            cpp::MemoryOrder::RELAXED))
+      break;
+  }
+}
+
 struct AtomicBenchmarkSums {
-  cpp::Atomic<uint64_t> cycles_sum = 0;
-  cpp::Atomic<uint64_t> standard_deviation_sum = 0;
+  cpp::Atomic<uint32_t> active_threads = 0;
+  cpp::Atomic<uint64_t> iterations_sum = 0;
+  cpp::Atomic<uint64_t> weighted_cycles_sum_bits = 0;
+  cpp::Atomic<uint64_t> weighted_squared_cycles_sum_bits = 0;
   cpp::Atomic<uint64_t> min = UINT64_MAX;
   cpp::Atomic<uint64_t> max = 0;
-  cpp::Atomic<uint32_t> samples_sum = 0;
-  cpp::Atomic<uint32_t> iterations_sum = 0;
-  cpp::Atomic<clock_t> time_sum = 0;
-  cpp::Atomic<uint64_t> active_threads = 0;
 
   void reset() {
     cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
     active_threads.store(0, cpp::MemoryOrder::RELAXED);
-    cycles_sum.store(0, cpp::MemoryOrder::RELAXED);
-    standard_deviation_sum.store(0, cpp::MemoryOrder::RELAXED);
+    iterations_sum.store(0, cpp::MemoryOrder::RELAXED);
+    weighted_cycles_sum_bits.store(0, cpp::MemoryOrder::RELAXED);
+    weighted_squared_cycles_sum_bits.store(0, cpp::MemoryOrder::RELAXED);
     min.store(UINT64_MAX, cpp::MemoryOrder::RELAXED);
     max.store(0, cpp::MemoryOrder::RELAXED);
-    samples_sum.store(0, cpp::MemoryOrder::RELAXED);
-    iterations_sum.store(0, cpp::MemoryOrder::RELAXED);
-    time_sum.store(0, cpp::MemoryOrder::RELAXED);
     cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
   }
 
   void update(const BenchmarkResult &result) {
     cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
     active_threads.fetch_add(1, cpp::MemoryOrder::RELAXED);
+    iterations_sum.fetch_add(result.total_iterations,
+                             cpp::MemoryOrder::RELAXED);
 
-    cycles_sum.fetch_add(result.cycles, cpp::MemoryOrder::RELAXED);
-    standard_deviation_sum.fetch_add(
-        static_cast<uint64_t>(result.standard_deviation),
-        cpp::MemoryOrder::RELAXED);
+    const double n_i = static_cast<double>(result.total_iterations);
+    const double mean_i = result.cycles;
+    const double stddev_i = result.standard_deviation;
+    const double variance_i = stddev_i * stddev_i;
+    atomic_add_double(weighted_cycles_sum_bits, n_i * mean_i);
+    atomic_add_double(weighted_squared_cycles_sum_bits,
+                      n_i * (variance_i + mean_i * mean_i));
 
     // Perform a CAS loop to atomically update the min
     uint64_t orig_min = min.load(cpp::MemoryOrder::RELAXED);
@@ -66,10 +87,6 @@ struct AtomicBenchmarkSums {
         cpp::MemoryOrder::RELAXED))
       ;
 
-    samples_sum.fetch_add(result.samples, cpp::MemoryOrder::RELAXED);
-    iterations_sum.fetch_add(result.total_iterations,
-                             cpp::MemoryOrder::RELAXED);
-    time_sum.fetch_add(result.total_time, cpp::MemoryOrder::RELAXED);
     cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
   }
 };
@@ -79,46 +96,49 @@ constexpr auto GREEN = "\033[32m";
 constexpr auto RESET = "\033[0m";
 
 void print_results(Benchmark *b) {
-  BenchmarkResult result;
+  using FPBits = LIBC_NAMESPACE::fputil::FPBits<double>;
+
+  BenchmarkResult final_result;
   cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
-  int num_threads = all_results.active_threads.load(cpp::MemoryOrder::RELAXED);
-  result.cycles =
-      all_results.cycles_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
-  result.standard_deviation =
-      all_results.standard_deviation_sum.load(cpp::MemoryOrder::RELAXED) /
-      num_threads;
-  result.min = all_results.min.load(cpp::MemoryOrder::RELAXED);
-  result.max = all_results.max.load(cpp::MemoryOrder::RELAXED);
-  result.samples =
-      all_results.samples_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
-  result.total_iterations =
-      all_results.iterations_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
-  const uint64_t duration_ns =
-      all_results.time_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
-  const uint64_t duration_us = duration_ns / 1000;
-  const uint64_t duration_ms = duration_ns / (1000 * 1000);
-  uint64_t converted_duration = duration_ns;
-  const char *time_unit;
-  if (duration_ms != 0) {
-    converted_duration = duration_ms;
-    time_unit = "ms";
-  } else if (duration_us != 0) {
-    converted_duration = duration_us;
-    time_unit = "us";
+
+  const uint32_t num_threads =
+      all_results.active_threads.load(cpp::MemoryOrder::RELAXED);
+  final_result.total_iterations =
+      all_results.iterations_sum.load(cpp::MemoryOrder::RELAXED);
+
+  if (final_result.total_iterations > 0) {
+    const uint64_t s1_bits =
+        all_results.weighted_cycles_sum_bits.load(cpp::MemoryOrder::RELAXED);
+    const uint64_t s2_bits = all_results.weighted_squared_cycles_sum_bits.load(
+        cpp::MemoryOrder::RELAXED);
+
+    const double S1 = FPBits(s1_bits).get_val();
+    const double S2 = FPBits(s2_bits).get_val();
+    const double N = static_cast<double>(final_result.total_iterations);
+
+    const double global_mean = S1 / N;
+    const double global_mean_of_squares = S2 / N;
+    const double global_variance =
+        global_mean_of_squares - (global_mean * global_mean);
+
+    final_result.cycles = global_mean;
+    final_result.standard_deviation =
+        fputil::sqrt<double>(global_variance < 0.0 ? 0.0 : global_variance);
   } else {
-    converted_duration = duration_ns;
-    time_unit = "ns";
+    final_result.cycles = 0.0;
+    final_result.standard_deviation = 0.0;
   }
-  result.total_time = converted_duration;
-  // result.total_time =
-  //     all_results.time_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
+
+  final_result.min = all_results.min.load(cpp::MemoryOrder::RELAXED);
+  final_result.max = all_results.max.load(cpp::MemoryOrder::RELAXED);
   cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
 
   LIBC_NAMESPACE::printf(
-      "%-24s |%8ld |%8ld |%8ld |%11d |%14ld %2s |%9ld |%9d |\n",
-      b->get_test_name().data(), result.cycles, result.min, result.max,
-      result.total_iterations, result.total_time, time_unit,
-      static_cast<uint64_t>(result.standard_deviation), num_threads);
+      "%-24s |%15.0f |%9.0f |%8llu |%8llu |%11llu |%9u |\n",
+      b->get_test_name().data(), final_result.cycles,
+      final_result.standard_deviation, (unsigned long long)final_result.min,
+      (unsigned long long)final_result.max,
+      (unsigned long long)final_result.total_iterations, (unsigned)num_threads);
 }
 
 void print_header() {
@@ -126,9 +146,8 @@ void print_header() {
   LIBC_NAMESPACE::printf("Running Suite: %-10s\n",
                          benchmarks[0]->get_suite_name().data());
   LIBC_NAMESPACE::printf("%s", RESET);
-  cpp::string titles =
-      "Benchmark                |  Cycles |     Min |     Max | "
-      "Iterations | Time / Iteration |   Stddev |  Threads |\n";
+  cpp::string titles = "Benchmark                |  Cycles (Mean) |   Stddev | "
+                       "    Min |     Max | Iterations |  Threads |\n";
   LIBC_NAMESPACE::printf(titles.data());
 
   cpp::string separator(titles.size(), '-');
@@ -139,10 +158,8 @@ void print_header() {
 void Benchmark::run_benchmarks() {
   uint64_t id = gpu::get_thread_id();
 
-  if (id == 0) {
+  if (id == 0)
     print_header();
-    LIBC_NAMESPACE::srand(gpu::processor_clock());
-  }
 
   gpu::sync_threads();
 
@@ -164,69 +181,63 @@ void Benchmark::run_benchmarks() {
 }
 
 BenchmarkResult benchmark(const BenchmarkOptions &options,
-                          cpp::function<uint64_t(void)> wrapper_func) {
+                          const BenchmarkTarget &target) {
   BenchmarkResult result;
   RuntimeEstimationProgression rep;
-  uint32_t total_iterations = 0;
   uint32_t iterations = options.initial_iterations;
+
   if (iterations < 1u)
     iterations = 1;
 
   uint32_t samples = 0;
   uint64_t total_time = 0;
-  uint64_t best_guess = 0;
-  uint64_t cycles_squared = 0;
   uint64_t min = UINT64_MAX;
   uint64_t max = 0;
 
-  uint64_t overhead = UINT64_MAX;
-  int overhead_iterations = 10;
-  for (int i = 0; i < overhead_iterations; i++)
-    overhead = cpp::min(overhead, LIBC_NAMESPACE::overhead());
+  uint32_t call_index = 0;
 
   for (int64_t time_budget = options.max_duration; time_budget >= 0;) {
-    uint64_t sample_cycles = 0;
-    const clock_t start = static_cast<double>(clock());
-    for (uint32_t i = 0; i < iterations; i++) {
-      auto wrapper_intermediate = wrapper_func();
-      uint64_t current_result = wrapper_intermediate - overhead;
+    RefinableRuntimeEstimator sample_estimator;
+
+    const clock_t start = clock();
+    while (sample_estimator.get_iterations() < iterations) {
+      auto current_result = target(call_index++);
       max = cpp::max(max, current_result);
       min = cpp::min(min, current_result);
-      sample_cycles += current_result;
+      sample_estimator.update(current_result);
     }
     const clock_t end = clock();
+
     const clock_t duration_ns =
         ((end - start) * 1000 * 1000 * 1000) / CLOCKS_PER_SEC;
     total_time += duration_ns;
     time_budget -= duration_ns;
     samples++;
-    cycles_squared += sample_cycles * sample_cycles;
 
-    total_iterations += iterations;
-    const double change_ratio =
-        rep.compute_improvement({iterations, sample_cycles});
-    best_guess = rep.current_estimation;
+    const double change_ratio = rep.compute_improvement(sample_estimator);
 
     if (samples >= options.max_samples || iterations >= options.max_iterations)
       break;
+
+    const auto total_iterations = rep.get_estimator().get_iterations();
+
     if (total_time >= options.min_duration && samples >= options.min_samples &&
         total_iterations >= options.min_iterations &&
         change_ratio < options.epsilon)
       break;
 
-    iterations *= options.scaling_factor;
+    iterations = static_cast<uint32_t>(iterations * options.scaling_factor);
   }
-  result.cycles = best_guess;
-  result.standard_deviation = fputil::sqrt<double>(
-      static_cast<double>(cycles_squared) / total_iterations -
-      static_cast<double>(best_guess * best_guess));
+
+  const auto &estimator = rep.get_estimator();
+  result.total_iterations = estimator.get_iterations();
+  result.cycles = estimator.get_mean();
+  result.standard_deviation = estimator.get_stddev();
   result.min = min;
   result.max = max;
-  result.samples = samples;
-  result.total_iterations = total_iterations;
-  result.total_time = total_time / total_iterations;
+
   return result;
-};
+}
 
 } // namespace benchmarks
 } // namespace LIBC_NAMESPACE_DECL

diff  --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index a6cf62dd30ce5..60f69edf86556 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -4,14 +4,14 @@
 #include "benchmarks/gpu/BenchmarkLogger.h"
 #include "benchmarks/gpu/timing/timing.h"
 #include "hdr/stdint_proxy.h"
+#include "src/__support/CPP/algorithm.h"
 #include "src/__support/CPP/array.h"
-#include "src/__support/CPP/functional.h"
 #include "src/__support/CPP/limits.h"
 #include "src/__support/CPP/string_view.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/sqrt.h"
 #include "src/__support/macros/config.h"
-#include "src/stdlib/rand.h"
 #include "src/time/clock.h"
 
 namespace LIBC_NAMESPACE_DECL {
@@ -30,68 +30,136 @@ struct BenchmarkOptions {
   double scaling_factor = 1.4;
 };
 
-struct Measurement {
+class RefinableRuntimeEstimator {
   uint32_t iterations = 0;
-  uint64_t elapsed_cycles = 0;
-};
-
-class RefinableRuntimeEstimation {
-  uint64_t total_cycles = 0;
-  uint32_t total_iterations = 0;
+  uint64_t sum_of_cycles = 0;
+  uint64_t sum_of_squared_cycles = 0;
 
 public:
-  uint64_t update(const Measurement &M) {
-    total_cycles += M.elapsed_cycles;
-    total_iterations += M.iterations;
-    return total_cycles / total_iterations;
+  void update(uint64_t cycles) noexcept {
+    iterations += 1;
+    sum_of_cycles += cycles;
+    sum_of_squared_cycles += cycles * cycles;
+  }
+
+  void update(const RefinableRuntimeEstimator &other) noexcept {
+    iterations += other.iterations;
+    sum_of_cycles += other.sum_of_cycles;
+    sum_of_squared_cycles += other.sum_of_squared_cycles;
+  }
+
+  double get_mean() const noexcept {
+    if (iterations == 0)
+      return 0.0;
+
+    return static_cast<double>(sum_of_cycles) / iterations;
+  }
+
+  double get_variance() const noexcept {
+    if (iterations == 0)
+      return 0.0;
+
+    const double num = static_cast<double>(iterations);
+    const double sum_x = static_cast<double>(sum_of_cycles);
+    const double sum_x2 = static_cast<double>(sum_of_squared_cycles);
+
+    const double mean_of_squares = sum_x2 / num;
+    const double mean = sum_x / num;
+    const double mean_squared = mean * mean;
+    const double variance = mean_of_squares - mean_squared;
+
+    return variance < 0.0 ? 0.0 : variance;
   }
+
+  double get_stddev() const noexcept {
+    return fputil::sqrt<double>(get_variance());
+  }
+
+  uint32_t get_iterations() const noexcept { return iterations; }
 };
 
 // Tracks the progression of the runtime estimation
 class RuntimeEstimationProgression {
-  RefinableRuntimeEstimation rre;
+  RefinableRuntimeEstimator estimator;
+  double current_mean = 0.0;
 
 public:
-  uint64_t current_estimation = 0;
+  const RefinableRuntimeEstimator &get_estimator() const noexcept {
+    return estimator;
+  }
+
+  double
+  compute_improvement(const RefinableRuntimeEstimator &sample_estimator) {
+    if (sample_estimator.get_iterations() == 0)
+      return 1.0;
 
-  double compute_improvement(const Measurement &M) {
-    const uint64_t new_estimation = rre.update(M);
-    double ratio =
-        (static_cast<double>(current_estimation) / new_estimation) - 1.0;
+    estimator.update(sample_estimator);
 
-    // Get absolute value
+    const double new_mean = estimator.get_mean();
+    if (current_mean == 0.0 || new_mean == 0.0) {
+      current_mean = new_mean;
+      return 1.0;
+    }
+
+    double ratio = (current_mean / new_mean) - 1.0;
     if (ratio < 0)
-      ratio *= -1;
+      ratio = -ratio;
 
-    current_estimation = new_estimation;
+    current_mean = new_mean;
     return ratio;
   }
 };
 
 struct BenchmarkResult {
-  uint64_t cycles = 0;
+  uint64_t total_iterations = 0;
+  double cycles = 0;
   double standard_deviation = 0;
   uint64_t min = UINT64_MAX;
   uint64_t max = 0;
-  uint32_t samples = 0;
-  uint32_t total_iterations = 0;
-  clock_t total_time = 0;
+};
+
+struct BenchmarkTarget {
+  using IndexedFnPtr = uint64_t (*)(uint32_t);
+  using IndexlessFnPtr = uint64_t (*)();
+
+  enum class Kind : uint8_t { Indexed, Indexless } kind;
+  union {
+    IndexedFnPtr indexed_fn_ptr;
+    IndexlessFnPtr indexless_fn_ptr;
+  };
+
+  LIBC_INLINE BenchmarkTarget(IndexedFnPtr func)
+      : kind(Kind::Indexed), indexed_fn_ptr(func) {}
+  LIBC_INLINE BenchmarkTarget(IndexlessFnPtr func)
+      : kind(Kind::Indexless), indexless_fn_ptr(func) {}
+
+  LIBC_INLINE uint64_t operator()([[maybe_unused]] uint32_t call_index) const {
+    return kind == Kind::Indexed ? indexed_fn_ptr(call_index)
+                                 : indexless_fn_ptr();
+  }
 };
 
 BenchmarkResult benchmark(const BenchmarkOptions &options,
-                          cpp::function<uint64_t(void)> wrapper_func);
+                          const BenchmarkTarget &target);
 
 class Benchmark {
-  const cpp::function<uint64_t(void)> func;
+  const BenchmarkTarget target;
   const cpp::string_view suite_name;
   const cpp::string_view test_name;
   const uint32_t num_threads;
 
 public:
-  Benchmark(cpp::function<uint64_t(void)> func, char const *suite_name,
+  Benchmark(uint64_t (*f)(), const char *suite, const char *test,
+            uint32_t threads)
+      : target(BenchmarkTarget(f)), suite_name(suite), test_name(test),
+        num_threads(threads) {
+    add_benchmark(this);
+  }
+
+  Benchmark(uint64_t (*f)(uint32_t), char const *suite_name,
             char const *test_name, uint32_t num_threads)
-      : func(func), suite_name(suite_name), test_name(test_name),
-        num_threads(num_threads) {
+      : target(BenchmarkTarget(f)), suite_name(suite_name),
+        test_name(test_name), num_threads(num_threads) {
     add_benchmark(this);
   }
 
@@ -105,67 +173,139 @@ class Benchmark {
 private:
   BenchmarkResult run() {
     BenchmarkOptions options;
-    return benchmark(options, func);
+    return benchmark(options, target);
+  }
+};
+
+class RandomGenerator {
+  uint64_t state;
+
+  static LIBC_INLINE uint64_t splitmix64(uint64_t x) noexcept {
+    x += 0x9E3779B97F4A7C15ULL;
+    x = (x ^ (x >> 30)) * 0xBF58476D1CE4E5B9ULL;
+    x = (x ^ (x >> 27)) * 0x94D049BB133111EBULL;
+    x = (x ^ (x >> 31));
+    return x ? x : 0x9E3779B97F4A7C15ULL;
+  }
+
+public:
+  explicit LIBC_INLINE RandomGenerator(uint64_t seed) noexcept
+      : state(splitmix64(seed)) {}
+
+  LIBC_INLINE uint64_t next64() noexcept {
+    uint64_t x = state;
+    x ^= x >> 12;
+    x ^= x << 25;
+    x ^= x >> 27;
+    state = x;
+    return x * 0x2545F4914F6CDD1DULL;
+  }
+
+  LIBC_INLINE uint32_t next32() noexcept {
+    return static_cast<uint32_t>(next64() >> 32);
   }
 };
 
-// We want our random values to be approximately
-// Output: a random number with the exponent field between min_exp and max_exp,
-// i.e. 2^min_exp <= |real_value| < 2^(max_exp + 1),
-// Caveats:
-//   -EXP_BIAS corresponding to denormal values,
-//   EXP_BIAS + 1 corresponding to inf or nan.
+// We want random floating-point values whose *unbiased* exponent e is
+// approximately uniform in [min_exp, max_exp]. That is,
+//   2^min_exp <= |value| < 2^(max_exp + 1).
+// Caveats / boundaries:
+// - e = -EXP_BIAS  ==> subnormal range (biased exponent = 0). We ensure a
+//                      non-zero mantissa so we don't accidentally produce 0.
+// - e in [1 - EXP_BIAS, EXP_BIAS] ==> normal numbers.
+// - e = EXP_BIAS + 1 ==> Inf/NaN. We do not include it by default; max_exp
+//                        defaults to EXP_BIAS.
 template <typename T>
 static T
-get_rand_input(int max_exp = LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS,
-               int min_exp = -LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS) {
+get_rand_input(RandomGenerator &rng,
+               int min_exp = -LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS,
+               int max_exp = LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS) {
   using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
-
-  // Required to correctly instantiate FPBits for floats and doubles.
-  using RandType = typename cpp::conditional_t<(cpp::is_same_v<T, double>),
-                                               uint64_t, uint32_t>;
-  RandType bits;
-  if constexpr (cpp::is_same_v<T, uint64_t>)
-    bits = (static_cast<uint64_t>(LIBC_NAMESPACE::rand()) << 32) |
-           static_cast<uint64_t>(LIBC_NAMESPACE::rand());
-  else
-    bits = LIBC_NAMESPACE::rand();
-  double scale =
-      static_cast<double>(max_exp - min_exp + 1) / (2 * FPBits::EXP_BIAS + 1);
-  FPBits fp(bits);
-  fp.set_biased_exponent(
-      static_cast<uint32_t>(fp.get_biased_exponent() * scale + min_exp));
-  return fp.get_val();
+  using Storage = typename FPBits::StorageType;
+
+  // Sanitize and clamp requested range to what the format supports
+  if (min_exp > max_exp) {
+    auto tmp = min_exp;
+    min_exp = max_exp;
+    max_exp = tmp;
+  };
+  min_exp = cpp::max(min_exp, -FPBits::EXP_BIAS);
+  max_exp = cpp::min(max_exp, FPBits::EXP_BIAS);
+
+  // Sample unbiased exponent e uniformly in [min_exp, max_exp] without modulo
+  // bias
+  auto sample_in_range = [&](uint64_t r) -> int32_t {
+    const uint64_t range = static_cast<uint64_t>(
+        static_cast<int64_t>(max_exp) - static_cast<int64_t>(min_exp) + 1);
+    const uint64_t threshold = (-range) % range;
+    while (r < threshold)
+      r = rng.next64();
+    return static_cast<int32_t>(min_exp + static_cast<int64_t>(r % range));
+  };
+  const int32_t e = sample_in_range(rng.next64());
+
+  // Start from random bits to get random sign and mantissa
+  FPBits xbits([&] {
+    if constexpr (cpp::is_same_v<T, double>)
+      return FPBits(rng.next64());
+    else
+      return FPBits(rng.next32());
+  }());
+
+  if (e == -FPBits::EXP_BIAS) {
+    // Subnormal: biased exponent must be 0; ensure mantissa != 0 to avoid 0
+    xbits.set_biased_exponent(Storage(0));
+    if (xbits.get_mantissa() == Storage(0))
+      xbits.set_mantissa(Storage(1));
+  } else {
+    // Normal: biased exponent in [1, 2 * FPBits::EXP_BIAS]
+    const int32_t biased = e + FPBits::EXP_BIAS;
+    xbits.set_biased_exponent(static_cast<Storage>(biased));
+  }
+  return xbits.get_val();
 }
 
 template <typename T> class MathPerf {
-  using FPBits = fputil::FPBits<T>;
-  using StorageType = typename FPBits::StorageType;
-  static constexpr StorageType UIntMax =
-      cpp::numeric_limits<StorageType>::max();
+  static LIBC_INLINE uint64_t make_seed(uint64_t base_seed, uint64_t salt) {
+    const uint64_t tid = gpu::get_thread_id();
+    return base_seed ^ (salt << 32) ^ (tid * 0x9E3779B97F4A7C15ULL);
+  }
 
 public:
+  // Returns cycles-per-call (lower is better)
   template <size_t N = 1>
-  static uint64_t run_throughput_in_range(T f(T), int min_exp, int max_exp) {
+  static uint64_t run_throughput_in_range(T f(T), int min_exp, int max_exp,
+                                          uint32_t call_index) {
     cpp::array<T, N> inputs;
+
+    uint64_t base_seed = static_cast<uint64_t>(call_index);
+    uint64_t salt = static_cast<uint64_t>(N);
+    RandomGenerator rng(make_seed(base_seed, salt));
+
     for (size_t i = 0; i < N; ++i)
-      inputs[i] = get_rand_input<T>(min_exp, max_exp);
+      inputs[i] = get_rand_input<T>(rng, min_exp, max_exp);
 
     uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs);
 
     return total_time / N;
   }
 
-  // Throughput benchmarking for functions that take 2 inputs.
+  // Returns cycles-per-call (lower is better)
   template <size_t N = 1>
   static uint64_t run_throughput_in_range(T f(T, T), int arg1_min_exp,
                                           int arg1_max_exp, int arg2_min_exp,
-                                          int arg2_max_exp) {
+                                          int arg2_max_exp,
+                                          uint32_t call_index) {
     cpp::array<T, N> inputs1;
     cpp::array<T, N> inputs2;
+
+    uint64_t base_seed = static_cast<uint64_t>(call_index);
+    uint64_t salt = static_cast<uint64_t>(N);
+    RandomGenerator rng(make_seed(base_seed, salt));
+
     for (size_t i = 0; i < N; ++i) {
-      inputs1[i] = get_rand_input<T>(arg1_min_exp, arg1_max_exp);
-      inputs2[i] = get_rand_input<T>(arg2_min_exp, arg2_max_exp);
+      inputs1[i] = get_rand_input<T>(rng, arg1_min_exp, arg1_max_exp);
+      inputs2[i] = get_rand_input<T>(rng, arg2_min_exp, arg2_max_exp);
     }
 
     uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs1, inputs2);
@@ -193,4 +333,5 @@ template <typename T> class MathPerf {
 #define SINGLE_WAVE_BENCHMARK(SuiteName, TestName, Func)                       \
   BENCHMARK_N_THREADS(SuiteName, TestName, Func,                               \
                       LIBC_NAMESPACE::gpu::get_lane_size())
-#endif
+
+#endif // LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H

diff  --git a/libc/benchmarks/gpu/src/math/CMakeLists.txt b/libc/benchmarks/gpu/src/math/CMakeLists.txt
index 7a12ce4e61c9e..8417f23c124a0 100644
--- a/libc/benchmarks/gpu/src/math/CMakeLists.txt
+++ b/libc/benchmarks/gpu/src/math/CMakeLists.txt
@@ -34,11 +34,6 @@ add_benchmark(
     libc.hdr.stdint_proxy
     libc.src.math.sin
     libc.src.math.sinf
-    libc.src.stdlib.srand
-    libc.src.stdlib.rand
-    libc.src.__support.FPUtil.fp_bits
-    libc.src.__support.CPP.bit
-    libc.src.__support.CPP.array
   COMPILE_OPTIONS
     ${math_benchmark_flags}
   LOADER_ARGS
@@ -54,11 +49,6 @@ add_benchmark(
   DEPENDS
     libc.hdr.stdint_proxy
     libc.src.math.atan2
-    libc.src.stdlib.srand
-    libc.src.stdlib.rand
-    libc.src.__support.FPUtil.fp_bits
-    libc.src.__support.CPP.bit
-    libc.src.__support.CPP.array
   COMPILE_OPTIONS
     ${math_benchmark_flags}
   LOADER_ARGS

diff  --git a/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp b/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp
index 1f91a9a35c373..82bb0c5d7de49 100644
--- a/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp
@@ -1,27 +1,27 @@
 #include "benchmarks/gpu/LibcGpuBenchmark.h"
 
+#include "hdr/stdint_proxy.h"
 #include "src/math/atan2.h"
-#include "src/stdlib/rand.h"
 
 #if defined(NVPTX_MATH_FOUND) || defined(AMDGPU_MATH_FOUND)
 #include "platform.h"
 #endif
 
-#define BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, N)                      \
-  []() {                                                                       \
+#define BM_RANDOM_INPUTS(T, Func, MinExp, MaxExp, N)                           \
+  [](uint32_t call_index) {                                                    \
     return LIBC_NAMESPACE::benchmarks::MathPerf<T>::run_throughput_in_range<   \
-        N>(Func, MIN_EXP, MAX_EXP, MIN_EXP, MAX_EXP);                          \
+        N>(Func, MinExp, MaxExp, MinExp, MaxExp, call_index);                  \
   }
 
-#define BENCH(T, Name, Func, MIN_EXP, MAX_EXP)                                 \
+#define BENCH(T, Name, Func, MinExp, MaxExp)                                   \
   SINGLE_WAVE_BENCHMARK(LlvmLibcAtan2GpuBenchmark, Name##_1,                   \
-                        BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 1));    \
+                        BM_RANDOM_INPUTS(T, Func, MinExp, MaxExp, 1));         \
   SINGLE_WAVE_BENCHMARK(LlvmLibcAtan2GpuBenchmark, Name##_128,                 \
-                        BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 128));  \
+                        BM_RANDOM_INPUTS(T, Func, MinExp, MaxExp, 128));       \
   SINGLE_WAVE_BENCHMARK(LlvmLibcAtan2GpuBenchmark, Name##_1024,                \
-                        BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 1024)); \
+                        BM_RANDOM_INPUTS(T, Func, MinExp, MaxExp, 1024));      \
   SINGLE_WAVE_BENCHMARK(LlvmLibcAtan2GpuBenchmark, Name##_4096,                \
-                        BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 4096))
+                        BM_RANDOM_INPUTS(T, Func, MinExp, MaxExp, 4096))
 
 BENCH(double, Atan2, LIBC_NAMESPACE::atan2, -1023, 1023);
 BENCH(double, Atan2TwoPi, LIBC_NAMESPACE::atan2, -10, 3);

diff  --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
index a759db2e9d33f..5fe95c3f3b268 100644
--- a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
@@ -1,36 +1,28 @@
 #include "benchmarks/gpu/LibcGpuBenchmark.h"
 
-#include "src/__support/CPP/array.h"
-#include "src/__support/CPP/bit.h"
-#include "src/__support/CPP/functional.h"
-#include "src/__support/FPUtil/FPBits.h"
+#include "hdr/stdint_proxy.h"
 #include "src/math/sin.h"
 #include "src/math/sinf.h"
-#include "src/stdlib/rand.h"
 
 #if defined(NVPTX_MATH_FOUND) || defined(AMDGPU_MATH_FOUND)
 #include "platform.h"
 #endif
 
-// BENCHMARK() expects a function that with no parameters that returns a
-// uint64_t representing the latency. Defining each benchmark using macro that
-// expands to a lambda to allow us to switch the implementation of `sin()` to
-// easily register NVPTX benchmarks.
-#define BM_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, N)                          \
-  []() {                                                                       \
+#define BM_RANDOM_INPUT(T, Func, MinExp, MaxExp, N)                            \
+  [](uint32_t call_index) {                                                    \
     return LIBC_NAMESPACE::benchmarks::MathPerf<T>::run_throughput_in_range<   \
-        N>(Func, MIN_EXP, MAX_EXP);                                            \
+        N>(Func, MinExp, MaxExp, call_index);                                  \
   }
 
-#define BENCH(T, Name, Func, MIN_EXP, MAX_EXP)                                 \
+#define BENCH(T, Name, Func, MinExp, MaxExp)                                   \
   SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_1,                     \
-                        BM_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 1));        \
+                        BM_RANDOM_INPUT(T, Func, MinExp, MaxExp, 1));          \
   SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_128,                   \
-                        BM_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 128));      \
+                        BM_RANDOM_INPUT(T, Func, MinExp, MaxExp, 128));        \
   SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_1024,                  \
-                        BM_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 1024));     \
+                        BM_RANDOM_INPUT(T, Func, MinExp, MaxExp, 1024));       \
   SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_4096,                  \
-                        BM_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 4096))
+                        BM_RANDOM_INPUT(T, Func, MinExp, MaxExp, 4096))
 
 BENCH(double, Sin, LIBC_NAMESPACE::sin, -1023, 1023);
 BENCH(double, SinTwoPi, LIBC_NAMESPACE::sin, -10, 3);

diff  --git a/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt b/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
index dd7c2d342f70f..d6a89d04dab97 100644
--- a/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
@@ -7,6 +7,7 @@ add_header_library(
     libc.src.__support.common
     libc.src.__support.macros.config
     libc.src.__support.macros.attributes
-    libc.src.__support.CPP.type_traits
+    libc.src.__support.CPP.algorithm
     libc.src.__support.CPP.array
+    libc.src.__support.CPP.type_traits
 )

diff  --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h
index 37dbb9af5976b..de721a2d6ce6b 100644
--- a/libc/benchmarks/gpu/timing/amdgpu/timing.h
+++ b/libc/benchmarks/gpu/timing/amdgpu/timing.h
@@ -10,6 +10,7 @@
 #define LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
 
 #include "hdr/stdint_proxy.h"
+#include "src/__support/CPP/algorithm.h"
 #include "src/__support/CPP/array.h"
 #include "src/__support/CPP/atomic.h"
 #include "src/__support/CPP/type_traits.h"
@@ -105,20 +106,80 @@ template <typename F, typename T1, typename T2>
   return stop - start;
 }
 
-// Provides throughput benchmarking.
-template <typename F, typename T, size_t N>
-[[gnu::noinline]] static LIBC_INLINE uint64_t
-throughput(F f, const cpp::array<T, N> &inputs) {
+// Provides the *baseline* for throughput: measures loop and measurement costs
+// without calling the f function
+template <typename T, size_t N>
+static LIBC_INLINE uint64_t
+throughput_baseline(const cpp::array<T, N> &inputs) {
   asm("" ::"v"(&inputs));
 
   cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   uint64_t start = gpu::processor_clock();
+  asm("" ::"s"(start));
+
+  T result{};
+  for (auto input : inputs) {
+    asm("" ::"v"(input));
+    result = input;
+    asm("" ::"v"(result));
+  }
+
+  uint64_t stop = gpu::processor_clock();
+  asm("" ::"s"(stop));
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
+
+  volatile auto output = result;
 
+  return stop - start;
+}
+
+// Provides throughput benchmarking
+template <typename F, typename T, size_t N>
+static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs) {
+  uint64_t baseline = UINT64_MAX;
+  for (int i = 0; i < 5; ++i)
+    baseline = cpp::min(baseline, throughput_baseline<T, N>(inputs));
+
+  asm("" ::"v"(&inputs));
+
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
+  uint64_t start = gpu::processor_clock();
   asm("" ::"s"(start));
 
+  T result{};
   for (auto input : inputs) {
-    auto result = f(input);
+    asm("" ::"v"(input));
+    result = f(input);
+    asm("" ::"v"(result));
+  }
 
+  uint64_t stop = gpu::processor_clock();
+  asm("" ::"s"(stop));
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
+
+  volatile auto output = result;
+
+  const uint64_t measured = stop - start;
+  return measured > baseline ? (measured - baseline) : 0;
+}
+
+// Provides the *baseline* for throughput with 2 arguments: measures loop and
+// measurement costs without calling the f function
+template <typename T, size_t N>
+static LIBC_INLINE uint64_t throughput_baseline(
+    const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
+  asm("" ::"v"(&inputs1), "v"(&inputs2));
+
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
+  uint64_t start = gpu::processor_clock();
+  asm("" ::"s"(start));
+
+  T result{};
+  for (size_t i = 0; i < N; i++) {
+    T x = inputs1[i];
+    T y = inputs2[i];
+    asm("" ::"v"(x), "v"(y));
+    result = x;
     asm("" ::"v"(result));
   }
 
@@ -126,24 +187,31 @@ throughput(F f, const cpp::array<T, N> &inputs) {
   asm("" ::"s"(stop));
   cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
 
-  // Return the time elapsed.
+  volatile auto output = result;
+
   return stop - start;
 }
 
 // Provides throughput benchmarking for 2 arguments (e.g. atan2())
 template <typename F, typename T, size_t N>
-[[gnu::noinline]] static LIBC_INLINE uint64_t throughput(
-    F f, const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
+static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs1,
+                                       const cpp::array<T, N> &inputs2) {
+  uint64_t baseline = UINT64_MAX;
+  for (int i = 0; i < 5; ++i)
+    baseline = cpp::min(baseline, throughput_baseline<T, N>(inputs1, inputs2));
+
   asm("" ::"v"(&inputs1), "v"(&inputs2));
 
   cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   uint64_t start = gpu::processor_clock();
-
   asm("" ::"s"(start));
 
-  for (size_t i = 0; i < inputs1.size(); i++) {
-    auto result = f(inputs1[i], inputs2[i]);
-
+  T result{};
+  for (size_t i = 0; i < N; i++) {
+    T x = inputs1[i];
+    T y = inputs2[i];
+    asm("" ::"v"(x), "v"(y));
+    result = f(x, y);
     asm("" ::"v"(result));
   }
 
@@ -151,8 +219,10 @@ template <typename F, typename T, size_t N>
   asm("" ::"s"(stop));
   cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
 
-  // Return the time elapsed.
-  return stop - start;
+  volatile auto output = result;
+
+  const uint64_t measured = stop - start;
+  return measured > baseline ? (measured - baseline) : 0;
 }
 
 } // namespace LIBC_NAMESPACE_DECL

diff  --git a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
index a19c16ee4e44d..801080e7a6e98 100644
--- a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
+++ b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
@@ -7,6 +7,7 @@ add_header_library(
     libc.src.__support.common
     libc.src.__support.macros.config
     libc.src.__support.macros.attributes
-    libc.src.__support.CPP.type_traits
+    libc.src.__support.CPP.algorithm
     libc.src.__support.CPP.array
+    libc.src.__support.CPP.type_traits
 )

diff  --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h
index 3c729636367aa..133032ca08423 100644
--- a/libc/benchmarks/gpu/timing/nvptx/timing.h
+++ b/libc/benchmarks/gpu/timing/nvptx/timing.h
@@ -10,6 +10,7 @@
 #define LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
 
 #include "hdr/stdint_proxy.h"
+#include "src/__support/CPP/algorithm.h"
 #include "src/__support/CPP/array.h"
 #include "src/__support/CPP/atomic.h"
 #include "src/__support/CPP/type_traits.h"
@@ -95,18 +96,47 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
   return stop - start;
 }
 
-// Provides throughput benchmarking.
-template <typename F, typename T, size_t N>
-[[gnu::noinline]] static LIBC_INLINE uint64_t
-throughput(F f, const cpp::array<T, N> &inputs) {
+// Provides the *baseline* for throughput: measures loop and measurement costs
+// without calling the f function
+template <typename T, size_t N>
+static LIBC_INLINE uint64_t
+throughput_baseline(const cpp::array<T, N> &inputs) {
   asm("" ::"r"(&inputs));
 
   cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   uint64_t start = gpu::processor_clock();
+  asm("" ::"llr"(start));
+
+  T result{};
+  for (auto input : inputs) {
+    asm("" ::"r"(input));
+    result = input;
+    asm("" ::"r"(result));
+  }
+
+  uint64_t stop = gpu::processor_clock();
+  asm("" ::"r"(stop));
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
+
+  volatile auto output = result;
 
+  return stop - start;
+}
+
+// Provides throughput benchmarking
+template <typename F, typename T, size_t N>
+static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs) {
+  uint64_t baseline = UINT64_MAX;
+  for (int i = 0; i < 5; ++i)
+    baseline = cpp::min(baseline, throughput_baseline<T, N>(inputs));
+
+  asm("" ::"r"(&inputs));
+
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
+  uint64_t start = gpu::processor_clock();
   asm("" ::"llr"(start));
 
-  uint64_t result;
+  T result{};
   for (auto input : inputs) {
     asm("" ::"r"(input));
     result = f(input);
@@ -114,39 +144,77 @@ throughput(F f, const cpp::array<T, N> &inputs) {
   }
 
   uint64_t stop = gpu::processor_clock();
+  asm("" ::"r"(stop));
   cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
+
+  volatile auto output = result;
+
+  const uint64_t measured = stop - start;
+  return measured > baseline ? (measured - baseline) : 0;
+}
+
+// Provides the *baseline* for throughput with 2 arguments: measures loop and
+// measurement costs without calling the f function
+template <typename T, size_t N>
+static LIBC_INLINE uint64_t throughput_baseline(
+    const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
+  asm("" ::"r"(&inputs1), "r"(&inputs2));
+
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
+  uint64_t start = gpu::processor_clock();
+  asm("" ::"llr"(start));
+
+  T result{};
+  for (size_t i = 0; i < N; i++) {
+    T x = inputs1[i];
+    T y = inputs2[i];
+    asm("" ::"r"(x), "r"(y));
+    result = x;
+    asm("" ::"r"(result));
+  }
+
+  uint64_t stop = gpu::processor_clock();
   asm("" ::"r"(stop));
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
+
   volatile auto output = result;
 
-  // Return the time elapsed.
   return stop - start;
 }
 
 // Provides throughput benchmarking for 2 arguments (e.g. atan2())
 template <typename F, typename T, size_t N>
-[[gnu::noinline]] static LIBC_INLINE uint64_t throughput(
-    F f, const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
+static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs1,
+                                       const cpp::array<T, N> &inputs2) {
+  uint64_t baseline = UINT64_MAX;
+  for (int i = 0; i < 5; ++i)
+    baseline = cpp::min(baseline, throughput_baseline<T, N>(inputs1, inputs2));
+
   asm("" ::"r"(&inputs1), "r"(&inputs2));
 
   cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   uint64_t start = gpu::processor_clock();
-
   asm("" ::"llr"(start));
 
-  uint64_t result;
-  for (size_t i = 0; i < inputs1.size(); i++) {
-    result = f(inputs1[i], inputs2[i]);
+  T result{};
+  for (size_t i = 0; i < N; i++) {
+    T x = inputs1[i];
+    T y = inputs2[i];
+    asm("" ::"r"(x), "r"(y));
+    result = f(x, y);
     asm("" ::"r"(result));
   }
 
   uint64_t stop = gpu::processor_clock();
-  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   asm("" ::"r"(stop));
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
+
   volatile auto output = result;
 
-  // Return the time elapsed.
-  return stop - start;
+  const uint64_t measured = stop - start;
+  return measured > baseline ? (measured - baseline) : 0;
 }
+
 } // namespace LIBC_NAMESPACE_DECL
 
 #endif // LLVM_LIBC_UTILS_GPU_TIMING_NVPTX