[libc-commits] [libc] [libc] DRAFT: Add Generic and NVPTX Sin Benchmark (PR #99795)

Sat Jul 27 20:00:31 PDT 2024

https://github.com/jameshu15869 updated https://github.com/llvm/llvm-project/pull/99795

>From dbb4f54cb4260bd90219dd6b77e98eaa8fcc510e Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sat, 20 Jul 2024 17:08:55 -0400
Subject: [PATCH 1/9] implement generic sin benchmark and compare with nvsin

---
 libc/benchmarks/gpu/src/CMakeLists.txt        |  1 +
 libc/benchmarks/gpu/src/math/CMakeLists.txt   | 30 ++++++++++++
 .../benchmarks/gpu/src/math/sin_benchmark.cpp | 47 +++++++++++++++++++
 3 files changed, 78 insertions(+)
 create mode 100644 libc/benchmarks/gpu/src/math/CMakeLists.txt
 create mode 100644 libc/benchmarks/gpu/src/math/sin_benchmark.cpp

diff --git a/libc/benchmarks/gpu/src/CMakeLists.txt b/libc/benchmarks/gpu/src/CMakeLists.txt
index 42eb4f7b5909a..f15d082e4dd2b 100644
--- a/libc/benchmarks/gpu/src/CMakeLists.txt
+++ b/libc/benchmarks/gpu/src/CMakeLists.txt
@@ -1 +1,2 @@
 add_subdirectory(ctype)
+add_subdirectory(math)
diff --git a/libc/benchmarks/gpu/src/math/CMakeLists.txt b/libc/benchmarks/gpu/src/math/CMakeLists.txt
new file mode 100644
index 0000000000000..b68f728c9e395
--- /dev/null
+++ b/libc/benchmarks/gpu/src/math/CMakeLists.txt
@@ -0,0 +1,30 @@
+add_custom_target(libc-gpu-math-benchmarks)
+
+if(CUDAToolkit_FOUND)
+  set(libdevice_path ${CUDAToolkit_BIN_DIR}/../nvvm/libdevice/libdevice.10.bc)
+  if (EXISTS ${libdevice_path})
+    set(nvptx_bitcode_link_flags
+        "SHELL:-Xclang -mlink-builtin-bitcode -Xclang ${libdevice_path}")
+    # Compile definition needed so the benchmark knows to register
+    # NVPTX benchmarks.
+    set(nvptx_math_found "-DNVPTX_MATH_FOUND=1")
+  endif()
+endif()
+
+add_benchmark(
+  sin_benchmark
+  SUITE
+    libc-gpu-math-benchmarks
+  SRCS
+    sin_benchmark.cpp
+  DEPENDS
+    libc.src.math.sin
+    libc.src.stdlib.srand
+    libc.src.stdlib.rand
+    libc.src.__support.FPUtil.fp_bits
+  COMPILE_OPTIONS
+    ${nvptx_math_found}
+    ${nvptx_bitcode_link_flags}
+  LOADER_ARGS
+    --threads 64
+)
\ No newline at end of file
diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
new file mode 100644
index 0000000000000..ab65f182d7778
--- /dev/null
+++ b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
@@ -0,0 +1,47 @@
+#include "benchmarks/gpu/LibcGpuBenchmark.h"
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/math/sin.h"
+#include "src/stdlib/rand.h"
+#include "src/stdlib/srand.h"
+
+#ifdef NVPTX_MATH_FOUND
+#include "src/math/nvptx/declarations.h"
+#endif
+
+// We want our values to be approximately
+// |real value| <= 2^(max_exponent) * (1 + (random 52 bits) * 2^-52) <
+// 2^(max_exponent + 1)
+// The largest integer that can be stored in a double is 2^53
+const int MAX_EXPONENT = 52;
+
+double get_rand(int max_exponent) {
+  using FPBits = LIBC_NAMESPACE::fputil::FPBits<double>;
+  uint64_t bits = LIBC_NAMESPACE::rand();
+  double scale = 0.5 + max_exponent / 2048.0;
+  FPBits fp(bits);
+  fp.set_biased_exponent(
+      static_cast<uint32_t>(fp.get_biased_exponent() * scale));
+  return fp.get_val();
+}
+
+uint64_t BM_Sin() {
+  LIBC_NAMESPACE::srand(LIBC_NAMESPACE::gpu::get_thread_id());
+  double x = get_rand(MAX_EXPONENT);
+  return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::sin, x);
+}
+BENCHMARK(LlvmLibcSinGpuBenchmark, Sin, BM_Sin);
+SINGLE_THREADED_BENCHMARK(LlvmLibcSinGpuBenchmark, SinSingleThread, BM_Sin);
+SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, SinSingleWave, BM_Sin);
+
+#ifdef NVPTX_MATH_FOUND
+uint64_t BM_NvSin() {
+  LIBC_NAMESPACE::srand(LIBC_NAMESPACE::gpu::get_thread_id());
+  double x = get_rand(MAX_EXPONENT);
+  return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::__nv_sin, x);
+}
+BENCHMARK(LlvmLibcSinGpuBenchmark, NvSin, BM_NvSin);
+SINGLE_THREADED_BENCHMARK(lvmLibcSinGpuBenchmark, NvSinSingleThread, BM_NvSin);
+SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinSingleWave, BM_NvSin);
+
+#endif

>From 788f615dc238bb4dd1813eec58a7c0b40d0d5ae8 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sat, 20 Jul 2024 19:22:11 -0400
Subject: [PATCH 2/9] basic generic and nvptx sin benchmark

---
 libc/benchmarks/gpu/CMakeLists.txt            |  3 +
 libc/benchmarks/gpu/LibcGpuBenchmark.cpp      |  5 +-
 libc/benchmarks/gpu/LibcGpuBenchmark.h        | 63 +++++++++++++++++
 libc/benchmarks/gpu/src/math/CMakeLists.txt   |  1 +
 .../benchmarks/gpu/src/math/sin_benchmark.cpp | 70 +++++++++++--------
 5 files changed, 109 insertions(+), 33 deletions(-)

diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index f1aa06a52584b..656f3d21430cb 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -52,13 +52,16 @@ add_unittest_framework_library(
     libc.src.__support.CPP.limits
     libc.src.__support.CPP.algorithm
     libc.src.__support.CPP.atomic
+    libc.src.__support.CPP.array
     libc.src.__support.fixed_point.fx_rep
     libc.src.__support.macros.properties.types
     libc.src.__support.OSUtil.osutil
     libc.src.__support.uint128
+    libc.src.__support.FPUtil.fp_bits
     libc.src.__support.FPUtil.sqrt
     libc.src.__support.fixedvector
     libc.src.time.clock
+    libc.src.stdlib.rand
     libc.benchmarks.gpu.timing.timing
     libc.src.stdio.printf
 )
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index a9a912538cd84..8d2637e3bcc10 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -136,9 +136,10 @@ void print_header() {
 void Benchmark::run_benchmarks() {
   uint64_t id = gpu::get_thread_id();
 
-  if (id == 0)
+  if (id == 0) {
+    LIBC_NAMESPACE::benchmarks::init_random_input();
     print_header();
-
+  }
   gpu::sync_threads();
 
   for (Benchmark *b : benchmarks) {
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index ca5ad8a595d54..7774dc84cd2bc 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -3,10 +3,13 @@
 
 #include "benchmarks/gpu/BenchmarkLogger.h"
 #include "benchmarks/gpu/timing/timing.h"
+#include "src/__support/CPP/array.h"
 #include "src/__support/CPP/functional.h"
 #include "src/__support/CPP/limits.h"
 #include "src/__support/CPP/string_view.h"
+#include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/macros/config.h"
+#include "src/stdlib/rand.h"
 #include "src/time/clock.h"
 
 #include <stdint.h>
@@ -105,6 +108,66 @@ class Benchmark {
     return benchmark(options, func);
   }
 };
+
+// We want our random values to be approximately
+// |real value| <= 2^(max_exponent) * (1 + (random 52 bits) * 2^-52) <
+// 2^(max_exponent + 1)
+// The largest integer that can be stored in a double is 2^53
+static constexpr int MAX_EXPONENT = 52;
+static constexpr int RANDOM_INPUT_SIZE = 1024;
+static cpp::array<double, RANDOM_INPUT_SIZE> random_input;
+
+static double get_rand() {
+  using FPBits = LIBC_NAMESPACE::fputil::FPBits<double>;
+  uint64_t bits = LIBC_NAMESPACE::rand();
+  double scale = 0.5 + MAX_EXPONENT / 2048.0;
+  FPBits fp(bits);
+  fp.set_biased_exponent(
+      static_cast<uint32_t>(fp.get_biased_exponent() * scale));
+  return fp.get_val();
+}
+
+static void init_random_input() {
+  for (int i = 0; i < RANDOM_INPUT_SIZE; i++) {
+    random_input[i] = get_rand();
+  }
+}
+
+template <typename T> class MathPerf {
+  using FPBits = fputil::FPBits<T>;
+  using StorageType = typename FPBits::StorageType;
+  static constexpr StorageType UIntMax =
+      cpp::numeric_limits<StorageType>::max();
+
+public:
+  typedef T Func(T);
+
+  static uint64_t run_perf_in_range(Func f, StorageType starting_bit,
+                                    StorageType ending_bit, StorageType step) {
+    uint64_t total_time = 0;
+    if (step <= 0)
+      step = 1;
+    volatile T result;
+    for (StorageType bits = starting_bit; bits < ending_bit; bits += step) {
+      T x = FPBits(bits).get_val();
+      total_time += LIBC_NAMESPACE::latency(f, x);
+    }
+    StorageType num_runs = (ending_bit - starting_bit) / step + 1;
+
+    return total_time / num_runs;
+  }
+
+  static uint64_t run_perf_normal(Func f) {
+    return run_perf_in_range(f, FPBits::min_normal().uintval(),
+                             FPBits::max_normal().uintval());
+  }
+
+  static uint64_t run_perf_denormal(Func f) {
+    return run_perf_in_range(f, StorageType(0),
+                             FPBits::max_subnormal().uintval());
+  }
+};
+
 } // namespace benchmarks
 } // namespace LIBC_NAMESPACE_DECL
 
diff --git a/libc/benchmarks/gpu/src/math/CMakeLists.txt b/libc/benchmarks/gpu/src/math/CMakeLists.txt
index b68f728c9e395..89f698640c791 100644
--- a/libc/benchmarks/gpu/src/math/CMakeLists.txt
+++ b/libc/benchmarks/gpu/src/math/CMakeLists.txt
@@ -22,6 +22,7 @@ add_benchmark(
     libc.src.stdlib.srand
     libc.src.stdlib.rand
     libc.src.__support.FPUtil.fp_bits
+    libc.src.__support.CPP.bit
   COMPILE_OPTIONS
     ${nvptx_math_found}
     ${nvptx_bitcode_link_flags}
diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
index ab65f182d7778..c873f94512d20 100644
--- a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
@@ -1,5 +1,7 @@
 #include "benchmarks/gpu/LibcGpuBenchmark.h"
 
+#include "src/__support/CPP/bit.h"
+#include "src/__support/CPP/functional.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/math/sin.h"
 #include "src/stdlib/rand.h"
@@ -9,39 +11,45 @@
 #include "src/math/nvptx/declarations.h"
 #endif
 
-// We want our values to be approximately
-// |real value| <= 2^(max_exponent) * (1 + (random 52 bits) * 2^-52) <
-// 2^(max_exponent + 1)
-// The largest integer that can be stored in a double is 2^53
-const int MAX_EXPONENT = 52;
-
-double get_rand(int max_exponent) {
-  using FPBits = LIBC_NAMESPACE::fputil::FPBits<double>;
-  uint64_t bits = LIBC_NAMESPACE::rand();
-  double scale = 0.5 + max_exponent / 2048.0;
-  FPBits fp(bits);
-  fp.set_biased_exponent(
-      static_cast<uint32_t>(fp.get_biased_exponent() * scale));
-  return fp.get_val();
+constexpr double M_PI = 3.14159265358979323846;
+uint64_t get_bits(double x) {
+  return LIBC_NAMESPACE::cpp::bit_cast<uint64_t>(x);
 }
 
-uint64_t BM_Sin() {
-  LIBC_NAMESPACE::srand(LIBC_NAMESPACE::gpu::get_thread_id());
-  double x = get_rand(MAX_EXPONENT);
-  return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::sin, x);
-}
-BENCHMARK(LlvmLibcSinGpuBenchmark, Sin, BM_Sin);
-SINGLE_THREADED_BENCHMARK(LlvmLibcSinGpuBenchmark, SinSingleThread, BM_Sin);
-SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, SinSingleWave, BM_Sin);
+// BENCHMARK() expects a function that with no parameters that returns a
+// uint64_t representing the latency. Defining each benchmark as a macro uses a
+// lambda to allow us to switch the implementation of `sin()` to easily register
+// NVPTX benchmarks.
+#define BM_RANDOM_INPUT(Func)                                                  \
+  []() {                                                                       \
+    uint64_t total_time = 0;                                                   \
+    for (double i : LIBC_NAMESPACE::benchmarks::random_input) {                \
+      total_time += LIBC_NAMESPACE::latency(Func, i);                          \
+    }                                                                          \
+    return total_time / LIBC_NAMESPACE::benchmarks::random_input.size();       \
+  }
+BENCHMARK(LlvmLibcSinGpuBenchmark, Sin, BM_RANDOM_INPUT(LIBC_NAMESPACE::sin));
 
-#ifdef NVPTX_MATH_FOUND
-uint64_t BM_NvSin() {
-  LIBC_NAMESPACE::srand(LIBC_NAMESPACE::gpu::get_thread_id());
-  double x = get_rand(MAX_EXPONENT);
-  return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::__nv_sin, x);
-}
-BENCHMARK(LlvmLibcSinGpuBenchmark, NvSin, BM_NvSin);
-SINGLE_THREADED_BENCHMARK(lvmLibcSinGpuBenchmark, NvSinSingleThread, BM_NvSin);
-SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinSingleWave, BM_NvSin);
+#define BM_TWO_PI(Func)                                                        \
+  []() {                                                                       \
+    return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range(    \
+        Func, 0, get_bits(2 * M_PI), get_bits(M_PI / 64));                     \
+  }
+BENCHMARK(LlvmLibcSinGpuBenchmark, SinTwoPi, BM_TWO_PI(LIBC_NAMESPACE::sin));
+
+#define BM_LARGE_INT(Func)                                                     \
+  []() {                                                                       \
+    return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range(    \
+        Func, 0, get_bits(1 << 30), get_bits(1 << 4));                         \
+  }
+BENCHMARK(LlvmLibcSinGpuBenchmark, SinLargeInt,
+          BM_LARGE_INT(LIBC_NAMESPACE::sin));
 
+#ifdef NVPTX_MATH_FOUND
+BENCHMARK(LlvmLibcSinGpuBenchmark, NvSin,
+          BM_RANDOM_INPUT(LIBC_NAMESPACE::__nv_sin));
+BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinTwoPi,
+          BM_TWO_PI(LIBC_NAMESPACE::__nv_sin));
+BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinLargeInt,
+          BM_LARGE_INT(LIBC_NAMESPACE::__nv_sin));
 #endif

>From 7d37fedb529f02c582d91806872363dc2bd06042 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sat, 20 Jul 2024 23:38:23 -0400
Subject: [PATCH 3/9] minor fixes

---
 libc/benchmarks/gpu/LibcGpuBenchmark.h         | 10 ----------
 libc/benchmarks/gpu/src/math/CMakeLists.txt    |  2 +-
 libc/benchmarks/gpu/src/math/sin_benchmark.cpp |  6 +++---
 3 files changed, 4 insertions(+), 14 deletions(-)

diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 7774dc84cd2bc..e84636bc6b0a6 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -156,16 +156,6 @@ template <typename T> class MathPerf {
 
     return total_time / num_runs;
   }
-
-  static uint64_t run_perf_normal(Func f) {
-    return run_perf_in_range(f, FPBits::min_normal().uintval(),
-                             FPBits::max_normal().uintval());
-  }
-
-  static uint64_t run_perf_denormal(Func f) {
-    return run_perf_in_range(f, StorageType(0),
-                             FPBits::max_subnormal().uintval());
-  }
 };
 
 } // namespace benchmarks
diff --git a/libc/benchmarks/gpu/src/math/CMakeLists.txt b/libc/benchmarks/gpu/src/math/CMakeLists.txt
index 89f698640c791..2b27652e46ae9 100644
--- a/libc/benchmarks/gpu/src/math/CMakeLists.txt
+++ b/libc/benchmarks/gpu/src/math/CMakeLists.txt
@@ -28,4 +28,4 @@ add_benchmark(
     ${nvptx_bitcode_link_flags}
   LOADER_ARGS
     --threads 64
-)
\ No newline at end of file
+)
diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
index c873f94512d20..ac35e22b57287 100644
--- a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
@@ -17,9 +17,9 @@ uint64_t get_bits(double x) {
 }
 
 // BENCHMARK() expects a function that with no parameters that returns a
-// uint64_t representing the latency. Defining each benchmark as a macro uses a
-// lambda to allow us to switch the implementation of `sin()` to easily register
-// NVPTX benchmarks.
+// uint64_t representing the latency. Defining each benchmark using macro that
+// expands to a lambda to allow us to switch the implementation of `sin()` to
+// easily register NVPTX benchmarks.
 #define BM_RANDOM_INPUT(Func)                                                  \
   []() {                                                                       \
     uint64_t total_time = 0;                                                   \

>From 3a393040a96e9530fdf740b9f323b22bad1de16e Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sun, 21 Jul 2024 18:33:18 -0400
Subject: [PATCH 4/9] seed rand with processor clock

---
 libc/benchmarks/gpu/CMakeLists.txt            |  3 +++
 libc/benchmarks/gpu/LibcGpuBenchmark.cpp      | 24 +++++++++++++++++++
 libc/benchmarks/gpu/LibcGpuBenchmark.h        | 21 ----------------
 .../benchmarks/gpu/src/math/sin_benchmark.cpp |  6 ++---
 4 files changed, 30 insertions(+), 24 deletions(-)

diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index 656f3d21430cb..5fa3e44e8d48c 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -22,6 +22,8 @@ function(add_benchmark benchmark_name)
       ${BENCHMARK_LINK_LIBRARIES}
     DEPENDS
       libc.src.stdio.printf
+      libc.src.stdlib.srand
+      libc.src.stdlib.rand
       ${BENCHMARK_DEPENDS}
     ${BENCHMARK_UNPARSED_ARGUMENTS}
     COMPILE_OPTIONS
@@ -62,6 +64,7 @@ add_unittest_framework_library(
     libc.src.__support.fixedvector
     libc.src.time.clock
     libc.src.stdlib.rand
+    libc.src.stdlib.srand
     libc.benchmarks.gpu.timing.timing
     libc.src.stdio.printf
 )
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index 8d2637e3bcc10..c8a36c22d4e5e 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -8,6 +8,7 @@
 #include "src/__support/fixedvector.h"
 #include "src/__support/macros/config.h"
 #include "src/stdio/printf.h"
+#include "src/stdlib/srand.h"
 #include "src/time/gpu/time_utils.h"
 
 namespace LIBC_NAMESPACE_DECL {
@@ -133,6 +134,29 @@ void print_header() {
       "--------------------------------\n");
 }
 
+// We want our random values to be approximately
+// |real value| <= 2^(max_exponent) * (1 + (random 52 bits) * 2^-52) <
+// 2^(max_exponent + 1)
+// The largest integer that can be stored in a double is 2^53
+static constexpr int MAX_EXPONENT = 52;
+
+static double get_rand() {
+  using FPBits = LIBC_NAMESPACE::fputil::FPBits<double>;
+  uint64_t bits = LIBC_NAMESPACE::rand();
+  double scale = 0.5 + MAX_EXPONENT / 2048.0;
+  FPBits fp(bits);
+  fp.set_biased_exponent(
+      static_cast<uint32_t>(fp.get_biased_exponent() * scale));
+  return fp.get_val();
+}
+
+static void init_random_input() {
+  LIBC_NAMESPACE::srand(LIBC_NAMESPACE::gpu::processor_clock());
+  for (int i = 0; i < RANDOM_INPUT_SIZE; i++) {
+    random_input[i] = get_rand();
+  }
+}
+
 void Benchmark::run_benchmarks() {
   uint64_t id = gpu::get_thread_id();
 
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index e84636bc6b0a6..7b597bab0def1 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -109,30 +109,9 @@ class Benchmark {
   }
 };
 
-// We want our random values to be approximately
-// |real value| <= 2^(max_exponent) * (1 + (random 52 bits) * 2^-52) <
-// 2^(max_exponent + 1)
-// The largest integer that can be stored in a double is 2^53
-static constexpr int MAX_EXPONENT = 52;
 static constexpr int RANDOM_INPUT_SIZE = 1024;
 static cpp::array<double, RANDOM_INPUT_SIZE> random_input;
 
-static double get_rand() {
-  using FPBits = LIBC_NAMESPACE::fputil::FPBits<double>;
-  uint64_t bits = LIBC_NAMESPACE::rand();
-  double scale = 0.5 + MAX_EXPONENT / 2048.0;
-  FPBits fp(bits);
-  fp.set_biased_exponent(
-      static_cast<uint32_t>(fp.get_biased_exponent() * scale));
-  return fp.get_val();
-}
-
-static void init_random_input() {
-  for (int i = 0; i < RANDOM_INPUT_SIZE; i++) {
-    random_input[i] = get_rand();
-  }
-}
-
 template <typename T> class MathPerf {
   using FPBits = fputil::FPBits<T>;
   using StorageType = typename FPBits::StorageType;
diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
index ac35e22b57287..845d9eae5642a 100644
--- a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
@@ -16,10 +16,10 @@ uint64_t get_bits(double x) {
   return LIBC_NAMESPACE::cpp::bit_cast<uint64_t>(x);
 }
 
-// BENCHMARK() expects a function that with no parameters that returns a
+// BENCHMARK() expects a function with no parameters that returns a
 // uint64_t representing the latency. Defining each benchmark using macro that
-// expands to a lambda to allow us to switch the implementation of `sin()` to
-// easily register NVPTX benchmarks.
+// expands to a lambda to allow us to switch the implementation of `sin()` and
+// easily register vendor-specific benchmarks.
 #define BM_RANDOM_INPUT(Func)                                                  \
   []() {                                                                       \
     uint64_t total_time = 0;                                                   \

>From c3e14be1649fd90b6b628c5c3f2a260bb22db601 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sun, 21 Jul 2024 21:49:10 -0400
Subject: [PATCH 5/9] move random array to per thread

---
 libc/benchmarks/gpu/LibcGpuBenchmark.cpp      | 25 ------------------
 libc/benchmarks/gpu/LibcGpuBenchmark.h        | 26 +++++++++++++++++--
 libc/benchmarks/gpu/src/math/CMakeLists.txt   |  1 +
 .../benchmarks/gpu/src/math/sin_benchmark.cpp | 15 +++++++----
 4 files changed, 35 insertions(+), 32 deletions(-)

diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index c8a36c22d4e5e..88a2ccd33873d 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -8,7 +8,6 @@
 #include "src/__support/fixedvector.h"
 #include "src/__support/macros/config.h"
 #include "src/stdio/printf.h"
-#include "src/stdlib/srand.h"
 #include "src/time/gpu/time_utils.h"
 
 namespace LIBC_NAMESPACE_DECL {
@@ -134,34 +133,10 @@ void print_header() {
       "--------------------------------\n");
 }
 
-// We want our random values to be approximately
-// |real value| <= 2^(max_exponent) * (1 + (random 52 bits) * 2^-52) <
-// 2^(max_exponent + 1)
-// The largest integer that can be stored in a double is 2^53
-static constexpr int MAX_EXPONENT = 52;
-
-static double get_rand() {
-  using FPBits = LIBC_NAMESPACE::fputil::FPBits<double>;
-  uint64_t bits = LIBC_NAMESPACE::rand();
-  double scale = 0.5 + MAX_EXPONENT / 2048.0;
-  FPBits fp(bits);
-  fp.set_biased_exponent(
-      static_cast<uint32_t>(fp.get_biased_exponent() * scale));
-  return fp.get_val();
-}
-
-static void init_random_input() {
-  LIBC_NAMESPACE::srand(LIBC_NAMESPACE::gpu::processor_clock());
-  for (int i = 0; i < RANDOM_INPUT_SIZE; i++) {
-    random_input[i] = get_rand();
-  }
-}
-
 void Benchmark::run_benchmarks() {
   uint64_t id = gpu::get_thread_id();
 
   if (id == 0) {
-    LIBC_NAMESPACE::benchmarks::init_random_input();
     print_header();
   }
   gpu::sync_threads();
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 7b597bab0def1..fbaa0b8c318c3 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -10,6 +10,7 @@
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/macros/config.h"
 #include "src/stdlib/rand.h"
+#include "src/stdlib/srand.h"
 #include "src/time/clock.h"
 
 #include <stdint.h>
@@ -109,8 +110,29 @@ class Benchmark {
   }
 };
 
-static constexpr int RANDOM_INPUT_SIZE = 1024;
-static cpp::array<double, RANDOM_INPUT_SIZE> random_input;
+// We want our random values to be approximately
+// |real value| <= 2^(max_exponent) * (1 + (random 52 bits) * 2^-52) <
+// 2^(max_exponent + 1)
+// The largest integer that can be stored in a double is 2^53
+static constexpr int MAX_EXPONENT = 52;
+
+static double get_rand_double() {
+  using FPBits = LIBC_NAMESPACE::fputil::FPBits<double>;
+  uint64_t bits = LIBC_NAMESPACE::rand();
+  double scale = 0.5 + MAX_EXPONENT / 2048.0;
+  FPBits fp(bits);
+  fp.set_biased_exponent(
+      static_cast<uint32_t>(fp.get_biased_exponent() * scale));
+  return fp.get_val();
+}
+
+template <size_t Size>
+static void init_random_double_input(cpp::array<double, Size> &values) {
+  LIBC_NAMESPACE::srand(LIBC_NAMESPACE::gpu::processor_clock());
+  for (int i = 0; i < Size; i++) {
+    values[i] = get_rand_double();
+  }
+}
 
 template <typename T> class MathPerf {
   using FPBits = fputil::FPBits<T>;
diff --git a/libc/benchmarks/gpu/src/math/CMakeLists.txt b/libc/benchmarks/gpu/src/math/CMakeLists.txt
index 2b27652e46ae9..116dfb580215a 100644
--- a/libc/benchmarks/gpu/src/math/CMakeLists.txt
+++ b/libc/benchmarks/gpu/src/math/CMakeLists.txt
@@ -23,6 +23,7 @@ add_benchmark(
     libc.src.stdlib.rand
     libc.src.__support.FPUtil.fp_bits
     libc.src.__support.CPP.bit
+    libc.src.__support.CPP.array
   COMPILE_OPTIONS
     ${nvptx_math_found}
     ${nvptx_bitcode_link_flags}
diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
index 845d9eae5642a..f0b218ef047e5 100644
--- a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
@@ -1,5 +1,6 @@
 #include "benchmarks/gpu/LibcGpuBenchmark.h"
 
+#include "src/__support/CPP/array.h"
 #include "src/__support/CPP/bit.h"
 #include "src/__support/CPP/functional.h"
 #include "src/__support/FPUtil/FPBits.h"
@@ -16,17 +17,21 @@ uint64_t get_bits(double x) {
   return LIBC_NAMESPACE::cpp::bit_cast<uint64_t>(x);
 }
 
-// BENCHMARK() expects a function with no parameters that returns a
+constexpr int RANDOM_INPUT_SIZE = 256;
+
+// BENCHMARK() expects a function that with no parameters that returns a
 // uint64_t representing the latency. Defining each benchmark using macro that
-// expands to a lambda to allow us to switch the implementation of `sin()` and
-// easily register vendor-specific benchmarks.
+// expands to a lambda to allow us to switch the implementation of `sin()` to
+// easily register NVPTX benchmarks.
 #define BM_RANDOM_INPUT(Func)                                                  \
   []() {                                                                       \
+    LIBC_NAMESPACE::cpp::array<double, RANDOM_INPUT_SIZE> random_input;        \
+    LIBC_NAMESPACE::benchmarks::init_random_double_input(random_input);        \
     uint64_t total_time = 0;                                                   \
-    for (double i : LIBC_NAMESPACE::benchmarks::random_input) {                \
+    for (double i : random_input) {                                            \
       total_time += LIBC_NAMESPACE::latency(Func, i);                          \
     }                                                                          \
-    return total_time / LIBC_NAMESPACE::benchmarks::random_input.size();       \
+    return total_time / random_input.size();                                   \
   }
 BENCHMARK(LlvmLibcSinGpuBenchmark, Sin, BM_RANDOM_INPUT(LIBC_NAMESPACE::sin));
 

>From 908bdef98412eb4986948444a5eab8e8aa31ffe4 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sun, 21 Jul 2024 21:52:39 -0400
Subject: [PATCH 6/9] use single variable instead of array

---
 libc/benchmarks/gpu/src/math/sin_benchmark.cpp | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
index f0b218ef047e5..4d0ecea0f7cdd 100644
--- a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
@@ -25,13 +25,8 @@ constexpr int RANDOM_INPUT_SIZE = 256;
 // easily register NVPTX benchmarks.
 #define BM_RANDOM_INPUT(Func)                                                  \
   []() {                                                                       \
-    LIBC_NAMESPACE::cpp::array<double, RANDOM_INPUT_SIZE> random_input;        \
-    LIBC_NAMESPACE::benchmarks::init_random_double_input(random_input);        \
-    uint64_t total_time = 0;                                                   \
-    for (double i : random_input) {                                            \
-      total_time += LIBC_NAMESPACE::latency(Func, i);                          \
-    }                                                                          \
-    return total_time / random_input.size();                                   \
+    double x = LIBC_NAMESPACE::benchmarks::get_rand_double();                  \
+    return LIBC_NAMESPACE::latency(Func, x);                                   \
   }
 BENCHMARK(LlvmLibcSinGpuBenchmark, Sin, BM_RANDOM_INPUT(LIBC_NAMESPACE::sin));
 

>From 2adc13d741857f36445bb34e742443c6386979ee Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sun, 21 Jul 2024 21:56:51 -0400
Subject: [PATCH 7/9] allow AMDGPU to store doubles to register

---
 libc/benchmarks/gpu/timing/amdgpu/timing.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h
index 9b40f9282b16b..bfba4043c2505 100644
--- a/libc/benchmarks/gpu/timing/amdgpu/timing.h
+++ b/libc/benchmarks/gpu/timing/amdgpu/timing.h
@@ -67,7 +67,9 @@ template <typename F, typename T>
 
   // This inline assembly performs a no-op which forces the result to both
   // be used and prevents us from exiting this region before it's complete.
-  asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result) :);
+  asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
+          static_cast<uint32_t>(result))
+      :);
 
   // Obtain the current timestamp after running the calculation and force
   // ordering.
@@ -98,7 +100,9 @@ template <typename F, typename T1, typename T2>
 
   auto result = f(arg1, arg2);
 
-  asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result) :);
+  asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
+          static_cast<uint32_t>(result))
+      :);
 
   uint64_t stop = gpu::processor_clock();
   asm("" ::"s"(stop));

>From a0f1905336c6a1fdbd5fbe0d3900b077db008979 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sat, 27 Jul 2024 22:09:11 -0400
Subject: [PATCH 8/9] minor fixes

---
 libc/benchmarks/gpu/LibcGpuBenchmark.cpp            | 4 ++--
 libc/benchmarks/gpu/src/math/CMakeLists.txt         | 1 +
 libc/benchmarks/gpu/src/math/sin_benchmark.cpp      | 2 --
 libc/cmake/modules/LLVMLibCCompileOptionRules.cmake | 2 +-
 4 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index 88a2ccd33873d..a9a912538cd84 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -136,9 +136,9 @@ void print_header() {
 void Benchmark::run_benchmarks() {
   uint64_t id = gpu::get_thread_id();
 
-  if (id == 0) {
+  if (id == 0)
     print_header();
-  }
+
   gpu::sync_threads();
 
   for (Benchmark *b : benchmarks) {
diff --git a/libc/benchmarks/gpu/src/math/CMakeLists.txt b/libc/benchmarks/gpu/src/math/CMakeLists.txt
index 116dfb580215a..15f52b5cfa15c 100644
--- a/libc/benchmarks/gpu/src/math/CMakeLists.txt
+++ b/libc/benchmarks/gpu/src/math/CMakeLists.txt
@@ -27,6 +27,7 @@ add_benchmark(
   COMPILE_OPTIONS
     ${nvptx_math_found}
     ${nvptx_bitcode_link_flags}
+    -save-temps
   LOADER_ARGS
     --threads 64
 )
diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
index 4d0ecea0f7cdd..94ccf0363cef9 100644
--- a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
@@ -17,8 +17,6 @@ uint64_t get_bits(double x) {
   return LIBC_NAMESPACE::cpp::bit_cast<uint64_t>(x);
 }
 
-constexpr int RANDOM_INPUT_SIZE = 256;
-
 // BENCHMARK() expects a function that with no parameters that returns a
 // uint64_t representing the latency. Defining each benchmark using macro that
 // expands to a lambda to allow us to switch the implementation of `sin()` to
diff --git a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
index 9fc10375a1d37..ce276671f93d1 100644
--- a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
+++ b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
@@ -230,7 +230,7 @@ endfunction()
 function(_get_hermetic_test_compile_options output_var flags)
   _get_compile_options_from_flags(compile_flags ${flags})
   list(APPEND compile_options ${LIBC_COMPILE_OPTIONS_DEFAULT} ${compile_flags}
-       ${flags} -fpie -ffreestanding -fno-exceptions -fno-rtti)
+       ${flags} -fpie -ffreestanding -fno-exceptions -fno-rtti -save-temps)
 
   # The GPU build requires overriding the default CMake triple and architecture.
   if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)

>From 84c2afb2a7139c3d2f1dac52cd7d771a795ccac7 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sat, 27 Jul 2024 22:58:14 -0400
Subject: [PATCH 9/9] address comments

---
 libc/benchmarks/gpu/LibcGpuBenchmark.h        | 24 +++++++------------
 libc/benchmarks/gpu/src/math/CMakeLists.txt   |  1 -
 .../benchmarks/gpu/src/math/sin_benchmark.cpp |  3 ++-
 libc/benchmarks/gpu/timing/amdgpu/timing.h    |  3 +--
 .../modules/LLVMLibCCompileOptionRules.cmake  |  2 +-
 5 files changed, 13 insertions(+), 20 deletions(-)

diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index fbaa0b8c318c3..d79199a947d3a 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -7,6 +7,7 @@
 #include "src/__support/CPP/functional.h"
 #include "src/__support/CPP/limits.h"
 #include "src/__support/CPP/string_view.h"
+#include "src/__support/CPP/type_traits.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/macros/config.h"
 #include "src/stdlib/rand.h"
@@ -113,27 +114,20 @@ class Benchmark {
 // We want our random values to be approximately
 // |real value| <= 2^(max_exponent) * (1 + (random 52 bits) * 2^-52) <
 // 2^(max_exponent + 1)
-// The largest integer that can be stored in a double is 2^53
-static constexpr int MAX_EXPONENT = 52;
-
-static double get_rand_double() {
-  using FPBits = LIBC_NAMESPACE::fputil::FPBits<double>;
-  uint64_t bits = LIBC_NAMESPACE::rand();
-  double scale = 0.5 + MAX_EXPONENT / 2048.0;
+template <typename T> static T get_rand_input() {
+  using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
+
+  // Required to correctly instantiate FPBits for floats and doubles.
+  using RandType = typename cpp::conditional_t<(cpp::is_same_v<T, double>),
+                                               uint64_t, uint32_t>;
+  RandType bits = LIBC_NAMESPACE::rand();
+  double scale = 0.5 + LIBC_NAMESPACE::fputil::FPBits<T>::FRACTION_LEN / 2048.0;
   FPBits fp(bits);
   fp.set_biased_exponent(
       static_cast<uint32_t>(fp.get_biased_exponent() * scale));
   return fp.get_val();
 }
 
-template <size_t Size>
-static void init_random_double_input(cpp::array<double, Size> &values) {
-  LIBC_NAMESPACE::srand(LIBC_NAMESPACE::gpu::processor_clock());
-  for (int i = 0; i < Size; i++) {
-    values[i] = get_rand_double();
-  }
-}
-
 template <typename T> class MathPerf {
   using FPBits = fputil::FPBits<T>;
   using StorageType = typename FPBits::StorageType;
diff --git a/libc/benchmarks/gpu/src/math/CMakeLists.txt b/libc/benchmarks/gpu/src/math/CMakeLists.txt
index 15f52b5cfa15c..116dfb580215a 100644
--- a/libc/benchmarks/gpu/src/math/CMakeLists.txt
+++ b/libc/benchmarks/gpu/src/math/CMakeLists.txt
@@ -27,7 +27,6 @@ add_benchmark(
   COMPILE_OPTIONS
     ${nvptx_math_found}
     ${nvptx_bitcode_link_flags}
-    -save-temps
   LOADER_ARGS
     --threads 64
 )
diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
index 94ccf0363cef9..ac1834ae0a1ea 100644
--- a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
@@ -23,7 +23,8 @@ uint64_t get_bits(double x) {
 // easily register NVPTX benchmarks.
 #define BM_RANDOM_INPUT(Func)                                                  \
   []() {                                                                       \
-    double x = LIBC_NAMESPACE::benchmarks::get_rand_double();                  \
+    LIBC_NAMESPACE::srand(LIBC_NAMESPACE::gpu::processor_clock());             \
+    double x = LIBC_NAMESPACE::benchmarks::get_rand_input<double>();           \
     return LIBC_NAMESPACE::latency(Func, x);                                   \
   }
 BENCHMARK(LlvmLibcSinGpuBenchmark, Sin, BM_RANDOM_INPUT(LIBC_NAMESPACE::sin));
diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h
index bfba4043c2505..ee928af5e6672 100644
--- a/libc/benchmarks/gpu/timing/amdgpu/timing.h
+++ b/libc/benchmarks/gpu/timing/amdgpu/timing.h
@@ -68,8 +68,7 @@ template <typename F, typename T>
   // This inline assembly performs a no-op which forces the result to both
   // be used and prevents us from exiting this region before it's complete.
   asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
-          static_cast<uint32_t>(result))
-      :);
+      static_cast<uint32_t>(result)));
 
   // Obtain the current timestamp after running the calculation and force
   // ordering.
diff --git a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
index ce276671f93d1..9fc10375a1d37 100644
--- a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
+++ b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
@@ -230,7 +230,7 @@ endfunction()
 function(_get_hermetic_test_compile_options output_var flags)
   _get_compile_options_from_flags(compile_flags ${flags})
   list(APPEND compile_options ${LIBC_COMPILE_OPTIONS_DEFAULT} ${compile_flags}
-       ${flags} -fpie -ffreestanding -fno-exceptions -fno-rtti -save-temps)
+       ${flags} -fpie -ffreestanding -fno-exceptions -fno-rtti)
 
   # The GPU build requires overriding the default CMake triple and architecture.
   if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)